diff --git a/.bazelrc b/.bazelrc index 590a87f5732..7a32ca68e40 100644 --- a/.bazelrc +++ b/.bazelrc @@ -30,6 +30,10 @@ build:monolithic --define framework_shared_object=false # opts in to modular op registration support by default. build --define framework_shared_object=true +# Flags for open source build, always set to be true. +build --define open_source_build=true +test --define open_source_build=true + # Please note that MKL on MacOS or windows is still not supported. # If you would like to use a local MKL instead of downloading, please set the # environment variable "TF_MKL_ROOT" every time before build. @@ -108,6 +112,10 @@ build --spawn_strategy=standalone build --strategy=Genrule=standalone build -c opt +# By default, build TF in C++ 14 mode. +build --cxxopt=-std=c++14 +build --host_cxxopt=-std=c++14 + # Make Bazel print out all options from rc files. build --announce_rc diff --git a/CODEOWNERS b/CODEOWNERS index 2828cf3baf8..25ff318d2d8 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1,13 +1,14 @@ # Where component owners are known, add them here. -/tensorflow/c/eager @jaingurav @alextp +/tensorflow/c/eager @jaingaurav @alextp /tensorflow/core/common_runtime/eager @jaingaurav @alextp /tenosrflow/core/debug @caisq /tensorflow/core/nccl/ @azaks2 @chsigg /tensorflow/core/platform/windows/ @mrry /tensorflow/core/platform/s3 @yongtang +/tensorflow/python/autograph/ @mdanatg @kkimdev /tensorflow/python/debug @caisq -/tensorflow/python/eager @jaingurav @alextp +/tensorflow/python/eager @jaingaurav @alextp /tensorflow/python/tools/api/generator/ @annarev /tensorflow/tensorboard/ @jart /tensorflow/tools/docs/ @markdaoust @@ -15,6 +16,7 @@ # contrib # NEED OWNER: /tensorflow/contrib/all_reduce +/tensorflow/contrib/autograph/ @mdanatg @kkimdev /tensorflow/contrib/batching/ @alextp @chrisolston /tensorflow/contrib/bayesflow/ @ebrevdo @rsepassi @jvdillon /tensorflow/contrib/boosted_trees/ @sshrdp @yk5 @nataliaponomareva @@ -26,11 +28,10 @@ /tensorflow/contrib/data/ @mrry /tensorflow/tensorflow/contrib/distribute @joshl @priyag @sourabhbajaj @frankchn /tensorflow/contrib/distributions/ @jvdillon @langmore @rsepassi -/tensorflow/contrib/eager @jaingurav @alextp +/tensorflow/contrib/eager @jaingaurav @alextp /tensorflow/contrib/factorization/ @agarwal-ashish @xavigonzalvo /tensorflow/contrib/ffmpeg/ @fredbertsch /tensorflow/contrib/framework/ @ebrevdo -/tensorflow/contrib/gan/ @joel-shor /tensorflow/contrib/graph_editor/ @purpledog # NEED OWNER: /tensorflow/contrib/grid_rnn/ /tensorflow/contrib/hadoop @yongtang diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index a4647020ff7..72304bee694 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -60,7 +60,13 @@ If you are experiencing or witnessing conflict, we ask you to use the following ## Reporting Violations -Violations of the Code of Conduct can be reported to TensorFlow’s Project Stewards, Edd Wilder-James (ewj@google.com) and Sarah Novotny (sarahnovotny@google.com). The Project Steward will determine whether the Code of Conduct was violated, and will issue an appropriate sanction, possibly including a written warning or expulsion from the project, project sponsored spaces, or project forums. We ask that you make a good-faith effort to resolve your conflict via the conflict resolution policy before submitting a report. +Violations of the Code of Conduct can be reported to TensorFlow’s Project +Stewards, Edd Wilder-James (ewj@google.com) and Thea Lamkin +(thealamkin@google.com). 
The Project Steward will determine whether the Code of +Conduct was violated, and will issue an appropriate sanction, possibly including +a written warning or expulsion from the project, project sponsored spaces, or +project forums. We ask that you make a good-faith effort to resolve your +conflict via the conflict resolution policy before submitting a report. Violations of the Code of Conduct can occur in any setting, even those unrelated to the project. We will only consider complaints about conduct that has occurred within one year of the report. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 4ed8a8bf2b2..2b285cd91d7 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -29,7 +29,8 @@ Follow either of the two links above to access the appropriate CLA and instructi ### Contributing code If you have improvements to TensorFlow, send us your pull requests! For those -just getting started, Github has a [howto](https://help.github.com/articles/using-pull-requests/). +just getting started, Github has a +[how to](https://help.github.com/articles/using-pull-requests/). TensorFlow team members will be assigned to review your pull requests. Once the pull requests are approved and pass continuous integration checks, a TensorFlow diff --git a/README.md b/README.md index 5a66b9bb03a..1eb06225176 100644 --- a/README.md +++ b/README.md @@ -2,61 +2,58 @@ ------------------ - - | **`Documentation`** | |-----------------| | [![Documentation](https://img.shields.io/badge/api-reference-blue.svg)](https://www.tensorflow.org/api_docs/) | -**TensorFlow** is an open source software library for numerical computation -using data flow graphs. The graph nodes represent mathematical operations, while -the graph edges represent the multidimensional data arrays (tensors) that flow -between them. This flexible architecture enables you to deploy computation to -one or more CPUs or GPUs in a desktop, server, or mobile device without -rewriting code. TensorFlow also includes -[TensorBoard](https://github.com/tensorflow/tensorboard), a data visualization -toolkit. +[TensorFlow](https://www.tensorflow.org/) is an end-to-end open source platform +for machine learning. It has a comprehensive, flexible ecosystem of +[tools](https://www.tensorflow.org/resources/tools), +[libraries](https://www.tensorflow.org/resources/libraries-extensions), and +[community](https://www.tensorflow.org/community) resources that lets +researchers push the state-of-the-art in ML and developers easily build and +deploy ML powered applications. -TensorFlow was originally developed by researchers and engineers -working on the Google Brain team within Google's Machine Intelligence Research -organization for the purposes of conducting machine learning and deep neural -networks research. The system is general enough to be applicable in a wide -variety of other domains, as well. +TensorFlow was originally developed by researchers and engineers working on the +Google Brain team within Google's Machine Intelligence Research organization for +the purposes of conducting machine learning and deep neural networks research. +The system is general enough to be applicable in a wide variety of other +domains, as well. -TensorFlow provides stable Python and C APIs as well as non-guaranteed backwards -compatible API's for C++, Go, Java, JavaScript, and Swift. 
+TensorFlow provides stable [Python](https://www.tensorflow.org/api_docs/python) +and [C++](https://www.tensorflow.org/api_docs/cc) APIs, as well as +non-guaranteed backwards compatible API for +[other languages](https://www.tensorflow.org/api_docs). -Keep up to date with release announcements and security updates by -subscribing to +Keep up-to-date with release announcements and security updates by subscribing +to [announce@tensorflow.org](https://groups.google.com/a/tensorflow.org/forum/#!forum/announce). +See all the [mailing lists](https://www.tensorflow.org/community/forums). -## Installation +## Install + +See the [TensorFlow install guide](https://www.tensorflow.org/install) for the +[pip package](https://www.tensorflow.org/install/pip), to +[enable GPU support](https://www.tensorflow.org/install/gpu), use a +[Docker container](https://www.tensorflow.org/install/docker), and +[build from source](https://www.tensorflow.org/install/source). To install the current release for CPU-only: ``` -pip install tensorflow +$ pip install tensorflow ``` -Use the GPU package for CUDA-enabled GPU cards: +Use the GPU package for +[CUDA-enabled GPU cards](https://www.tensorflow.org/install/gpu): ``` -pip install tensorflow-gpu +$ pip install tensorflow-gpu ``` -*See [Installing TensorFlow](https://www.tensorflow.org/install) for detailed -instructions, and how to build from source.* - -People who are a little more adventurous can also try our nightly binaries: - -**Nightly pip packages** * We are pleased to announce that TensorFlow now offers -nightly pip packages under the +*Nightly binaries are available for testing using the [tf-nightly](https://pypi.python.org/pypi/tf-nightly) and -[tf-nightly-gpu](https://pypi.python.org/pypi/tf-nightly-gpu) project on PyPi. -Simply run `pip install tf-nightly` or `pip install tf-nightly-gpu` in a clean -environment to install the nightly TensorFlow build. We support CPU and GPU -packages on Linux, Mac, and Windows. +[tf-nightly-gpu](https://pypi.python.org/pypi/tf-nightly-gpu) packages on PyPi.* #### *Try your first TensorFlow program* @@ -74,8 +71,8 @@ $ python 'Hello, TensorFlow!' ``` -Learn more examples about how to do specific tasks in TensorFlow at the -[tutorials page of tensorflow.org](https://www.tensorflow.org/tutorials/). +For more examples, see the +[TensorFlow tutorials](https://www.tensorflow.org/tutorials/). 
## Contribution guidelines @@ -116,6 +113,8 @@ The TensorFlow project strives to abide by generally accepted best practices in Build Type | Status | Artifacts --------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------- +**Linux AMD ROCm GPU** Nightly | [![Build Status](http://ml-ci.amd.com:21096/job/tensorflow-rocm-nightly/badge/icon)](http://ml-ci.amd.com:21096/job/tensorflow-rocm-nightly) | [Nightly](http://ml-ci.amd.com:21096/job/tensorflow-rocm-nightly/lastSuccessfulBuild/) +**Linux AMD ROCm GPU** Stable Release | [![Build Status](http://ml-ci.amd.com:21096/job/tensorflow-rocm-release/badge/icon)](http://ml-ci.amd.com:21096/job/tensorflow-rocm-release/) | [Release](http://ml-ci.amd.com:21096/job/tensorflow-rocm-release/lastSuccessfulBuild/) **Linux s390x** Nightly | [![Build Status](http://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_CI/badge/icon)](http://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_CI/) | [Nightly](http://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_CI/) **Linux s390x CPU** Stable Release | [![Build Status](http://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_Release_Build/badge/icon)](https://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_Release_Build/) | [Release](https://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_Release_Build/) **Linux ppc64le CPU** Nightly | [![Build Status](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Build/badge/icon)](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Build/) | [Nightly](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Nightly_Artifact/) @@ -126,20 +125,23 @@ Build Type **Linux CPU with Intel® MKL-DNN**
**Supports Python 2.7, 3.4, 3.5, and 3.6** | [![Build Status](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-release-whl/badge/icon)](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-release-whl/lastStableBuild) | [1.13.1 pypi](https://pypi.org/project/intel-tensorflow/) **Red Hat® Enterprise Linux® 7.6 CPU & GPU**
Python 2.7, 3.6 | [![Build Status](https://jenkins-tensorflow.apps.ci.centos.org/buildStatus/icon?job=tensorflow-rhel7-3.6&build=2)](https://jenkins-tensorflow.apps.ci.centos.org/job/tensorflow-rhel7-3.6/2/) | [1.13.1 pypi](https://tensorflow.pypi.thoth-station.ninja/index/) -## For more information +## Resources -* [TensorFlow Website](https://www.tensorflow.org) -* [TensorFlow Tutorials](https://www.tensorflow.org/tutorials/) -* [TensorFlow Model Zoo](https://github.com/tensorflow/models) +* [TensorFlow.org](https://www.tensorflow.org) +* [TensorFlow tutorials](https://www.tensorflow.org/tutorials/) +* [TensorFlow official models](https://github.com/tensorflow/models/tree/master/official) +* [TensorFlow examples](https://github.com/tensorflow/examples) +* [TensorFlow in Practice from Coursera](https://www.coursera.org/specializations/tensorflow-in-practice) +* [TensorFlow blog](https://blog.tensorflow.org) * [TensorFlow Twitter](https://twitter.com/tensorflow) -* [TensorFlow Blog](https://blog.tensorflow.org) -* [TensorFlow Course at Stanford](https://web.stanford.edu/class/cs20si) -* [TensorFlow Roadmap](https://www.tensorflow.org/community/roadmap) -* [TensorFlow White Papers](https://www.tensorflow.org/about/bib) -* [TensorFlow YouTube Channel](https://www.youtube.com/channel/UC0rqucBdTuFTjJiefW5t-IQ) -* [TensorFlow Visualization Toolkit](https://github.com/tensorflow/tensorboard) +* [TensorFlow YouTube](https://www.youtube.com/channel/UC0rqucBdTuFTjJiefW5t-IQ) +* [TensorFlow roadmap](https://www.tensorflow.org/community/roadmap) +* [TensorFlow white papers](https://www.tensorflow.org/about/bib) +* [TensorBoard visualization toolkit](https://github.com/tensorflow/tensorboard) -Learn more about the TensorFlow community at the [community page of tensorflow.org](https://www.tensorflow.org/community) for a few ways to participate. +Learn more about the +[TensorFlow community](https://www.tensorflow.org/community) and how to +[contribute](https://www.tensorflow.org/community/contribute). ## License diff --git a/RELEASE.md b/RELEASE.md index 6a4c2d6486d..801b9c8a2c8 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -43,6 +43,11 @@ * Transitive dependencies on :pooling_ops were removed. Some users may need to add explicit dependencies on :pooling_ops if they reference the operators from that library. +* tf.keras.optimizers default learning rate changes: + * Adadelta: 1.000 to 0.001 + * Adagrad: 0.01 to 0.001 + * Adamax: 0.002 to 0.001 + * NAdam: 0.002 to 0.001 ## Bug Fixes and Other Changes @@ -746,7 +751,7 @@ Ag Ramesh, Alex Wiltschko, Alexander Pantyukhin, Amogh Mannekote, An Jiaoyang, A and [programmers guide page](http://tensorflow.org/versions/r1.9/programmers_guide/keras). * Update `tf.keras` to the Keras 2.1.6 API. * Added [`tf.keras.layers.CuDNNGRU`](https://www.tensorflow.org/versions/r1.9/api_docs/python/tf/keras/layers/CuDNNGRU) and [`tf.keras.layers.CuDNNLSTM`](https://www.tensorflow.org/versions/r1.9/api_docs/python/tf/keras/layers/CuDNNLSTM) layers. [Try it](https://colab.sandbox.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb?linkId=53292082). -* Adding support of core [feature columns](https://www.tensorflow.org/get_started/feature_columns) and [losses](https://www.tensorflow.org/api_docs/python/tf/losses) to [gradient boosted trees estimators](https://github.com/tensorflow/models/tree/master/official/boosted_trees). 
+* Adding support of core [feature columns](https://www.tensorflow.org/get_started/feature_columns) and [losses](https://www.tensorflow.org/api_docs/python/tf/losses) to [gradient boosted trees estimators](https://github.com/tensorflow/models/tree/master/official/r1/boosted_trees). * The [python interface](https://www.tensorflow.org/versions/r1.9/api_docs/python/tf/lite) for the [TFLite Optimizing Converter](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/toco/README.md) has been expanded, and the command line interface (AKA: `toco`, `tflite_convert`) is once again diff --git a/WORKSPACE b/WORKSPACE index 43312f350d6..74ea14d0fd7 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -7,7 +7,7 @@ http_archive( sha256 = "5b00383d08dd71f28503736db0500b6fb4dda47489ff5fc6bed42557c07c6ba9", strip_prefix = "rules_closure-308b05b2419edb5c8ee0471b67a40403df940149", urls = [ - "http://mirror.tensorflow.org/github.com/bazelbuild/rules_closure/archive/308b05b2419edb5c8ee0471b67a40403df940149.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/bazelbuild/rules_closure/archive/308b05b2419edb5c8ee0471b67a40403df940149.tar.gz", "https://github.com/bazelbuild/rules_closure/archive/308b05b2419edb5c8ee0471b67a40403df940149.tar.gz", # 2019-06-13 ], ) @@ -49,9 +49,14 @@ remote_config_workspace() # Apple and Swift rules. http_archive( name = "build_bazel_rules_apple", - sha256 = "23792cd999f97fc97284d1c44cb1324bfdd0bc54aa68ad513fa3705aca3b1f9e", - urls = ["https://github.com/bazelbuild/rules_apple/releases/download/0.15.0/rules_apple.0.15.0.tar.gz"], + sha256 = "6efdde60c91724a2be7f89b0c0a64f01138a45e63ba5add2dca2645d981d23a1", + urls = ["https://github.com/bazelbuild/rules_apple/releases/download/0.17.2/rules_apple.0.17.2.tar.gz"], ) # https://github.com/bazelbuild/rules_apple/releases +http_archive( + name = "build_bazel_rules_swift", + sha256 = "96a86afcbdab215f8363e65a10cf023b752e90b23abf02272c4fc668fcb70311", + urls = ["https://github.com/bazelbuild/rules_swift/releases/download/0.11.1/rules_swift.0.11.1.tar.gz"], +) # https://github.com/bazelbuild/rules_swift/releases http_archive( name = "build_bazel_apple_support", sha256 = "7356dbd44dea71570a929d1d4731e870622151a5f27164d966dda97305f33471", @@ -62,11 +67,6 @@ http_archive( sha256 = "2ef429f5d7ce7111263289644d233707dba35e39696377ebab8b0bc701f7818e", urls = ["https://github.com/bazelbuild/bazel-skylib/releases/download/0.8.0/bazel-skylib.0.8.0.tar.gz"], ) # https://github.com/bazelbuild/bazel-skylib/releases -http_archive( - name = "build_bazel_rules_swift", - sha256 = "9efe9699e9765e6b4a5e063e4a08f6b163cccaf0443f775d935baf5c3cd6ed0e", - urls = ["https://github.com/bazelbuild/rules_swift/releases/download/0.9.0/rules_swift.0.9.0.tar.gz"], -) # https://github.com/bazelbuild/rules_swift/releases http_archive( name = "com_github_apple_swift_swift_protobuf", type = "zip", @@ -104,8 +104,7 @@ http_archive( build_file = "//:models.BUILD", sha256 = "7efe12a8363f09bc24d7b7a450304a15655a57a7751929b2c1593a71183bb105", urls = [ - "http://storage.googleapis.com/download.tensorflow.org/models/inception_v1.zip", - "http://download.tensorflow.org/models/inception_v1.zip", + "https://storage.googleapis.com/download.tensorflow.org/models/inception_v1.zip", ], ) @@ -114,8 +113,7 @@ http_archive( build_file = "//:models.BUILD", sha256 = "bddd81ea5c80a97adfac1c9f770e6f55cbafd7cce4d3bbe15fbeb041e6b8f3e8", urls = [ - "http://storage.googleapis.com/download.tensorflow.org/models/object_detection/ssd_mobilenet_v1_android_export.zip", - 
"http://download.tensorflow.org/models/object_detection/ssd_mobilenet_v1_android_export.zip", + "https://storage.googleapis.com/download.tensorflow.org/models/object_detection/ssd_mobilenet_v1_android_export.zip", ], ) @@ -124,8 +122,7 @@ http_archive( build_file = "//:models.BUILD", sha256 = "859edcddf84dddb974c36c36cfc1f74555148e9c9213dedacf1d6b613ad52b96", urls = [ - "http://storage.googleapis.com/download.tensorflow.org/models/mobile_multibox_v1a.zip", - "http://download.tensorflow.org/models/mobile_multibox_v1a.zip", + "https://storage.googleapis.com/download.tensorflow.org/models/mobile_multibox_v1a.zip", ], ) @@ -134,8 +131,7 @@ http_archive( build_file = "//:models.BUILD", sha256 = "3d374a730aef330424a356a8d4f04d8a54277c425e274ecb7d9c83aa912c6bfa", urls = [ - "http://storage.googleapis.com/download.tensorflow.org/models/stylize_v1.zip", - "http://download.tensorflow.org/models/stylize_v1.zip", + "https://storage.googleapis.com/download.tensorflow.org/models/stylize_v1.zip", ], ) @@ -144,7 +140,6 @@ http_archive( build_file = "//:models.BUILD", sha256 = "c3ec4fea3158eb111f1d932336351edfe8bd515bb6e87aad4f25dbad0a600d0c", urls = [ - "http://storage.googleapis.com/download.tensorflow.org/models/speech_commands_v0.01.zip", - "http://download.tensorflow.org/models/speech_commands_v0.01.zip", + "https://storage.googleapis.com/download.tensorflow.org/models/speech_commands_v0.01.zip", ], ) diff --git a/configure.py b/configure.py index 64022101e97..a01d952bb1e 100644 --- a/configure.py +++ b/configure.py @@ -1145,78 +1145,6 @@ def set_trisycl_include_dir(environ_cp): write_action_env_to_bazelrc('TRISYCL_INCLUDE_DIR', trisycl_include_dir) -def set_mpi_home(environ_cp): - """Set MPI_HOME.""" - - default_mpi_home = which('mpirun') or which('mpiexec') or '' - default_mpi_home = os.path.dirname(os.path.dirname(default_mpi_home)) - - def valid_mpi_path(mpi_home): - exists = ( - os.path.exists(os.path.join(mpi_home, 'include')) and - (os.path.exists(os.path.join(mpi_home, 'lib')) or - os.path.exists(os.path.join(mpi_home, 'lib64')) or - os.path.exists(os.path.join(mpi_home, 'lib32')))) - if not exists: - print( - 'Invalid path to the MPI Toolkit. 
%s or %s or %s or %s cannot be found' - % (os.path.join(mpi_home, 'include'), - os.path.exists(os.path.join(mpi_home, 'lib')), - os.path.exists(os.path.join(mpi_home, 'lib64')), - os.path.exists(os.path.join(mpi_home, 'lib32')))) - return exists - - _ = prompt_loop_or_load_from_env( - environ_cp, - var_name='MPI_HOME', - var_default=default_mpi_home, - ask_for_var='Please specify the MPI toolkit folder.', - check_success=valid_mpi_path, - error_msg='', - suppress_default_error=True) - - -def set_other_mpi_vars(environ_cp): - """Set other MPI related variables.""" - # Link the MPI header files - mpi_home = environ_cp.get('MPI_HOME') - symlink_force('%s/include/mpi.h' % mpi_home, 'third_party/mpi/mpi.h') - - # Determine if we use OpenMPI or MVAPICH, these require different header files - # to be included here to make bazel dependency checker happy - if os.path.exists(os.path.join(mpi_home, 'include/mpi_portable_platform.h')): - symlink_force( - os.path.join(mpi_home, 'include/mpi_portable_platform.h'), - 'third_party/mpi/mpi_portable_platform.h') - # TODO(gunan): avoid editing files in configure - sed_in_place('third_party/mpi/mpi.bzl', 'MPI_LIB_IS_OPENMPI = False', - 'MPI_LIB_IS_OPENMPI = True') - else: - # MVAPICH / MPICH - symlink_force( - os.path.join(mpi_home, 'include/mpio.h'), 'third_party/mpi/mpio.h') - symlink_force( - os.path.join(mpi_home, 'include/mpicxx.h'), 'third_party/mpi/mpicxx.h') - # TODO(gunan): avoid editing files in configure - sed_in_place('third_party/mpi/mpi.bzl', 'MPI_LIB_IS_OPENMPI = True', - 'MPI_LIB_IS_OPENMPI = False') - - if os.path.exists(os.path.join(mpi_home, 'lib/libmpi.so')): - symlink_force( - os.path.join(mpi_home, 'lib/libmpi.so'), 'third_party/mpi/libmpi.so') - elif os.path.exists(os.path.join(mpi_home, 'lib64/libmpi.so')): - symlink_force( - os.path.join(mpi_home, 'lib64/libmpi.so'), 'third_party/mpi/libmpi.so') - elif os.path.exists(os.path.join(mpi_home, 'lib32/libmpi.so')): - symlink_force( - os.path.join(mpi_home, 'lib32/libmpi.so'), 'third_party/mpi/libmpi.so') - - else: - raise ValueError( - 'Cannot find the MPI library file in %s/lib or %s/lib64 or %s/lib32' % - (mpi_home, mpi_home, mpi_home)) - - def system_specific_test_config(env): """Add default build and test flags required for TF tests to bazelrc.""" write_to_bazelrc('test --flaky_test_attempts=3') @@ -1549,11 +1477,6 @@ def main(): raise UserInputError('SYCL / CUDA / ROCm are mututally exclusive. 
' 'At most 1 GPU platform can be configured.') - set_build_var(environ_cp, 'TF_NEED_MPI', 'MPI', 'with_mpi_support', False) - if environ_cp.get('TF_NEED_MPI') == '1': - set_mpi_home(environ_cp) - set_other_mpi_vars(environ_cp) - set_cc_opt_flags(environ_cp) set_system_libs_flag(environ_cp) if is_windows(): diff --git a/tensorflow/BUILD b/tensorflow/BUILD index 61539c5e586..4d34f9849b7 100644 --- a/tensorflow/BUILD +++ b/tensorflow/BUILD @@ -7,7 +7,7 @@ load("//tensorflow:tensorflow.bzl", "tf_cc_shared_object") load("//tensorflow:tensorflow.bzl", "tf_custom_op_library_additional_deps_impl") load("//tensorflow:tensorflow.bzl", "tf_native_cc_binary") load( - "//tensorflow/core:platform/default/build_config.bzl", + "//tensorflow/core/platform:default/build_config.bzl", "tf_additional_binary_deps", ) load( @@ -356,6 +356,15 @@ config_setting( }, ) +# Flag to indicate open source build, .bazelrc always has it set to be true +config_setting( + name = "oss", + define_values = { + "open_source_build": "true", + }, + visibility = ["//visibility:public"], +) + config_setting( name = "using_cuda_clang_with_dynamic_build", define_values = { @@ -364,11 +373,20 @@ config_setting( }, ) +config_setting( + name = "build_oss_using_cuda_clang", + define_values = { + "using_cuda_clang": "true", + "open_source_build": "true", + }, +) + # Setting to use when loading kernels dynamically config_setting( name = "dynamic_loaded_kernels", define_values = { "dynamic_loaded_kernels": "true", + "framework_shared_object": "true", }, visibility = ["//visibility:public"], ) @@ -389,16 +407,18 @@ config_setting( ) config_setting( - name = "using_rocm_hipcc", + name = "build_oss_using_cuda_nvcc", define_values = { - "using_rocm_hipcc": "true", + "using_cuda_nvcc": "true", + "open_source_build": "true", }, ) config_setting( - name = "with_mpi_support", - values = {"define": "with_mpi_support=true"}, - visibility = ["//visibility:public"], + name = "using_rocm_hipcc", + define_values = { + "using_rocm_hipcc": "true", + }, ) config_setting( @@ -444,6 +464,7 @@ config_setting( package_group( name = "internal", packages = [ + "//perftools/accelerators/xprof/api/...", "//tensorflow/...", "//tensorflow_estimator/python/estimator/...", "//tensorflow_models/official/...", @@ -607,6 +628,7 @@ tf_cc_shared_object( "//tensorflow/c:version_script.lds", "//tensorflow/c/eager:c_api", "//tensorflow/core:tensorflow", + "//tensorflow/core/distributed_runtime/rpc:grpc_session", ], ) @@ -750,8 +772,8 @@ genrule( mkdir $@ for f in $(SRCS); do d="$${f%/*}" - d="$${d#bazel-out*genfiles/}" - d="$${d#*external/eigen_archive/}" + d="$${d#bazel-out/*/genfiles/}" + d="$${d#bazel-out/*/bin/}" if [[ $${d} == *local_config_* ]]; then continue @@ -763,6 +785,9 @@ genrule( if [[ $${TF_SYSTEM_LIBS:-} == *$${extname}* ]]; then continue fi + + d="$${d#*external/farmhash_archive/src}" + d="$${d#*external/$${extname}/}" fi mkdir -p "$@/$${d}" diff --git a/tensorflow/api_template_v1.__init__.py b/tensorflow/api_template_v1.__init__.py index 6d1c40a2428..2962a7a60e2 100644 --- a/tensorflow/api_template_v1.__init__.py +++ b/tensorflow/api_template_v1.__init__.py @@ -27,11 +27,27 @@ import sys as _sys # pylint: disable=g-bad-import-order from tensorflow.python import pywrap_tensorflow # pylint: disable=unused-import from tensorflow.python.tools import module_util as _module_util +from tensorflow.python.platform import tf_logging as _logging # API IMPORTS PLACEHOLDER # WRAPPER_PLACEHOLDER +if "dev" in __version__: # pylint: disable=undefined-variable + 
_logging.warning(""" + + TensorFlow's `tf-nightly` package will soon be updated to TensorFlow 2.0. + + Please upgrade your code to TensorFlow 2.0: + * https://www.tensorflow.org/beta/guide/migration_guide + + Or install the latest stable TensorFlow 1.X release: + * `pip install -U "tensorflow==1.*"` + + Otherwise your code may be broken by the change. + + """) + # Make sure directory containing top level submodules is in # the __path__ so that "from tensorflow.foo import bar" works. # We're using bitwise, but there's nothing special about that. diff --git a/tensorflow/c/BUILD b/tensorflow/c/BUILD index dd5a3a08765..ffc457de4aa 100644 --- a/tensorflow/c/BUILD +++ b/tensorflow/c/BUILD @@ -73,7 +73,7 @@ tf_cuda_library( "//tensorflow/core:core_cpu", "//tensorflow/core:framework", "//tensorflow/core:lib", - "//tensorflow/core:lib_platform", + "//tensorflow/core/platform:platform", "//tensorflow/core:op_gen_lib", "//tensorflow/core/distributed_runtime:server_lib", ], @@ -264,10 +264,10 @@ tf_cuda_library( "//tensorflow/core:core_cpu", "//tensorflow/core:framework", "//tensorflow/core:lib", - "//tensorflow/core:lib_platform", "//tensorflow/core:protos_all_cc", "//tensorflow/core/common_runtime/eager:attr_builder", "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib", + "//tensorflow/core/platform", "@com_google_absl//absl/strings", ], ) @@ -355,6 +355,7 @@ tf_cuda_library( deps = [ ":tf_status", ":tf_status_helper", + ":tf_tensor_internal", ] + select({ "//tensorflow:android": [ ":c_api_internal", @@ -467,7 +468,6 @@ tf_cuda_cc_test( "//tensorflow/core:math_ops_op_lib", "//tensorflow/core:nn_ops_op_lib", "//tensorflow/core:no_op_op_lib", - "//tensorflow/core:proto_text", "//tensorflow/core:protos_all_cc", "//tensorflow/core:sendrecv_ops_op_lib", "//tensorflow/core:spectral_ops_op_lib", @@ -503,6 +503,7 @@ tf_cc_test( "//tensorflow/core:protos_all_cc", "//tensorflow/core:test", "//tensorflow/core:test_main", + "@com_google_absl//absl/types:optional", ], ) @@ -579,7 +580,7 @@ tf_cuda_cc_test( "//tensorflow:macos": ["-headerpad_max_install_names"], "//conditions:default": [], }), - tags = ["noasan"], + tags = ["no_cuda_on_cpu_tap"], # We must ensure that the dependencies can be dynamically linked since # the shared library must be able to use core:framework. # linkstatic = tf_kernel_tests_linkstatic(), @@ -588,10 +589,11 @@ tf_cuda_cc_test( ":kernels", "//tensorflow/core:framework", "//tensorflow/core:lib", - "//tensorflow/core:proto_text", "//tensorflow/core:protos_all_cc", "//tensorflow/core:test", "//tensorflow/core:test_main", + "//tensorflow/core/kernels:ops_testutil", + "//third_party/eigen3", ], ) diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc index 62b2504a26d..ed4f10e0f77 100644 --- a/tensorflow/c/c_api.cc +++ b/tensorflow/c/c_api.cc @@ -1024,7 +1024,7 @@ void TF_SetAttrValueProto(TF_OperationDescription* desc, const char* attr_name, desc->colocation_constraints.insert(location); } } else { - desc->node_builder.Attr(attr_name, attr_value); + desc->node_builder.Attr(attr_name, std::move(attr_value)); } status->status = Status::OK(); @@ -1045,7 +1045,8 @@ static TF_Operation* TF_FinishOperationLocked(TF_OperationDescription* desc, std::vector(desc->colocation_constraints.begin(), desc->colocation_constraints.end())); } - status->status = desc->node_builder.Finalize(&desc->graph->graph, &ret); + status->status = desc->node_builder.Finalize(&desc->graph->graph, &ret, + /*consume=*/true); if (TF_GetCode(status) == TF_OK) { // Run shape inference function for newly added node. 
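The c_api.cc hunk above switches TF_SetAttrValueProto to `desc->node_builder.Attr(attr_name, std::move(attr_value))` and finalizes the builder with `/*consume=*/true`, so the attribute proto and the builder state are handed off rather than deep-copied. The toy C++ sketch below only illustrates the lvalue/rvalue overload pattern that change relies on; `ToyBuilder` and `HeavyAttr` are made-up stand-ins for illustration, not TensorFlow's actual NodeBuilder or AttrValue types.

```
// Illustrative only: why passing std::move(attr_value) lets the callee take
// ownership of a heavyweight attribute instead of copying it.
#include <iostream>
#include <string>
#include <utility>
#include <vector>

struct HeavyAttr {  // stands in for a large proto such as AttrValue
  std::vector<std::string> values;
};

class ToyBuilder {
 public:
  // Copying overload: the caller keeps its value.
  ToyBuilder& Attr(const std::string& name, const HeavyAttr& value) {
    attrs_.emplace_back(name, value);  // deep copy
    return *this;
  }
  // Moving overload: the caller is done with the value, as in the hunk above.
  ToyBuilder& Attr(const std::string& name, HeavyAttr&& value) {
    attrs_.emplace_back(name, std::move(value));  // steals the storage
    return *this;
  }
  size_t num_attrs() const { return attrs_.size(); }

 private:
  std::vector<std::pair<std::string, HeavyAttr>> attrs_;
};

int main() {
  HeavyAttr a{{"lots", "of", "strings"}};
  ToyBuilder b;
  b.Attr("copied", a);            // a is still usable afterwards
  b.Attr("moved", std::move(a));  // a's contents are transferred
  std::cout << b.num_attrs() << " attrs; moved-from source now holds "
            << a.values.size() << " strings\n";  // typically 0 after the move
  return 0;
}
```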
diff --git a/tensorflow/c/c_api_experimental.cc b/tensorflow/c/c_api_experimental.cc index ad0c4068d45..f04f0175696 100644 --- a/tensorflow/c/c_api_experimental.cc +++ b/tensorflow/c/c_api_experimental.cc @@ -24,6 +24,8 @@ limitations under the License. #include "tensorflow/compiler/jit/flags.h" #include "tensorflow/core/common_runtime/eager/attr_builder.h" #include "tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/shape_inference.h" #include "tensorflow/core/framework/tensor.pb.h" #include "tensorflow/core/graph/graph.h" #include "tensorflow/core/graph/node_builder.h" @@ -596,7 +598,10 @@ struct TF_CheckpointReader : public tensorflow::checkpoint::CheckpointReader { TF_CheckpointReader* TF_NewCheckpointReader(const char* filename, TF_Status* status) { TF_CheckpointReader* reader = new TF_CheckpointReader(filename, status); - if (!status->status.ok()) return nullptr; + if (!status->status.ok()) { + TF_DeleteCheckpointReader(reader); + return nullptr; + } const auto& m = reader->GetVariableToDataTypeMap(); for (auto it = m.begin(); it != m.end(); ++it) reader->variable_list.push_back(it->first); @@ -995,3 +1000,170 @@ TFE_TensorHandle* TFE_ConsumeInputConcreteTensorFromTraceContext( << handle->DebugString(); return ret; } + +TF_ShapeAndTypeList* TF_NewShapeAndTypeList(int num_items) { + TF_ShapeAndTypeList* result = new TF_ShapeAndTypeList; + result->num_items = num_items; + result->items = (num_items == 0) ? nullptr : new TF_ShapeAndType[num_items](); + return result; +} + +void TF_ShapeAndTypeListSetShape(TF_ShapeAndTypeList* shape_list, int index, + const int64_t* dims, int num_dims) { + DCHECK(index >= 0 && index < shape_list->num_items); + TF_ShapeAndType& shape = shape_list->items[index]; + DCHECK(shape.dims == nullptr) << "Shape at " << index << " is already set!"; + DCHECK(num_dims >= 0) << "Number of dimensions cannot be negative!"; + shape.num_dims = num_dims; + shape.dims = new int64_t[num_dims]; + memcpy(shape.dims, dims, sizeof(int64_t) * num_dims); +} + +void TF_ShapeAndTypeListSetUnknownShape(TF_ShapeAndTypeList* shape_list, + int index) { + DCHECK(index >= 0 && index < shape_list->num_items); + TF_ShapeAndType& shape = shape_list->items[index]; + DCHECK(shape.dims == nullptr) << "Shape at " << index << " is already set!"; + shape.num_dims = -1; + shape.dims = nullptr; +} + +void TF_ShapeAndTypeListSetDtype(TF_ShapeAndTypeList* shape_list, int index, + TF_DataType dtype) { + DCHECK(index >= 0 && index < shape_list->num_items); + TF_ShapeAndType& shape_and_type = shape_list->items[index]; + shape_and_type.dtype = dtype; +} + +void TF_DeleteShapeAndTypeList(TF_ShapeAndTypeList* shape_list) { + if (shape_list == nullptr) return; + for (size_t i = 0; i < shape_list->num_items; ++i) { + delete[] shape_list->items[i].dims; + } + delete[] shape_list->items; + delete shape_list; +} + +void TF_DeleteShapeAndTypeListArray(TF_ShapeAndTypeList** shape_list_array, + int num_items) { + if (shape_list_array == nullptr) return; + for (int i = 0; i < num_items; ++i) { + TF_DeleteShapeAndTypeList(shape_list_array[i]); + } + delete[] shape_list_array; +} + +namespace tensorflow { +Status TF_TensorToTensor(const TF_Tensor* src, Tensor* dst); +} // namespace tensorflow + +void TFE_InferShapes(TFE_Op* tfe_op, TF_ShapeAndTypeList* input_shapes, + TF_Tensor** input_tensors, + TF_ShapeAndTypeList* input_tensors_as_shapes, + TF_ShapeAndTypeList** input_resource_shapes_and_types, + TF_ShapeAndTypeList** 
output_shapes, + TF_ShapeAndTypeList*** output_resource_shapes_and_types, + TF_Status* status) { + using tensorflow::NodeDef; + using tensorflow::OpRegistrationData; + using tensorflow::Tensor; + using tensorflow::shape_inference::DimensionHandle; + using tensorflow::shape_inference::InferenceContext; + using tensorflow::shape_inference::ShapeAndType; + using tensorflow::shape_inference::ShapeHandle; + + const int num_inputs = input_shapes->num_items; + NodeDef node_def; + node_def.set_name(tfe_op->operation.Name()); + node_def.set_op(tfe_op->operation.Name()); + for (int i = 0; i < num_inputs; ++i) { + node_def.add_input("dummy_input"); + } + tfe_op->operation.Attrs().FillAttrValueMap(node_def.mutable_attr()); + + const tensorflow::OpRegistrationData* op_reg_data; + status->status = + tensorflow::OpRegistry::Global()->LookUp(node_def.op(), &op_reg_data); + if (!status->status.ok()) return; + + // Initialize a input_tensor vector with `nullptr` values. + std::vector input_tensors_vector(num_inputs, nullptr); + // A vector to keep track of newly created `tf::Tensor` objects. + std::vector all_input_tensors; + // Update the vector with information from `input_tensors` if provided. + if (input_tensors != nullptr) { + // Note that we take the address of the elements in `all_input_tensors` + // below. Allocate enough space so that no reallocation happens, which will + // make the pointers invalid. + all_input_tensors.reserve(num_inputs); + for (int i = 0; i < num_inputs; ++i) { + if (input_tensors[i] == nullptr) continue; + all_input_tensors.emplace_back(); + Tensor& input_tensor = all_input_tensors.back(); + status->status = TF_TensorToTensor(input_tensors[i], &input_tensor); + if (!status->status.ok()) return; + input_tensors_vector[i] = &input_tensor; + } + } + + // Create an inference context with dummy values, which will be updated later. + InferenceContext c(TF_GRAPH_DEF_VERSION, &node_def, op_reg_data->op_def, + std::vector(num_inputs), input_tensors_vector, + {}, + std::vector>>()); + + // Set input_shapes. + for (int i = 0; i < num_inputs; ++i) { + std::vector dims; + const TF_ShapeAndType& input_shape = input_shapes->items[i]; + if (input_shape.num_dims == InferenceContext::kUnknownRank) { + c.SetInput(i, c.UnknownShape()); + continue; + } + for (int j = 0; j < input_shape.num_dims; ++j) { + dims.push_back(c.MakeDim(input_shape.dims[j])); + } + c.SetInput(i, c.MakeShape(dims)); + } + + // TODO(bgogul): Handle input_tensors_as_shapes. + // TODO(bgogul): Handle input_resource_shapes_and_types. + + status->status = c.construction_status(); + if (!status->status.ok()) return; + + if (op_reg_data->shape_inference_fn == nullptr) { + status->status = + InvalidArgument("No shape inference function exists for op '", + node_def.op(), "', did you forget to define it?"); + return; + } + + status->status = c.Run(op_reg_data->shape_inference_fn); + if (!status->status.ok()) return; + + // Set output_shapes. 
+ TF_ShapeAndTypeList* output_shapes_result = + TF_NewShapeAndTypeList(c.num_outputs()); + for (int i = 0; i < c.num_outputs(); ++i) { + ShapeHandle shape_handle = c.output(i); + TF_ShapeAndType& shape = output_shapes_result->items[i]; + shape.num_dims = c.Rank(shape_handle); + if (shape.num_dims == InferenceContext::kUnknownRank) { + shape.dims = nullptr; + continue; + } + shape.dims = new int64_t[shape.num_dims]; + for (size_t j = 0; j < shape.num_dims; ++j) { + shape.dims[j] = c.Value(c.Dim(shape_handle, j)); + } + } + if (output_shapes != nullptr) *output_shapes = output_shapes_result; + + // TODO(bgogul): Set output_resource_shapes_and_types. +} + +void TF_ImportGraphDefOptionsSetValidateColocationConstraints( + TF_ImportGraphDefOptions* opts, unsigned char enable) { + opts->opts.validate_colocation_constraints = enable; +} diff --git a/tensorflow/c/c_api_experimental.h b/tensorflow/c/c_api_experimental.h index d91f3ab8b05..126db2640f6 100644 --- a/tensorflow/c/c_api_experimental.h +++ b/tensorflow/c/c_api_experimental.h @@ -343,6 +343,65 @@ TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_ConsumeInputConcreteTensorFromTraceContext(TFE_TraceContext* trace_ctx, unsigned int idx); +// Information about the shape of a Tensor and its type. +struct TF_ShapeAndType { + // Number of dimensions. -1 indicates unknown rank. + int num_dims; + // Array of dimensions. -1 indicates unknown dim. + int64_t* dims; + // The data type. May be 0 to denote unknown type. + TF_DataType dtype; +}; + +typedef struct TF_ShapeAndType TF_ShapeAndType; + +// A list of TF_ShapeAndType elements.. +struct TF_ShapeAndTypeList { + int num_items; + TF_ShapeAndType* items; +}; +typedef struct TF_ShapeAndTypeList TF_ShapeAndTypeList; + +// API for manipulating TF_ShapeAndTypeList objects. +// +TF_CAPI_EXPORT extern TF_ShapeAndTypeList* TF_NewShapeAndTypeList( + int num_shapes); +TF_CAPI_EXPORT extern void TF_ShapeAndTypeListSetShape( + TF_ShapeAndTypeList* shape_list, int index, const int64_t* dims, + int num_dims); +TF_CAPI_EXPORT extern void TF_ShapeAndTypeListSetUnknownShape( + TF_ShapeAndTypeList* shape_list, int index); +TF_CAPI_EXPORT extern void TF_ShapeAndTypeListSetDtype( + TF_ShapeAndTypeList* shape_list, int index, TF_DataType dtype); +TF_CAPI_EXPORT extern void TF_DeleteShapeAndTypeList( + TF_ShapeAndTypeList* shape_list); +TF_CAPI_EXPORT extern void TF_DeleteShapeAndTypeListArray( + TF_ShapeAndTypeList** shape_list_array, int num_items); + +// Infer shapes for the given `op`. The arguments mimic the arguments of the +// `shape_inference::InferenceContext` constructor. Note the following: +// - The inputs of the `op` are not used for shape inference. So, it is +// OK to not have the inputs properly set in `op`. See `input_tensors` +// if you want shape inference to consider the input tensors of the +// op for shape inference. +// - The types need not be set in `input_shapes` as it is not used. +// - The number of `input_tensors` should be the same as the number of items +// in `input_shapes`. +// +// The results are returned in `output_shapes` and +// `output_resource_shapes_and_types`. The caller is responsible for freeing the +// memory in these buffers by calling `TF_DeleteShapeAndTypeList`. 
+TF_CAPI_EXPORT extern void TFE_InferShapes( + TFE_Op* op, TF_ShapeAndTypeList* input_shapes, TF_Tensor** input_tensors, + TF_ShapeAndTypeList* input_tensor_as_shapes, + TF_ShapeAndTypeList** input_resource_shapes_and_types, + TF_ShapeAndTypeList** output_shapes, + TF_ShapeAndTypeList*** output_resource_shapes_and_types, TF_Status* status); + +TF_CAPI_EXPORT extern void +TF_ImportGraphDefOptionsSetValidateColocationConstraints( + TF_ImportGraphDefOptions* opts, unsigned char enable); + #ifdef __cplusplus } /* end extern "C" */ #endif diff --git a/tensorflow/c/c_api_experimental_test.cc b/tensorflow/c/c_api_experimental_test.cc index 55f3a8599fd..ed0ab7c26f8 100644 --- a/tensorflow/c/c_api_experimental_test.cc +++ b/tensorflow/c/c_api_experimental_test.cc @@ -14,6 +14,8 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/c/c_api_experimental.h" + +#include "absl/types/optional.h" #include "tensorflow/c/c_api_internal.h" #include "tensorflow/c/c_test_util.h" #include "tensorflow/c/eager/c_api.h" @@ -431,5 +433,155 @@ TEST_F(AddEagerOpToGraphTest, TFE_DeleteTensorHandle(matrix); } +class ShapeInferenceTest : public ::testing::Test { + protected: + ShapeInferenceTest() + : status_(TF_NewStatus()), tfe_context_options_(TFE_NewContextOptions()) { + tfe_context_ = TFE_NewContext(tfe_context_options_, status_); + CHECK_EQ(TF_OK, TF_GetCode(status_)) << TF_Message(status_); + } + + ~ShapeInferenceTest() override { + TFE_DeleteContextOptions(tfe_context_options_); + TFE_DeleteContext(tfe_context_); + TF_DeleteStatus(status_); + } + + // Checks the expected result of shape inference for the given `op`. + void CheckOutputShapes( + TFE_Op* op, + const std::vector>>& input_shapes_vec, + const std::vector& input_tensors, + const absl::optional>& expected_shape) { + // Create input_shapes. + TF_ShapeAndTypeList* input_shapes = + TF_NewShapeAndTypeList(input_shapes_vec.size()); + for (size_t i = 0; i < input_shapes_vec.size(); ++i) { + const auto& input_shape = input_shapes_vec[i]; + if (input_shape.has_value()) { + TF_ShapeAndTypeListSetShape(input_shapes, i, input_shape->data(), + input_shape->size()); + } else { + TF_ShapeAndTypeListSetUnknownShape(input_shapes, i); + } + } + TF_ShapeAndTypeList* output_shapes; + TFE_InferShapes(op, input_shapes, + input_tensors.empty() + ? 
nullptr + : const_cast(input_tensors.data()), + /*input_tensors_as_shapes*/ nullptr, + /*input_resource_shapes_and_types*/ nullptr, &output_shapes, + /*output_resource_shapes_and_types*/ nullptr, status_); + CHECK_EQ(TF_OK, TF_GetCode(status_)) << TF_Message(status_); + CHECK_EQ(output_shapes->num_items, 1); + + int num_dims = output_shapes->items[0].num_dims; + int64_t* dims = output_shapes->items[0].dims; + + if (!expected_shape.has_value()) { + EXPECT_EQ(num_dims, -1); + EXPECT_EQ(dims, nullptr); + return; + } + + EXPECT_EQ(num_dims, expected_shape->size()); + for (size_t i = 0; i < num_dims; ++i) { + EXPECT_EQ(dims[i], (*expected_shape)[i]); + } + TF_DeleteShapeAndTypeList(input_shapes); + TF_DeleteShapeAndTypeList(output_shapes); + } + + absl::optional> make_shape( + std::vector&& dims) const { + return absl::make_optional(dims); + } + + absl::optional> unknown_shape() const { + return absl::nullopt; + } + + static constexpr int64_t kUnknownDim = + shape_inference::InferenceContext::kUnknownDim; + TF_Status* status_; + TFE_ContextOptions* tfe_context_options_; + TFE_Context* tfe_context_; +}; + +TEST_F(ShapeInferenceTest, InfersShapesFromInputShapes) { + TFE_Op* matmul_op; + matmul_op = TFE_NewOp(tfe_context_, "MatMul", status_); + CHECK_EQ(TF_OK, TF_GetCode(status_)) << TF_Message(status_); + + // Infer shape when everything is known. + CheckOutputShapes(matmul_op, + /*input_shapes*/ {make_shape({3, 2}), make_shape({2, 4})}, + /*input_tensors*/ {}, + /*expected_shape*/ make_shape({3, 4})); + + // Infer shape when second operand has unknown shape. + CheckOutputShapes(matmul_op, + /*input_shapes*/ {make_shape({3, 2}), unknown_shape()}, + /*input_tensors*/ {}, + /*expected_shape*/ make_shape({3, kUnknownDim})); + + // Infer shape when some dimensions are unknown. + CheckOutputShapes( + matmul_op, + /*input_shapes*/ {make_shape({kUnknownDim, 2}), make_shape({2, 4})}, + /*input_tensors*/ {}, + /*expected_shape*/ make_shape({kUnknownDim, 4})); + + // Infer shape when everything is unknown. + CheckOutputShapes(matmul_op, + /*input_shapes*/ {unknown_shape(), unknown_shape()}, + /*input_tensors*/ {}, + /*expected_shape*/ make_shape({kUnknownDim, kUnknownDim})); + + TFE_DeleteOp(matmul_op); + // TODO(bgogul): Add some death tests where status is not OK. +} + +TEST_F(ShapeInferenceTest, InfersShapesFromInputTensors) { + // Prepare some tensors for shape. 
+ TF_Tensor* tensor_1X6 = Int32Tensor({1, 6}); + CHECK_EQ(TF_OK, TF_GetCode(status_)) << TF_Message(status_); + TF_Tensor* tensor_1X1X6 = Int32Tensor({1, 1, 6}); + CHECK_EQ(TF_OK, TF_GetCode(status_)) << TF_Message(status_); + + TFE_Op* reshape_op = TFE_NewOp(tfe_context_, "Reshape", status_); + CHECK_EQ(TF_OK, TF_GetCode(status_)) << TF_Message(status_); + TFE_OpSetAttrType(reshape_op, "T", TF_FLOAT); + TFE_OpSetAttrType(reshape_op, "Tshape", TF_INT32); + CheckOutputShapes(reshape_op, + /* input_shapes*/ {unknown_shape(), unknown_shape()}, + /* input_tensors*/ {nullptr, tensor_1X6}, + /*expected_shape*/ make_shape({1, 6})); + TFE_DeleteOp(reshape_op); + reshape_op = nullptr; + + TFE_Op* fill_op = TFE_NewOp(tfe_context_, "Fill", status_); + CHECK_EQ(TF_OK, TF_GetCode(status_)) << TF_Message(status_); + TFE_OpSetAttrType(fill_op, "T", TF_FLOAT); + TFE_OpSetAttrType(fill_op, "Tshape", TF_INT32); + + float five = 5.0; + TFE_TensorHandle* scalar = TestScalarTensorHandle(five); + TF_Tensor* scalarTensor = TFE_TensorHandleResolve(scalar, status_); + CHECK_EQ(TF_OK, TF_GetCode(status_)) << TF_Message(status_); + CheckOutputShapes(fill_op, + /* input_shapes*/ {unknown_shape(), unknown_shape()}, + /* input_tensors*/ {tensor_1X1X6, scalarTensor}, + /*expected_shape*/ make_shape({1, 1, 6})); + TFE_DeleteOp(fill_op); + fill_op = nullptr; + + TFE_DeleteTensorHandle(scalar); + TF_DeleteTensor(scalarTensor); + TF_DeleteTensor(tensor_1X1X6); + TF_DeleteTensor(tensor_1X6); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/c/c_api_function.cc b/tensorflow/c/c_api_function.cc index 20815813d06..bb2be3db087 100644 --- a/tensorflow/c/c_api_function.cc +++ b/tensorflow/c/c_api_function.cc @@ -41,6 +41,7 @@ namespace { // node names, so if necessary we add a suffix to make // names unique. If we have an input named "A" and a node in the function // body named "a", they will be renamed to "a" and "a_0". +// TODO(b/139886381) Unify this and the one in graph_to_functiondef.cc class NodeNameMapping { public: NodeNameMapping() = default; @@ -64,14 +65,14 @@ class NodeNameMapping { string Lookup(const string& name) const; private: - string UniquifyHelper(const string& name) const; + string UniquifyHelper(const string& name); static string Normalize(string name); // The normalized/uniquified names already used as // input names (in signature), output names (in signature), and node names // (in node_def). // This is a superset of values in name_mapping_. - std::unordered_set used_names_; + std::unordered_map used_names_; // Mapping from original node name from the graph to the normalized // and uniquified version of it. std::unordered_map name_mapping_; @@ -102,13 +103,16 @@ string NodeNameMapping::Normalize(string name) { return i == n ? "unknown" : name.substr(i); } -string NodeNameMapping::UniquifyHelper(const string& name) const { +string NodeNameMapping::UniquifyHelper(const string& name) { + auto it = used_names_.emplace(name, 0); // If the name hasn't been used yet, use it as-is. - if (used_names_.find(name) == used_names_.end()) return name; + if (it.second) return name; + // Add a suffix to name to make it unique. 
- for (int i = 0;; ++i) { - const string candidate = strings::StrCat(name, "_", i); - if (used_names_.find(candidate) == used_names_.end()) return candidate; + while (true) { + const string candidate = strings::StrCat(name, "_", it.first->second); + it.first->second++; + if (used_names_.emplace(candidate, 0).second) return candidate; } } @@ -120,16 +124,13 @@ string NodeNameMapping::GetInputName(const string& name) { string NodeNameMapping::GetOutputName(const string& name) { const string& input_name = UniquifyHelper(Normalize(name)); - // Record that we used this name, but don't add it to name_mapping_ - // since this name is not for a node. - used_names_.insert(input_name); + // Don't add it to name_mapping_ since this name is not for a node. return input_name; } string NodeNameMapping::Uniquify(const string& name) { const string uniqued = UniquifyHelper(name); name_mapping_[name] = uniqued; - used_names_.insert(uniqued); return uniqued; } @@ -139,7 +140,7 @@ Status NodeNameMapping::UseOutputName(const string& name) { return InvalidArgument("Cannot have duplicate output names. Name '", name, "' appears more than once in 'output_names' array."); } - used_names_.insert(iter, name); + used_names_.emplace(name, 0); return Status::OK(); } diff --git a/tensorflow/c/c_api_test.cc b/tensorflow/c/c_api_test.cc index 49076039fa7..c97fa93e3a5 100644 --- a/tensorflow/c/c_api_test.cc +++ b/tensorflow/c/c_api_test.cc @@ -22,15 +22,16 @@ limitations under the License. #include #include "tensorflow/c/c_test_util.h" +#include "tensorflow/c/tf_status.h" #include "tensorflow/cc/saved_model/signature_constants.h" #include "tensorflow/cc/saved_model/tag_constants.h" #include "tensorflow/core/example/example.pb.h" #include "tensorflow/core/example/feature.pb.h" #include "tensorflow/core/framework/api_def.pb.h" #include "tensorflow/core/framework/common_shape_fns.h" -#include "tensorflow/core/framework/graph.pb_text.h" +#include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/framework/kernel_def.pb.h" -#include "tensorflow/core/framework/node_def.pb_text.h" +#include "tensorflow/core/framework/node_def.pb.h" #include "tensorflow/core/framework/node_def_util.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_def.pb.h" @@ -233,7 +234,7 @@ void TestEncodeDecode(int line, const std::vector& data) { // Create C++ Tensor Tensor src(tensorflow::DT_STRING, TensorShape(dims)); for (tensorflow::int64 i = 0; i < src.NumElements(); ++i) { - src.flat()(i) = data[i]; + src.flat()(i) = data[i]; } TF_Tensor* dst = TF_TensorFromTensor(src, status); ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); @@ -243,7 +244,7 @@ void TestEncodeDecode(int line, const std::vector& data) { ASSERT_EQ(Status::OK(), TF_TensorToTensor(dst, &output)) << line; ASSERT_EQ(src.NumElements(), output.NumElements()) << line; for (tensorflow::int64 i = 0; i < src.NumElements(); ++i) { - ASSERT_EQ(data[i], output.flat()(i)) << line; + ASSERT_EQ(data[i], output.flat()(i)) << line; } TF_DeleteTensor(dst); @@ -556,7 +557,7 @@ TEST(CAPI, Graph) { EXPECT_FALSE(found_add); found_add = true; } else { - ADD_FAILURE() << "Unexpected NodeDef: " << ProtoDebugString(n); + ADD_FAILURE() << "Unexpected NodeDef: " << n.DebugString(); } } EXPECT_TRUE(found_placeholder); @@ -581,20 +582,20 @@ TEST(CAPI, Graph) { // Compare with first GraphDef + added NodeDef. 
NodeDef* added_node = graph_def.add_node(); *added_node = node_def; - EXPECT_EQ(ProtoDebugString(graph_def), ProtoDebugString(graph_def2)); + EXPECT_EQ(graph_def.DebugString(), graph_def2.DebugString()); // Look up some nodes by name. TF_Operation* neg2 = TF_GraphOperationByName(graph, "neg"); EXPECT_TRUE(neg == neg2); NodeDef node_def2; ASSERT_TRUE(GetNodeDef(neg2, &node_def2)); - EXPECT_EQ(ProtoDebugString(node_def), ProtoDebugString(node_def2)); + EXPECT_EQ(node_def.DebugString(), node_def2.DebugString()); TF_Operation* feed2 = TF_GraphOperationByName(graph, "feed"); EXPECT_TRUE(feed == feed2); ASSERT_TRUE(GetNodeDef(feed, &node_def)); ASSERT_TRUE(GetNodeDef(feed2, &node_def2)); - EXPECT_EQ(ProtoDebugString(node_def), ProtoDebugString(node_def2)); + EXPECT_EQ(node_def.DebugString(), node_def2.DebugString()); // Test iterating through the nodes of a graph. found_placeholder = false; @@ -618,7 +619,7 @@ TEST(CAPI, Graph) { found_neg = true; } else { ASSERT_TRUE(GetNodeDef(oper, &node_def)); - ADD_FAILURE() << "Unexpected Node: " << ProtoDebugString(node_def); + ADD_FAILURE() << "Unexpected Node: " << node_def.DebugString(); } } EXPECT_TRUE(found_placeholder); @@ -1385,7 +1386,7 @@ TEST(CAPI, SavedModel) { tensorflow::Example example; auto* feature_map = example.mutable_features()->mutable_feature(); (*feature_map)["x"].mutable_float_list()->add_value(i); - input.flat()(i) = example.SerializeAsString(); + input.flat()(i) = example.SerializeAsString(); } const tensorflow::string input_op_name( @@ -2498,6 +2499,38 @@ TEST(TestKernel, TestGetRegisteredKernelsForOpNoKernels) { #undef EXPECT_TF_META +TEST(CAPI, TestTensorAligned) { + int64_t dim = 7; + size_t tensor_size_bytes = dim * TF_DataTypeSize(TF_FLOAT); + TF_Tensor* a = TF_AllocateTensor( + /*dtype=*/TF_FLOAT, /*dims=*/&dim, /*num_dims=*/1, + /*len=*/tensor_size_bytes); + float* data = reinterpret_cast(TF_TensorData(a)); + for (int i = 0; i < dim; ++i) { + data[i] = 0; + } + if (EIGEN_MAX_ALIGN_BYTES > 0) { + EXPECT_TRUE(TF_TensorIsAligned(a)); + } + TF_DeleteTensor(a); +} + +TEST(CAPI, TestTensorIsNotAligned) { + // Test unaligned access via a Slice. + Tensor x(DT_FLOAT, TensorShape({30})); + x.flat().setConstant(0.0); + + // Take an unaligned slice. 
+ Tensor y = x.Slice(1, 13); + TF_Status* status = TF_NewStatus(); + TF_Tensor* a = TF_TensorFromTensor(y, status); + if (EIGEN_MAX_ALIGN_BYTES > 0) { + EXPECT_FALSE(TF_TensorIsAligned(a)); + } + TF_DeleteStatus(status); + TF_DeleteTensor(a); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD index 7eddc17a8e5..5c42e508f71 100644 --- a/tensorflow/c/eager/BUILD +++ b/tensorflow/c/eager/BUILD @@ -8,12 +8,12 @@ load( "tfe_xla_copts", ) load( - "//tensorflow/core:platform/default/build_config.bzl", + "//tensorflow/core/platform:default/build_config.bzl", "tf_additional_device_tracer_test_flags", "tf_kernel_tests_linkstatic", ) load( - "//tensorflow/core:platform/default/build_config_root.bzl", + "//tensorflow/core/platform:default/build_config_root.bzl", "tf_cuda_tests_tags", ) @@ -156,6 +156,7 @@ tf_cuda_cc_test( ], deps = [ ":c_api", + ":c_api_experimental", ":c_api_internal", ":c_api_test_util", "//tensorflow/c:c_test_util", @@ -235,9 +236,11 @@ tf_cuda_cc_test( ], args = ["--heap_check=local"] + tf_additional_device_tracer_test_flags(), + extra_copts = tfe_xla_copts(), linkstatic = tf_kernel_tests_linkstatic(), tags = tf_cuda_tests_tags() + ["nomac"], deps = [ + ":c_api", ":c_api_experimental", ":c_api_test_util", "//tensorflow/c:c_test_util", diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc index 22c1f219f38..b70f40cc46a 100644 --- a/tensorflow/c/eager/c_api.cc +++ b/tensorflow/c/eager/c_api.cc @@ -202,9 +202,11 @@ tensorflow::Status UpdateTFE_ContextWithServerDef( "Currently, TFE_NewContext only supports tensorflow::GrpcServer.")); } - LOG_AND_RETURN_IF_ERROR(grpc_server->Start()); - - tensorflow::uint64 context_id = tensorflow::random::New64(); + tensorflow::uint64 context_id = tensorflow::EagerContext::NewContextId(); + // Make master eager context accessible by local eager service, which might + // receive send tensor requests from remote workers. + LOG_AND_RETURN_IF_ERROR(grpc_server->AddMasterEagerContextToEagerService( + context_id, ctx->context)); std::vector remote_workers; grpc_server->master_env()->worker_cache->ListWorkers(&remote_workers); @@ -240,9 +242,11 @@ tensorflow::Status UpdateTFE_ContextWithServerDef( &remote_eager_workers)); // Initialize remote eager workers. - LOG_AND_RETURN_IF_ERROR(CreateRemoteContexts( - remote_workers, context_id, keep_alive_secs, server_def, - remote_eager_workers.get(), ctx->context->Async(), base_request)); + // TODO(b/138847548) Create remote eager contexts in async mode by default. 
+ LOG_AND_RETURN_IF_ERROR( + CreateRemoteContexts(remote_workers, context_id, keep_alive_secs, + server_def, remote_eager_workers.get(), + ctx->context->Executor()->Async(), base_request)); tensorflow::RemoteRendezvous* r = grpc_server->worker_env()->rendezvous_mgr->Find(context_id); @@ -261,15 +265,21 @@ tensorflow::Status UpdateTFE_ContextWithServerDef( TF_RETURN_IF_ERROR(r->Initialize(worker_session.get())); auto* device_mgr = grpc_server->worker_env()->device_mgr; - auto remote_mgr = - absl::make_unique(/*is_master=*/true); + auto remote_mgr = absl::make_unique( + /*is_master=*/true, ctx->context); - return ctx->context->InitializeRemoteMaster( + LOG_AND_RETURN_IF_ERROR(ctx->context->InitializeRemoteMaster( std::move(server), grpc_server->worker_env(), worker_session, std::move(remote_eager_workers), std::move(remote_device_mgr), remote_workers, context_id, r, device_mgr, keep_alive_secs, - worker_session->cluster_flr.get(), std::move(remote_mgr)); + worker_session->cluster_flr.get(), std::move(remote_mgr))); + + // NOTE: We start the server after all other initialization, because the + // GrpcServer cannot be destroyed after it is started. + LOG_AND_RETURN_IF_ERROR(grpc_server->Start()); #undef LOG_AND_RETURN_IF_ERROR + + return tensorflow::Status::OK(); } #endif // !IS_MOBILE_PLATFORM @@ -365,12 +375,6 @@ void TFE_ContextOptionsSetDevicePlacementPolicy( options->device_placement_policy = policy; } -TF_CAPI_EXPORT extern void TFE_ContextSetAsyncForThread(TFE_Context* ctx, - unsigned char enable, - TF_Status* status) { - status->status = ctx->context->SetAsyncForThread(enable); -} - void TFE_DeleteContextOptions(TFE_ContextOptions* options) { delete options; } TFE_Context* TFE_NewContext(const TFE_ContextOptions* opts, TF_Status* status) { @@ -455,18 +459,6 @@ extern TFE_ContextDevicePlacementPolicy TFE_ContextGetDevicePlacementPolicy( ctx->context->GetDevicePlacementPolicy()); } -void TFE_ContextAsyncWait(TFE_Context* ctx, TF_Status* status) { - status->status = ctx->context->AsyncWait(); -} - -void TFE_ContextGetStatus(TFE_Context* ctx, TF_Status* status) { - status->status = ctx->context->GetStatus(); -} - -void TFE_ContextAsyncClearError(TFE_Context* ctx) { - ctx->context->ClearAsyncError(); -} - TFE_TensorHandle* TFE_NewTensorHandle(TF_Tensor* t, TF_Status* status) { tensorflow::Tensor tensor; status->status = tensorflow::TF_TensorToTensor(t, &tensor); @@ -571,7 +563,8 @@ TF_Tensor* TFE_TensorHandleResolve(TFE_TensorHandle* h, TF_Status* status) { const tensorflow::Tensor* t = nullptr; tensorflow::TensorHandle* h_cpu = nullptr; status->status = EagerCopyToDevice( - handle, handle->Context(), handle->Context()->HostCPU(), false, &h_cpu); + handle, handle->Context(), handle->Context()->Executor(), + handle->Context()->HostCPU(), false, &h_cpu); if (!status->status.ok()) { return nullptr; } @@ -671,7 +664,7 @@ void TFE_OpAddInputList(TFE_Op* op, TFE_TensorHandle** inputs, int num_inputs, TF_AttrType TFE_OpGetAttrType(TFE_Op* op, const char* attr_name, unsigned char* is_list, TF_Status* status) { - TF_AttrType ret; + TF_AttrType ret = TF_ATTR_INT; status->status = tensorflow::AttrTypeByName(*op->operation.AttrTypes(), attr_name, &ret, is_list); return ret; @@ -683,10 +676,11 @@ TF_AttrType TFE_OpNameGetAttrType(TFE_Context* ctx, TF_Status* status) { TF_AttrType ret; TFE_Op* op = TFE_NewOp(ctx, op_or_function_name, status); - if (!status->status.ok()) { - return TF_ATTR_INT; // Same dummy return as TFE_OpGetAttrType. 
+ if (status->status.ok()) { + ret = TFE_OpGetAttrType(op, attr_name, is_list, status); + } else { + ret = TF_ATTR_INT; // Same dummy return as TFE_OpGetAttrType. } - ret = TFE_OpGetAttrType(op, attr_name, is_list, status); TFE_DeleteOp(op); return ret; } @@ -922,6 +916,7 @@ TFE_TensorHandle* TFE_TensorHandleCopyToDevice(TFE_TensorHandle* h, return nullptr; } status->status = tensorflow::EagerCopyToDevice(h->handle, ctx->context, + ctx->context->Executor(), device, false, &handle); if (status->status.ok()) { return new TFE_TensorHandle(handle); @@ -957,12 +952,10 @@ unsigned char TFE_ContextHasFunction(TFE_Context* ctx, const char* name) { void TFE_ContextEnableRunMetadata(TFE_Context* ctx) { ctx->context->SetShouldStoreGraphs(true); - ctx->context->SetShouldStoreStepStats(true); } void TFE_ContextDisableRunMetadata(TFE_Context* ctx) { ctx->context->SetShouldStoreGraphs(false); - ctx->context->SetShouldStoreStepStats(false); } } // extern "C" @@ -974,7 +967,7 @@ TFE_TensorHandle* TFE_NewTensorHandle(const tensorflow::Tensor& t, void TFE_ContextExportRunMetadata(TFE_Context* ctx, TF_Buffer* buf, TF_Status* status) { - TFE_ContextAsyncWait(ctx, status); + status->status = ctx->context->Executor()->WaitForAllPendingNodes(); if (!status->status.ok()) return; tensorflow::mutex_lock ml(*ctx->context->MetadataMu()); status->status = MessageToBuffer(*ctx->context->RunMetadataProto(), buf); diff --git a/tensorflow/c/eager/c_api.h b/tensorflow/c/eager/c_api.h old mode 100755 new mode 100644 index f6850118b89..d29e66dc1b8 --- a/tensorflow/c/eager/c_api.h +++ b/tensorflow/c/eager/c_api.h @@ -77,7 +77,7 @@ typedef enum TFE_ContextDevicePlacementPolicy { // LINT.ThenChange(//tensorflow/core/common_runtime/eager/context.h) // Sets the default execution mode (sync/async). Note that this can be -// overridden per thread using TFE_ContextSetAsyncForThread. +// overridden per thread using TFE_ContextSetExecutorForThread. TF_CAPI_EXPORT extern void TFE_ContextOptionsSetAsync(TFE_ContextOptions*, unsigned char enable); @@ -89,6 +89,9 @@ TF_CAPI_EXPORT extern void TFE_DeleteContextOptions(TFE_ContextOptions*); // "Context" under which operations/functions are executed. It encapsulates // things like the available devices, resource manager etc. +// TFE_Context must outlive all tensor handles created using it. In other +// words, TFE_DeleteContext() must be called after all tensor handles have +// been deleted (with TFE_DeleteTensorHandle). // // TODO(ashankar): Merge with TF_Session? typedef struct TFE_Context TFE_Context; @@ -115,11 +118,6 @@ TF_CAPI_EXPORT extern void TFE_ContextSetThreadLocalDevicePlacementPolicy( TF_CAPI_EXPORT extern TFE_ContextDevicePlacementPolicy TFE_ContextGetDevicePlacementPolicy(TFE_Context* ctx); -// Overrides the execution mode (sync/async) for the current thread. -TF_CAPI_EXPORT extern void TFE_ContextSetAsyncForThread(TFE_Context* ctx, - unsigned char enable, - TF_Status* status); - // A tensorflow.ServerDef specifies remote workers (in addition to the current // workers name). Operations created on this context can then be executed on // any of these remote workers by setting an appropriate device. @@ -132,25 +130,6 @@ TF_CAPI_EXPORT extern void TFE_ContextSetServerDef(TFE_Context* ctx, size_t proto_len, TF_Status* status); -// Causes the calling thread to block till all ops dispatched in async mode -// have been executed. Note that "execution" here refers to kernel execution / -// scheduling of copies, etc. 
Similar to sync execution, it doesn't guarantee -// that lower level device queues (like GPU streams) have been flushed. -// -// This call may not block for execution of ops enqueued concurrently with this -// call. -TF_CAPI_EXPORT extern void TFE_ContextAsyncWait(TFE_Context*, - TF_Status* status); - -// When an error happens, any pending operations are discarded and newly issued -// ops return an error. This call clears the error state and re-enables -// execution of newly issued ops. -// -// Note that outputs of discarded ops remain in a corrupt state and should not -// be used for future calls. -// TODO(agarwal): mark the affected handles and raise errors if they are used. -TF_CAPI_EXPORT extern void TFE_ContextAsyncClearError(TFE_Context*); - // A handle to a tensor on a device. // // Like a TF_Tensor, a TFE_TensorHandle refers to a tensor with a value, shape, diff --git a/tensorflow/c/eager/c_api_experimental.cc b/tensorflow/c/eager/c_api_experimental.cc index 32f28a0712c..a9ad77198e7 100644 --- a/tensorflow/c/eager/c_api_experimental.cc +++ b/tensorflow/c/eager/c_api_experimental.cc @@ -32,9 +32,7 @@ void TFE_OpConsumeInput(TFE_Op* op, TFE_TensorHandle* h, TF_Status* status) { op->operation.ConsumeInput(h->handle); } -TFE_Profiler* TFE_NewProfiler(TFE_ProfilerContext* ctx) { - return new TFE_Profiler(ctx); -} +TFE_Profiler* TFE_NewProfiler() { return new TFE_Profiler(); } bool TFE_ProfilerIsOk(TFE_Profiler* profiler) { return profiler->profiler->Status().ok(); @@ -55,23 +53,10 @@ void TFE_ProfilerSerializeToString(TFE_Profiler* profiler, TF_Buffer* buf, }; } -TFE_ProfilerContext* TFE_NewProfilerContext() { - return new TFE_ProfilerContext; -} - -void TFE_ProfilerContextSetEagerContext(TFE_ProfilerContext* profiler_context, - TFE_Context* eager_context) { - profiler_context->profiler_context.eager_context = eager_context->context; -} - -void TFE_DeleteProfilerContext(TFE_ProfilerContext* profiler_context) { - delete profiler_context; -} - -void TFE_StartProfilerServer(TFE_ProfilerContext* context, int port) { - // Release child thread intentionally. The child thread can be terminate by +void TFE_StartProfilerServer(int port) { + // Release child thread intentionally. The child thread can be terminated by // terminating the main thread. 
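To put the profiler changes in this file in context, a usage sketch (not code from this patch): TFE_NewProfiler no longer takes a TFE_ProfilerContext, and the profiler gRPC server needs only a port.

#include "tensorflow/c/c_api.h"
#include "tensorflow/c/eager/c_api.h"
#include "tensorflow/c/eager/c_api_experimental.h"

void SketchProfileAndSerialize() {
  TF_Status* status = TF_NewStatus();
  TFE_Profiler* profiler = TFE_NewProfiler();  // no TFE_ProfilerContext argument anymore
  if (!TFE_ProfilerIsOk(profiler)) {
    // Only one profiler can be active at a time; a second TFE_NewProfiler
    // yields a profiler whose status is not OK (see MultipleProfilerSession below).
  }

  // ... run eager ops here while the profiler is alive ...

  TF_Buffer* trace = TF_NewBuffer();
  TFE_ProfilerSerializeToString(profiler, trace, status);
  TF_DeleteBuffer(trace);
  TFE_DeleteProfiler(profiler);
  TF_DeleteStatus(status);

  // The standalone profiling service likewise needs only a port now:
  TFE_StartProfilerServer(/*port=*/6009);
}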
- tensorflow::StartProfilerServer(&context->profiler_context, port).release(); + tensorflow::StartProfilerServer(port).release(); } void TFE_ContextEnableGraphCollection(TFE_Context* ctx) { @@ -587,3 +572,30 @@ void TFE_OpSetCancellationManager(TFE_Op* op, op->operation.SetCancellationManager( &cancellation_manager->cancellation_manager); } + +TFE_Executor* TFE_NewExecutor(bool is_async) { + return new TFE_Executor(is_async); +} + +void TFE_DeleteExecutor(TFE_Executor* executor) { delete executor; } + +bool TFE_ExecutorIsAsync(TFE_Executor* executor) { + return executor->executor()->Async(); +} + +void TFE_ExecutorWaitForAllPendingNodes(TFE_Executor* executor, + TF_Status* status) { + status->status = executor->executor()->WaitForAllPendingNodes(); +} + +void TFE_ExecutorClearError(TFE_Executor* executor) { + executor->executor()->ClearError(); +} + +void TFE_ContextSetExecutorForThread(TFE_Context* ctx, TFE_Executor* executor) { + ctx->context->SetExecutorForThread(executor->executor()); +} + +TFE_Executor* TFE_ContextGetExecutorForThread(TFE_Context* ctx) { + return new TFE_Executor(ctx->context->Executor()); +} diff --git a/tensorflow/c/eager/c_api_experimental.h b/tensorflow/c/eager/c_api_experimental.h index cdf1492c0bc..e5a9459faff 100644 --- a/tensorflow/c/eager/c_api_experimental.h +++ b/tensorflow/c/eager/c_api_experimental.h @@ -25,8 +25,6 @@ extern "C" { TF_CAPI_EXPORT extern void TFE_OpConsumeInput(TFE_Op* op, TFE_TensorHandle* h, TF_Status* status); -typedef struct TFE_ProfilerContext TFE_ProfilerContext; - // A profiler which will start profiling when creating the object and will stop // when the object is destroyed. It will profile all operations run under the // given TFE_Context. Multiple instance of it can be created, but at most one @@ -34,7 +32,7 @@ typedef struct TFE_ProfilerContext TFE_ProfilerContext; // Thread-safety: TFE_Profiler is thread-safe. typedef struct TFE_Profiler TFE_Profiler; -TF_CAPI_EXPORT extern TFE_Profiler* TFE_NewProfiler(TFE_ProfilerContext* ctx); +TF_CAPI_EXPORT extern TFE_Profiler* TFE_NewProfiler(); TF_CAPI_EXPORT extern bool TFE_ProfilerIsOk(TFE_Profiler* profiler); TF_CAPI_EXPORT extern void TFE_DeleteProfiler(TFE_Profiler* profiler); @@ -44,27 +42,14 @@ TF_CAPI_EXPORT extern void TFE_ProfilerSerializeToString(TFE_Profiler* profiler, TF_Buffer* buf, TF_Status* status); -// Return a new profiler context object. -TF_CAPI_EXPORT extern TFE_ProfilerContext* TFE_NewProfilerContext(void); - -// Set the eager context in TFE_ProfilerServerOptions -TF_CAPI_EXPORT extern void TFE_ProfilerContextSetEagerContext( - TFE_ProfilerContext* profiler_context, TFE_Context* eager_context); - -// Destroy a profiler context object. -TF_CAPI_EXPORT extern void TFE_DeleteProfilerContext( - TFE_ProfilerContext* profiler_context); - // Start a profiler grpc server which listens to specified port. It will start // the server on its own thread. It can be shutdown by terminating tensorflow. // It can be used in both Eager mode and graph mode. Creating multiple profiler // server is allowed. The service defined in // tensorflow/contrib/tpu/profiler/tpu_profiler.proto. Please use -// tensorflow/contrib/tpu/profiler/capture_tpu_profile to capture tracable -// file following -// https://cloud.google.com/tpu/docs/cloud-tpu-tools#capture_trace. 
-TF_CAPI_EXPORT extern void TFE_StartProfilerServer(TFE_ProfilerContext* context, - int port); +// tensorflow/contrib/tpu/profiler/capture_tpu_profile to capture trace file +// following https://cloud.google.com/tpu/docs/cloud-tpu-tools#capture_trace. +TF_CAPI_EXPORT extern void TFE_StartProfilerServer(int port); // Enables only graph collection in RunMetadata on the functions executed from // this context. @@ -367,6 +352,51 @@ TF_CAPI_EXPORT extern void TFE_OpSetCancellationManager( TFE_Op* op, TFE_CancellationManager* cancellation_manager, TF_Status* status); +// ----------------------------------------------------------------------------- +// Eager Executor APIs. +typedef struct TFE_Executor TFE_Executor; + +// Creates a new eager Executor. Nodes in one executor are guaranteed to be +// executed in sequence. Assigning nodes to different executors allows executing +// nodes in parallel. +TF_CAPI_EXPORT extern TFE_Executor* TFE_NewExecutor(bool is_async); + +// Deletes the eager Executor without waiting for enqueued nodes. Please call +// TFE_ExecutorWaitForAllPendingNodes before calling this API if you want to +// make sure all nodes are finished. +TF_CAPI_EXPORT extern void TFE_DeleteExecutor(TFE_Executor*); + +// Returns true if the executor is in async mode. +TF_CAPI_EXPORT extern bool TFE_ExecutorIsAsync(TFE_Executor*); + +// Causes the calling thread to block till all ops dispatched in this executor +// have been executed. Note that "execution" here refers to kernel execution / +// scheduling of copies, etc. Similar to sync execution, it doesn't guarantee +// that lower level device queues (like GPU streams) have been flushed. +// +// This call may not block for execution of ops enqueued concurrently with this +// call. +TF_CAPI_EXPORT extern void TFE_ExecutorWaitForAllPendingNodes( + TFE_Executor*, TF_Status* status); + +// When an error happens, any pending operations are discarded and newly issued +// ops return an error. This call clears the error state and re-enables +// execution of newly issued ops. +// +// Note that outputs of discarded ops remain in a corrupt state and should not +// be used for future calls. +// TODO(agarwal): mark the affected handles and raise errors if they are used. +TF_CAPI_EXPORT extern void TFE_ExecutorClearError(TFE_Executor*); + +// Sets a custom Executor for current thread. All nodes created by this thread +// will be added to this Executor. It will override current executor. +TF_CAPI_EXPORT extern void TFE_ContextSetExecutorForThread(TFE_Context*, + TFE_Executor*); + +// Returns the Executor for current thread. +TF_CAPI_EXPORT extern TFE_Executor* TFE_ContextGetExecutorForThread( + TFE_Context*); + #ifdef __cplusplus } /* end extern "C" */ #endif diff --git a/tensorflow/c/eager/c_api_experimental_test.cc b/tensorflow/c/eager/c_api_experimental_test.cc index 249d6c8960b..ab76ad10adc 100644 --- a/tensorflow/c/eager/c_api_experimental_test.cc +++ b/tensorflow/c/eager/c_api_experimental_test.cc @@ -17,6 +17,7 @@ limitations under the License. 
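Before the test updates below: taken together, the executor functions declared above replace the removed TFE_ContextAsyncWait / TFE_ContextGetStatus / TFE_ContextAsyncClearError. A minimal usage sketch (not part of this patch), mirroring the pattern the updated tests use:

#include "tensorflow/c/eager/c_api.h"
#include "tensorflow/c/eager/c_api_experimental.h"

void RunOnPrivateAsyncExecutor(TFE_Context* ctx, TF_Status* status) {
  // Save the current per-thread executor and install a fresh async one.
  TFE_Executor* old_executor = TFE_ContextGetExecutorForThread(ctx);
  TFE_Executor* executor = TFE_NewExecutor(/*is_async=*/true);
  TFE_ContextSetExecutorForThread(ctx, executor);

  // ... enqueue eager ops with TFE_Execute(); they are dispatched on `executor` ...

  // Drain the executor; on failure, clear the error so new ops can run.
  TFE_ExecutorWaitForAllPendingNodes(executor, status);
  if (TF_GetCode(status) != TF_OK) {
    TFE_ExecutorClearError(executor);
  }

  // Restore the previous executor and release both handles.
  TFE_ContextSetExecutorForThread(ctx, old_executor);
  TFE_DeleteExecutor(executor);
  TFE_DeleteExecutor(old_executor);
}

Note that TFE_ContextGetExecutorForThread returns a new wrapper around the context's current executor, so the caller deletes the wrapper with TFE_DeleteExecutor without affecting the underlying executor.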
#include +#include "tensorflow/c/eager/c_api.h" #include "tensorflow/c/eager/c_api_test_util.h" #include "tensorflow/cc/profiler/profiler.h" #include "tensorflow/core/lib/monitoring/collection_registry.h" @@ -43,12 +44,9 @@ void ExecuteWithProfiling(bool async) { TFE_ContextOptions* opts = TFE_NewContextOptions(); TFE_ContextOptionsSetAsync(opts, static_cast(async)); TFE_Context* ctx = TFE_NewContext(opts, status); - TFE_ProfilerContext* profiler_context = TFE_NewProfilerContext(); - TFE_ProfilerContextSetEagerContext(profiler_context, ctx); - TFE_Profiler* profiler = TFE_NewProfiler(profiler_context); + TFE_Profiler* profiler = TFE_NewProfiler(); CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); TFE_DeleteContextOptions(opts); - TFE_DeleteProfilerContext(profiler_context); TFE_TensorHandle* m = TestMatrixTensorHandle(); TFE_Op* matmul = MatMulOp(ctx, m, m); @@ -71,8 +69,10 @@ void ExecuteWithProfiling(bool async) { ASSERT_EQ(1, num_retvals); TF_Buffer* profiler_result = TF_NewBuffer(); if (async) { - TFE_ContextAsyncWait(ctx, status); + TFE_Executor* executor = TFE_ContextGetExecutorForThread(ctx); + TFE_ExecutorWaitForAllPendingNodes(executor, status); ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteExecutor(executor); } TFE_ProfilerSerializeToString(profiler, profiler_result, status); TFE_DeleteProfiler(profiler); @@ -85,7 +85,10 @@ void ExecuteWithProfiling(bool async) { if (!gpu_device_name.empty()) { EXPECT_TRUE(HasSubstr(profile_proto_str, "/device:GPU:0")); // device name with "stream:all" is collected by Device Tracer. +#ifndef TENSORFLOW_USE_ROCM + // ROCm platform does not yet support stream level tracing EXPECT_TRUE(HasSubstr(profile_proto_str, "stream:all")); +#endif } // "/host:CPU" is collected by TraceMe EXPECT_TRUE(HasSubstr(profile_proto_str, "/host:CPU")); @@ -110,27 +113,14 @@ TEST(CAPI, ExecuteWithTracing) { ExecuteWithProfiling(false); } TEST(CAPI, ExecuteWithTracingAsync) { ExecuteWithProfiling(true); } TEST(CAPI, MultipleProfilerSession) { - TF_Status* status = TF_NewStatus(); - TFE_ContextOptions* opts = TFE_NewContextOptions(); - TFE_ContextOptionsSetAsync(opts, static_cast(false)); - TFE_Context* ctx = TFE_NewContext(opts, status); - CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - TFE_DeleteContextOptions(opts); - - TFE_ProfilerContext* profiler_context = TFE_NewProfilerContext(); - TFE_ProfilerContextSetEagerContext(profiler_context, ctx); - - TFE_Profiler* profiler1 = TFE_NewProfiler(profiler_context); + TFE_Profiler* profiler1 = TFE_NewProfiler(); EXPECT_TRUE(TFE_ProfilerIsOk(profiler1)); - TFE_Profiler* profiler2 = TFE_NewProfiler(profiler_context); + TFE_Profiler* profiler2 = TFE_NewProfiler(); EXPECT_FALSE(TFE_ProfilerIsOk(profiler2)); TFE_DeleteProfiler(profiler1); TFE_DeleteProfiler(profiler2); - TFE_DeleteProfilerContext(profiler_context); - TFE_DeleteContext(ctx); - TF_DeleteStatus(status); } TEST(CAPI, MonitoringCounter0) { @@ -307,5 +297,205 @@ TEST(CAPI, CancellationManager) { TFE_DeleteCancellationManager(c_mgr); } +TEST(CAPI, Function_ident_CPU) { + // First create a simple identity function. 
+ TF_Graph* function_graph = TF_NewGraph(); + TF_OperationDescription* arg_descr = + TF_NewOperation(function_graph, "Placeholder", "arg"); + TF_SetAttrType(arg_descr, "dtype", TF_INT32); + TF_Status* status = TF_NewStatus(); + TF_Operation* arg = TF_FinishOperation(arg_descr, status); + ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); + TF_OperationDescription* id_descr = + TF_NewOperation(function_graph, "Identity", "id"); + TF_SetAttrType(id_descr, "T", TF_INT32); + TF_AddInput(id_descr, {arg, 0}); + TF_Operation* id = TF_FinishOperation(id_descr, status); + ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); + TF_Output input{arg, 0}; + TF_Output output{id, 0}; + TF_Function* fn = + TF_GraphToFunction(function_graph, "ident", 0, 1, &id, 1, &input, 1, + &output, nullptr, nullptr, "test", status); + ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); + TF_DeleteGraph(function_graph); + TFE_ContextOptions* opts = TFE_NewContextOptions(); + TFE_Context* ctx = TFE_NewContext(opts, status); + ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); + TFE_DeleteContextOptions(opts); + TFE_ContextAddFunction(ctx, fn, status); + ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); + TF_DeleteFunction(fn); + + for (bool async : {false, true, false}) { + TFE_Executor* old_executor = TFE_ContextGetExecutorForThread(ctx); + TFE_Executor* executor = TFE_NewExecutor(async); + TFE_ContextSetExecutorForThread(ctx, executor); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + TF_Tensor* t = + TF_AllocateTensor(TF_INT32, nullptr, 0, 1 * sizeof(tensorflow::int32)); + *reinterpret_cast(TF_TensorData(t)) = 42; + TFE_TensorHandle* h = TFE_NewTensorHandle(t, status); + ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); + TF_DeleteTensor(t); + + TFE_Op* op = TFE_NewOp(ctx, "ident", status); + ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); + TFE_OpAddInput(op, h, status); + ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); + + std::vector result; + result.push_back(nullptr); + int num_retvals = 1; + TFE_Execute(op, result.data(), &num_retvals, status); + TFE_DeleteOp(op); + ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); + ASSERT_EQ(num_retvals, 1); + + TF_Tensor* r = TFE_TensorHandleResolve(result[0], status); + ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); + EXPECT_EQ(*reinterpret_cast(TF_TensorData(r)), 42); + TFE_ContextSetExecutorForThread(ctx, old_executor); + TFE_ExecutorWaitForAllPendingNodes(executor, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteExecutor(executor); + TFE_DeleteExecutor(old_executor); + TFE_DeleteTensorHandle(h); + TF_DeleteTensor(r); + TFE_DeleteTensorHandle(result[0]); + } + TFE_ContextRemoveFunction(ctx, "ident", status); + ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); + TFE_DeleteContext(ctx); + ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); + TF_DeleteStatus(status); +} + +#ifdef TENSORFLOW_EAGER_USE_XLA +TEST(CAPI, Function_ident_XLA_CPU) { + // First create a simple identity function. 
+ TF_Graph* function_graph = TF_NewGraph(); + TF_OperationDescription* arg_descr = + TF_NewOperation(function_graph, "Placeholder", "arg"); + TF_SetAttrType(arg_descr, "dtype", TF_INT32); + TF_Status* status = TF_NewStatus(); + TF_Operation* arg = TF_FinishOperation(arg_descr, status); + ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); + TF_OperationDescription* id_descr = + TF_NewOperation(function_graph, "Identity", "id"); + TF_SetAttrType(id_descr, "T", TF_INT32); + TF_AddInput(id_descr, {arg, 0}); + TF_Operation* id = TF_FinishOperation(id_descr, status); + ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); + TF_Output input{arg, 0}; + TF_Output output{id, 0}; + TF_Function* fn = + TF_GraphToFunction(function_graph, "ident", 0, 1, &id, 1, &input, 1, + &output, nullptr, nullptr, "test", status); + ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); + TF_DeleteGraph(function_graph); + TFE_ContextOptions* opts = TFE_NewContextOptions(); + TFE_Context* ctx = TFE_NewContext(opts, status); + ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); + TFE_DeleteContextOptions(opts); + TFE_ContextAddFunction(ctx, fn, status); + ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); + TF_DeleteFunction(fn); + + for (bool async : {false, true, false}) { + TFE_Executor* old_executor = TFE_ContextGetExecutorForThread(ctx); + TFE_Executor* executor = TFE_NewExecutor(async); + TFE_ContextSetExecutorForThread(ctx, executor); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + ASSERT_TRUE(TF_GetCode(status) == TF_OK); + TF_Tensor* t = + TF_AllocateTensor(TF_INT32, nullptr, 0, 1 * sizeof(tensorflow::int32)); + *reinterpret_cast(TF_TensorData(t)) = 42; + TFE_TensorHandle* h = TFE_NewTensorHandle(t, status); + ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); + TF_DeleteTensor(t); + + TFE_Op* op = TFE_NewOp(ctx, "ident", status); + ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); + TFE_OpAddInput(op, h, status); + ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); + + // Now run it via XLA. 
+ TFE_OpSetXLACompilation(op, true); + + std::vector result; + result.push_back(nullptr); + int num_retvals = 1; + TFE_Execute(op, result.data(), &num_retvals, status); + TFE_DeleteOp(op); + ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); + ASSERT_EQ(num_retvals, 1); + + TF_Tensor* r = TFE_TensorHandleResolve(result[0], status); + ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); + EXPECT_EQ(*reinterpret_cast(TF_TensorData(r)), 42); + TFE_ContextSetExecutorForThread(ctx, old_executor); + TFE_ExecutorWaitForAllPendingNodes(executor, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteExecutor(executor); + TFE_DeleteExecutor(old_executor); + TFE_DeleteTensorHandle(h); + TF_DeleteTensor(r); + TFE_DeleteTensorHandle(result[0]); + } + TFE_ContextRemoveFunction(ctx, "ident", status); + ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); + TFE_DeleteContext(ctx); + ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); + TF_DeleteStatus(status); +} +#endif // TENSORFLOW_EAGER_USE_XLA + +void Executor_MatMul_CPU(bool async) { + TF_Status* status = TF_NewStatus(); + TFE_ContextOptions* opts = TFE_NewContextOptions(); + TFE_Context* ctx = TFE_NewContext(opts, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteContextOptions(opts); + + TFE_Executor* old_executor = TFE_ContextGetExecutorForThread(ctx); + TFE_Executor* executor = TFE_NewExecutor(async); + TFE_ContextSetExecutorForThread(ctx, executor); + + TFE_TensorHandle* m = TestMatrixTensorHandle(); + TFE_Op* matmul = MatMulOp(ctx, m, m); + TFE_TensorHandle* retvals[2] = {nullptr, nullptr}; + int num_retvals = 2; + TFE_Execute(matmul, &retvals[0], &num_retvals, status); + EXPECT_EQ(1, num_retvals); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteOp(matmul); + TFE_DeleteTensorHandle(m); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + TF_Tensor* t = TFE_TensorHandleResolve(retvals[0], status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteTensorHandle(retvals[0]); + TFE_ContextSetExecutorForThread(ctx, old_executor); + TFE_ExecutorWaitForAllPendingNodes(executor, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteExecutor(executor); + TFE_DeleteExecutor(old_executor); + TFE_DeleteContext(ctx); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + float product[4] = {0}; + EXPECT_EQ(sizeof(product), TF_TensorByteSize(t)); + memcpy(&product[0], TF_TensorData(t), TF_TensorByteSize(t)); + TF_DeleteTensor(t); + EXPECT_EQ(7, product[0]); + EXPECT_EQ(10, product[1]); + EXPECT_EQ(15, product[2]); + EXPECT_EQ(22, product[3]); + TF_DeleteStatus(status); +} +TEST(CAPI, Executor_MatMul_CPU) { Executor_MatMul_CPU(false); } +TEST(CAPI, Executor_MatMul_CPUAsync) { Executor_MatMul_CPU(true); } + } // namespace } // namespace tensorflow diff --git a/tensorflow/c/eager/c_api_internal.h b/tensorflow/c/eager/c_api_internal.h index fe0c952dacb..5efed2ca76d 100644 --- a/tensorflow/c/eager/c_api_internal.h +++ b/tensorflow/c/eager/c_api_internal.h @@ -76,7 +76,14 @@ struct TFE_Context { async, device_mgr, device_mgr_owned, rendezvous, custom_kernel_creator)) {} - ~TFE_Context() { context->Unref(); } + ~TFE_Context() { + // TODO(iga): Add a separate API method to shutdown TFE_Context so that we + // don't send RPCs and block in destructor. + context->WaitForAndCloseRemoteContexts(); + // context->RefCountIsOne() should be true here. 
+ // TODO(iga): Remove EagerContext refcounting. + context->Unref(); + } tensorflow::EagerContext* context; }; @@ -130,14 +137,8 @@ struct TFE_Op { std::unique_ptr inference_ctx; }; -struct TFE_ProfilerContext { - tensorflow::ProfilerContext profiler_context; -}; - struct TFE_Profiler { - explicit TFE_Profiler(TFE_ProfilerContext* ctx) { - profiler = tensorflow::ProfilerSession::Create(&ctx->profiler_context); - } + explicit TFE_Profiler() { profiler = tensorflow::ProfilerSession::Create(); } std::unique_ptr profiler; }; @@ -291,4 +292,19 @@ struct TFE_CancellationManager { tensorflow::CancellationManager cancellation_manager; }; +struct TFE_Executor { + explicit TFE_Executor(bool async) + : owned_executor(new tensorflow::EagerExecutor(async)) {} + + explicit TFE_Executor(tensorflow::EagerExecutor* executor) + : owned_executor(nullptr), unowned_executor(executor) {} + + tensorflow::EagerExecutor* executor() { + return owned_executor == nullptr ? unowned_executor : owned_executor.get(); + } + + std::unique_ptr owned_executor; + tensorflow::EagerExecutor* unowned_executor; +}; + #endif // TENSORFLOW_C_EAGER_C_API_INTERNAL_H_ diff --git a/tensorflow/c/eager/c_api_test.cc b/tensorflow/c/eager/c_api_test.cc index e80620c9a64..d3b755fee6e 100644 --- a/tensorflow/c/eager/c_api_test.cc +++ b/tensorflow/c/eager/c_api_test.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include "absl/strings/match.h" +#include "tensorflow/c/eager/c_api_experimental.h" #include "tensorflow/c/eager/c_api_internal.h" #include "tensorflow/c/eager/c_api_test_util.h" #include "tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h" @@ -78,7 +79,10 @@ void BM_Execute(int iters, int async) { CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); } if (async) { - TFE_ContextAsyncWait(ctx, status); + TFE_Executor* executor = TFE_ContextGetExecutorForThread(ctx); + TFE_ExecutorWaitForAllPendingNodes(executor, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteExecutor(executor); } tensorflow::testing::StopTiming(); TFE_DeleteOp(matmul); @@ -89,6 +93,41 @@ void BM_Execute(int iters, int async) { } BENCHMARK(BM_Execute)->Arg(0)->Arg(1); +void BM_Execute_Identity(int iters, int async) { + tensorflow::testing::StopTiming(); + tensorflow::testing::SetLabel(async ? 
"ExecuteIdentityAsync" + : "ExecuteIdentity"); + TF_Status* status = TF_NewStatus(); + TFE_ContextOptions* opts = TFE_NewContextOptions(); + TFE_ContextOptionsSetAsync(opts, static_cast(async)); + TFE_Context* ctx = TFE_NewContext(opts, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteContextOptions(opts); + + TFE_TensorHandle* m = TestMatrixTensorHandle(); + TFE_Op* identity = IdentityOp(ctx, m); + TFE_TensorHandle* retvals[1]; + int num_retvals = 1; + tensorflow::testing::StartTiming(); + for (int i = 0; i < iters; ++i) { + TFE_Execute(identity, &retvals[0], &num_retvals, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + } + if (async) { + TFE_Executor* executor = TFE_ContextGetExecutorForThread(ctx); + TFE_ExecutorWaitForAllPendingNodes(executor, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteExecutor(executor); + } + tensorflow::testing::StopTiming(); + TFE_DeleteOp(identity); + TFE_DeleteTensorHandle(m); + TFE_DeleteContext(ctx); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TF_DeleteStatus(status); +} +BENCHMARK(BM_Execute_Identity)->Arg(0)->Arg(1); + TEST(CAPI, Context) { TF_Status* status = TF_NewStatus(); TFE_ContextOptions* opts = TFE_NewContextOptions(); @@ -196,8 +235,10 @@ void TestRemoteExecute(bool async) { TFE_DeleteOp(matmul); - TFE_ContextAsyncWait(ctx, status); - EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_Executor* executor = TFE_ContextGetExecutorForThread(ctx); + TFE_ExecutorWaitForAllPendingNodes(executor, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteExecutor(executor); TFE_DeleteContext(ctx); TF_DeleteStatus(status); @@ -282,9 +323,11 @@ void TestRemoteExecuteSilentCopies(bool async) { TFE_DeleteOp(matmul); - TFE_ContextAsyncWait(ctx, status); + TFE_Executor* executor = TFE_ContextGetExecutorForThread(ctx); + TFE_ExecutorWaitForAllPendingNodes(executor, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteExecutor(executor); TFE_DeleteContext(ctx); - EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); TF_DeleteStatus(status); @@ -298,7 +341,7 @@ TEST(CAPI, RemoteExecuteSilentCopiesAsync) { TestRemoteExecuteSilentCopies(true); } -void TestRemoteExecuteDeleteTensorAfterContext(bool async) { +void TestRemoteExecuteDeleteContextWithOutstandingRPC(bool async) { tensorflow::ServerDef server_def = GetServerDef(2); // This server def has the task index set to 0. @@ -324,33 +367,49 @@ void TestRemoteExecuteDeleteTensorAfterContext(bool async) { TFE_ContextSetServerDef(ctx, 0, serialized.data(), serialized.size(), status); EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - TFE_TensorHandle* h0_task0 = TestMatrixTensorHandle(); + // Use large matrices so that RPCs don't return before we get a chance + // to call TFE_DeleteContext. 
+ TFE_TensorHandle* h0_task0 = TestMatrixTensorHandle100x100(); + TFE_TensorHandle* h1_task0 = TestMatrixTensorHandle100x100(); const char remote_device_name[] = "/job:localhost/replica:0/task:1/device:CPU:0"; auto* h0_task1 = TFE_TensorHandleCopyToDevice(h0_task0, ctx, remote_device_name, status); ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + auto* h1_task1 = + TFE_TensorHandleCopyToDevice(h1_task0, ctx, remote_device_name, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + TFE_Op* matmul = MatMulOp(ctx, h0_task1, h1_task1); + TFE_OpSetDevice(matmul, remote_device_name, status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + TFE_TensorHandle* retvals[1]; + int num_retvals = 1; + TFE_Execute(matmul, &retvals[0], &num_retvals, status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TF_DeleteStatus(status); TFE_DeleteTensorHandle(h0_task0); - - TFE_ContextAsyncWait(ctx, status); - EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - TFE_DeleteContext(ctx); - - // Delete tensors after context is deleted. + TFE_DeleteTensorHandle(h1_task0); TFE_DeleteTensorHandle(h0_task1); + TFE_DeleteTensorHandle(h1_task1); + TFE_DeleteTensorHandle(retvals[0]); - TF_DeleteStatus(status); + TFE_DeleteOp(matmul); + + TFE_DeleteContext(ctx); // TODO(b/136478427): Figure out how to correctly shut the server down. worker_server.release(); } -TEST(CAPI, RemoteExecuteDeleteTensorAfterContext) { - TestRemoteExecuteDeleteTensorAfterContext(false); +TEST(CAPI, RemoteExecuteDeleteContextWithOutstandingRPC) { + TestRemoteExecuteDeleteContextWithOutstandingRPC(false); } -TEST(CAPI, RemoteExecuteDeleteTensorAfterContextAsync) { - TestRemoteExecuteDeleteTensorAfterContext(true); + +TEST(CAPI, RemoteExecuteDeleteContextWithOutstandingRPCAsync) { + TestRemoteExecuteDeleteContextWithOutstandingRPC(true); } void CheckTFE_TensorHandleHasFloats(TFE_TensorHandle* handle, @@ -397,8 +456,10 @@ void CheckRemoteMatMulExecutesOK(TFE_Context* ctx, TFE_DeleteOp(matmul); - TFE_ContextAsyncWait(ctx, status); - EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_Executor* executor = TFE_ContextGetExecutorForThread(ctx); + TFE_ExecutorWaitForAllPendingNodes(executor, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteExecutor(executor); TF_DeleteStatus(status); } @@ -433,8 +494,9 @@ void TestRemoteExecuteChangeServerDef(bool async) { "/job:localhost/replica:0/task:0/device:CPU:0"; CheckRemoteMatMulExecutesOK(ctx, remote_device_name, local_device_name); - TFE_ContextAsyncWait(ctx, status); - EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_Executor* executor = TFE_ContextGetExecutorForThread(ctx); + TFE_ExecutorWaitForAllPendingNodes(executor, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); // TODO(b/136478427): Figure out how to correctly shut the server down. 
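For orientation before the remaining remote-execution hunks, the remote tests in this file all follow the same setup. A condensed sketch (not from this patch; GetServerDef, TestMatrixTensorHandle, and MatMulOp are helpers from these test files, and a worker server for task 1 is assumed to be running, as the tests arrange):

void SketchRemoteMatMul() {
  tensorflow::ServerDef server_def = GetServerDef(2);  // two-task cluster
  const std::string serialized = server_def.SerializeAsString();

  TF_Status* status = TF_NewStatus();
  TFE_ContextOptions* opts = TFE_NewContextOptions();
  TFE_Context* ctx = TFE_NewContext(opts, status);
  TFE_DeleteContextOptions(opts);

  // Attach the cluster; task 0 is this process, task 1 is the remote worker.
  TFE_ContextSetServerDef(ctx, /*keep_alive_secs=*/0, serialized.data(),
                          serialized.size(), status);

  // Copy an input to the remote device and run an op there.
  // (Error checking omitted for brevity.)
  const char remote_device[] = "/job:localhost/replica:0/task:1/device:CPU:0";
  TFE_TensorHandle* h = TestMatrixTensorHandle();
  TFE_TensorHandle* h_remote =
      TFE_TensorHandleCopyToDevice(h, ctx, remote_device, status);
  TFE_Op* matmul = MatMulOp(ctx, h_remote, h_remote);
  TFE_OpSetDevice(matmul, remote_device, status);
  TFE_TensorHandle* retvals[1];
  int num_retvals = 1;
  TFE_Execute(matmul, &retvals[0], &num_retvals, status);

  // Tear down: op, handles, then the context.
  TFE_DeleteOp(matmul);
  TFE_DeleteTensorHandle(h);
  TFE_DeleteTensorHandle(h_remote);
  TFE_DeleteTensorHandle(retvals[0]);
  TFE_DeleteContext(ctx);
  TF_DeleteStatus(status);
}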
worker_server.release(); @@ -476,8 +538,9 @@ void TestRemoteExecuteChangeServerDef(bool async) { CheckRemoteMatMulExecutesOK(ctx, new_remote_device_name, new_local_device_name); - TFE_ContextAsyncWait(ctx, status); - EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_ExecutorWaitForAllPendingNodes(executor, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteExecutor(executor); TF_DeleteStatus(status); @@ -610,8 +673,11 @@ void TensorHandleCopyBetweenDevicesError(bool async) { TFE_TensorHandle* hcopy = TFE_TensorHandleCopyToDevice(hcpu, ctx, kCPUDevice, status.get()); EXPECT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); - TFE_ContextAsyncWait(ctx, status.get()); - EXPECT_EQ(TF_OK, TF_GetCode(status.get())); + + TFE_Executor* executor = TFE_ContextGetExecutorForThread(ctx); + TFE_ExecutorWaitForAllPendingNodes(executor, status.get()); + EXPECT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + TFE_DeleteExecutor(executor); TFE_DeleteTensorHandle(hcopy); TFE_DeleteTensorHandle(hcpu); if (hdevice != nullptr) TFE_DeleteTensorHandle(hdevice); @@ -740,8 +806,10 @@ void TensorHandleSilentCopy(bool async) { TF_DeleteTensor(t); TFE_DeleteTensorHandle(hcpu); - TFE_ContextAsyncWait(ctx, status.get()); - EXPECT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + TFE_Executor* executor = TFE_ContextGetExecutorForThread(ctx); + TFE_ExecutorWaitForAllPendingNodes(executor, status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + TFE_DeleteExecutor(executor); TFE_DeleteContext(ctx); } @@ -786,8 +854,10 @@ void TensorHandleSilentCopyLocal(bool async) { TF_DeleteTensor(t); TFE_DeleteTensorHandle(hcpu); - TFE_ContextAsyncWait(ctx, status.get()); - EXPECT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + TFE_Executor* executor = TFE_ContextGetExecutorForThread(ctx); + TFE_ExecutorWaitForAllPendingNodes(executor, status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + TFE_DeleteExecutor(executor); TFE_DeleteContext(ctx); } TEST(CAPI, TensorHandleSilentCopyLocal) { TensorHandleSilentCopyLocal(false); } @@ -921,8 +991,10 @@ TEST(CAPI, TensorHandleDevices) { } TFE_DeleteTensorHandle(hcpu); - TFE_ContextAsyncWait(ctx, status.get()); - EXPECT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + TFE_Executor* executor = TFE_ContextGetExecutorForThread(ctx); + TFE_ExecutorWaitForAllPendingNodes(executor, status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + TFE_DeleteExecutor(executor); TFE_DeleteContext(ctx); } @@ -1000,9 +1072,11 @@ void Execute_MatMul_CPU_Runtime_Error(bool async) { retvals[0] = nullptr; TFE_Execute(matmul2, &retvals[0], &num_retvals, status); EXPECT_NE(TF_OK, TF_GetCode(status)); - TFE_ContextAsyncClearError(ctx); - TFE_ContextAsyncWait(ctx, status); - EXPECT_EQ(TF_OK, TF_GetCode(status)); + TFE_Executor* executor = TFE_ContextGetExecutorForThread(ctx); + TFE_ExecutorClearError(executor); + TFE_ExecutorWaitForAllPendingNodes(executor, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteExecutor(executor); } // Following works in async mode since TFE_ContextAsyncClearError was called. 
TF_SetStatus(status, TF_OK, ""); @@ -1220,147 +1294,6 @@ void ExecuteWithTracing(bool async) { TEST(CAPI, ExecuteWithTracing) { ExecuteWithTracing(false); } TEST(CAPI, ExecuteWithTracingAsync) { ExecuteWithTracing(true); } -TEST(CAPI, Function_ident_CPU) { - // First create a simple identity function. - TF_Graph* function_graph = TF_NewGraph(); - TF_OperationDescription* arg_descr = - TF_NewOperation(function_graph, "Placeholder", "arg"); - TF_SetAttrType(arg_descr, "dtype", TF_INT32); - TF_Status* status = TF_NewStatus(); - TF_Operation* arg = TF_FinishOperation(arg_descr, status); - ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); - TF_OperationDescription* id_descr = - TF_NewOperation(function_graph, "Identity", "id"); - TF_SetAttrType(id_descr, "T", TF_INT32); - TF_AddInput(id_descr, {arg, 0}); - TF_Operation* id = TF_FinishOperation(id_descr, status); - ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); - TF_Output input{arg, 0}; - TF_Output output{id, 0}; - TF_Function* fn = - TF_GraphToFunction(function_graph, "ident", 0, 1, &id, 1, &input, 1, - &output, nullptr, nullptr, "test", status); - ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); - TF_DeleteGraph(function_graph); - TFE_ContextOptions* opts = TFE_NewContextOptions(); - TFE_Context* ctx = TFE_NewContext(opts, status); - ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); - TFE_DeleteContextOptions(opts); - TFE_ContextAddFunction(ctx, fn, status); - ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); - TF_DeleteFunction(fn); - - for (bool async : {false, true, false}) { - TFE_ContextSetAsyncForThread(ctx, static_cast(async), - status); - ASSERT_TRUE(TF_GetCode(status) == TF_OK); - TF_Tensor* t = - TF_AllocateTensor(TF_INT32, nullptr, 0, 1 * sizeof(tensorflow::int32)); - *reinterpret_cast(TF_TensorData(t)) = 42; - TFE_TensorHandle* h = TFE_NewTensorHandle(t, status); - ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); - TF_DeleteTensor(t); - - TFE_Op* op = TFE_NewOp(ctx, "ident", status); - ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); - TFE_OpAddInput(op, h, status); - ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); - - std::vector result; - result.push_back(nullptr); - int num_retvals = 1; - TFE_Execute(op, result.data(), &num_retvals, status); - TFE_DeleteOp(op); - ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); - ASSERT_EQ(num_retvals, 1); - - TF_Tensor* r = TFE_TensorHandleResolve(result[0], status); - ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); - EXPECT_EQ(*reinterpret_cast(TF_TensorData(r)), 42); - TFE_DeleteTensorHandle(h); - TF_DeleteTensor(r); - TFE_DeleteTensorHandle(result[0]); - } - TFE_ContextRemoveFunction(ctx, "ident", status); - ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); - TFE_DeleteContext(ctx); - ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); - TF_DeleteStatus(status); -} - -#ifdef TENSORFLOW_EAGER_USE_XLA -TEST(CAPI, Function_ident_XLA_CPU) { - // First create a simple identity function. 
- TF_Graph* function_graph = TF_NewGraph(); - TF_OperationDescription* arg_descr = - TF_NewOperation(function_graph, "Placeholder", "arg"); - TF_SetAttrType(arg_descr, "dtype", TF_INT32); - TF_Status* status = TF_NewStatus(); - TF_Operation* arg = TF_FinishOperation(arg_descr, status); - ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); - TF_OperationDescription* id_descr = - TF_NewOperation(function_graph, "Identity", "id"); - TF_SetAttrType(id_descr, "T", TF_INT32); - TF_AddInput(id_descr, {arg, 0}); - TF_Operation* id = TF_FinishOperation(id_descr, status); - ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); - TF_Output input{arg, 0}; - TF_Output output{id, 0}; - TF_Function* fn = - TF_GraphToFunction(function_graph, "ident", 0, 1, &id, 1, &input, 1, - &output, nullptr, nullptr, "test", status); - ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); - TF_DeleteGraph(function_graph); - TFE_ContextOptions* opts = TFE_NewContextOptions(); - TFE_Context* ctx = TFE_NewContext(opts, status); - ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); - TFE_DeleteContextOptions(opts); - TFE_ContextAddFunction(ctx, fn, status); - ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); - TF_DeleteFunction(fn); - - for (bool async : {false, true, false}) { - TFE_ContextSetAsyncForThread(ctx, static_cast(async), - status); - ASSERT_TRUE(TF_GetCode(status) == TF_OK); - TF_Tensor* t = - TF_AllocateTensor(TF_INT32, nullptr, 0, 1 * sizeof(tensorflow::int32)); - *reinterpret_cast(TF_TensorData(t)) = 42; - TFE_TensorHandle* h = TFE_NewTensorHandle(t, status); - ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); - TF_DeleteTensor(t); - - TFE_Op* op = TFE_NewOp(ctx, "ident", status); - ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); - TFE_OpAddInput(op, h, status); - ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); - - // Now run it via XLA. 
- TFE_OpSetXLACompilation(op, true); - - std::vector result; - result.push_back(nullptr); - int num_retvals = 1; - TFE_Execute(op, result.data(), &num_retvals, status); - TFE_DeleteOp(op); - ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); - ASSERT_EQ(num_retvals, 1); - - TF_Tensor* r = TFE_TensorHandleResolve(result[0], status); - ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); - EXPECT_EQ(*reinterpret_cast(TF_TensorData(r)), 42); - TFE_DeleteTensorHandle(h); - TF_DeleteTensor(r); - TFE_DeleteTensorHandle(result[0]); - } - TFE_ContextRemoveFunction(ctx, "ident", status); - ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); - TFE_DeleteContext(ctx); - ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); - TF_DeleteStatus(status); -} -#endif // TENSORFLOW_EAGER_USE_XLA - string MatMulFunction() { tensorflow::FunctionDef def; CHECK(tensorflow::protobuf::TextFormat::ParseFromString( @@ -1474,7 +1407,10 @@ void BM_ExecuteFunction(int iters, int async) { CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); } if (async) { - TFE_ContextAsyncWait(ctx, status); + TFE_Executor* executor = TFE_ContextGetExecutorForThread(ctx); + TFE_ExecutorWaitForAllPendingNodes(executor, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteExecutor(executor); } tensorflow::testing::StopTiming(); TFE_DeleteTensorHandle(m); diff --git a/tensorflow/c/eager/c_api_test_util.cc b/tensorflow/c/eager/c_api_test_util.cc index 17d17c0b7f7..51566b35a9f 100644 --- a/tensorflow/c/eager/c_api_test_util.cc +++ b/tensorflow/c/eager/c_api_test_util.cc @@ -85,6 +85,24 @@ TFE_TensorHandle* TestMatrixTensorHandle() { return th; } +TFE_TensorHandle* TestMatrixTensorHandle100x100() { + constexpr int64_t dims[] = {100, 100}; + constexpr int num_elements = dims[0] * dims[1]; + float data[num_elements]; + for (int i = 0; i < num_elements; ++i) { + data[i] = 1.0f; + } + TF_Tensor* t = TF_AllocateTensor( + TF_FLOAT, &dims[0], sizeof(dims) / sizeof(int64_t), sizeof(data)); + memcpy(TF_TensorData(t), &data[0], TF_TensorByteSize(t)); + TF_Status* status = TF_NewStatus(); + TFE_TensorHandle* th = TFE_NewTensorHandle(t, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TF_DeleteTensor(t); + TF_DeleteStatus(status); + return th; +} + TFE_TensorHandle* DoubleTestMatrixTensorHandle3X2() { int64_t dims[] = {3, 2}; double data[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; @@ -128,6 +146,19 @@ TFE_Op* MatMulOp(TFE_Context* ctx, TFE_TensorHandle* a, TFE_TensorHandle* b) { return op; } +TFE_Op* IdentityOp(TFE_Context* ctx, TFE_TensorHandle* a) { + TF_Status* status = TF_NewStatus(); + + TFE_Op* op = TFE_NewOp(ctx, "Identity", status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_OpAddInput(op, a, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TF_DeleteStatus(status); + TFE_OpSetAttrType(op, "T", TFE_TensorHandleDataType(a)); + + return op; +} + TFE_Op* ShapeOp(TFE_Context* ctx, TFE_TensorHandle* a) { TF_Status* status = TF_NewStatus(); diff --git a/tensorflow/c/eager/c_api_test_util.h b/tensorflow/c/eager/c_api_test_util.h index 4ff3ff4301f..28062222cf0 100644 --- a/tensorflow/c/eager/c_api_test_util.h +++ b/tensorflow/c/eager/c_api_test_util.h @@ -16,7 +16,6 @@ limitations under the License. 
#define TENSORFLOW_C_EAGER_C_API_TEST_UTIL_H_ #include "tensorflow/c/eager/c_api.h" - #include "tensorflow/core/platform/types.h" // Return a tensor handle containing a float scalar @@ -34,6 +33,9 @@ TFE_TensorHandle* DoubleTestMatrixTensorHandle(); // Return a tensor handle containing a 2x2 matrix of floats TFE_TensorHandle* TestMatrixTensorHandle(); +// Return a tensor handle containing a 100x100 matrix of floats +TFE_TensorHandle* TestMatrixTensorHandle100x100(); + // Return a tensor handle containing a 3x2 matrix of doubles TFE_TensorHandle* DoubleTestMatrixTensorHandle3X2(); @@ -43,6 +45,9 @@ TFE_TensorHandle* TestMatrixTensorHandle3X2(); // Return a matmul op multiplying `a` by `b`. TFE_Op* MatMulOp(TFE_Context* ctx, TFE_TensorHandle* a, TFE_TensorHandle* b); +// Return an identity op. +TFE_Op* IdentityOp(TFE_Context* ctx, TFE_TensorHandle* a); + // Return a shape op fetching the shape of `a`. TFE_Op* ShapeOp(TFE_Context* ctx, TFE_TensorHandle* a); diff --git a/tensorflow/c/eager/tape.h b/tensorflow/c/eager/tape.h index 0545e3f7ce0..edb2733ab32 100644 --- a/tensorflow/c/eager/tape.h +++ b/tensorflow/c/eager/tape.h @@ -18,6 +18,7 @@ limitations under the License. // Language-agnostic gradient tape. Does not perform backpropagation, just // maintains the data structures required to do so. +#include #include #include "tensorflow/core/framework/tensor_shape.h" @@ -209,7 +210,9 @@ class ForwardAccumulator { // ForwardAccumulator. explicit ForwardAccumulator( const VSpace& vspace) - : vspace_(vspace), backward_tape_(nullptr), accumulating_(false) {} + : vspace_(vspace) { + call_state_.emplace(nullptr, false); + } virtual ~ForwardAccumulator() { for (auto accumulated : accumulated_gradients_) { @@ -262,6 +265,12 @@ class ForwardAccumulator { const std::function& backward_function_getter, const std::function& backward_function_deleter); + // Returns true if `Accumulate` is active somewhere above on the stack and + // there isn't an intervening PushState. This is useful for ordering + // ForwardAccumulators, where more deeply nested accumulators should not see + // computations from less deeply nested accumulators. + bool BusyAccumulating() const { return call_state_.top().accumulating; } + // Fetches the current Jacobian-vector product associated with `tensor_id`, or // a nullptr if none is available. // @@ -276,6 +285,15 @@ class ForwardAccumulator { bool ShouldRecord(gtl::ArraySlice tensor_ids, gtl::ArraySlice dtypes); + // Temporarily push or pop transient state for this accumulator. + // + // Allows an accumulator which is currently processing an operation to + // temporarily reset its state. Without pushing and poping, accumulators + // ignore operations executed as a direct result of their own jvp + // computations. + void PushState() { call_state_.emplace(nullptr, false); } + void PopState() { call_state_.pop(); } + private: // Helper for Accumulate: uses a GradientTape to compute forward gradients // from a backward gradient function. Fills `out_grads` corresponding to @@ -283,7 +301,7 @@ class ForwardAccumulator { // // Executes the backward function in order to trace its gradient, which will // waste computation if executing eagerly (when graph building the unneeded - // computation is pruned). Temporarily sets `backward_tape_` so that + // computation is pruned). 
Temporarily sets `backward_tape` so that // Accumulate will forward op executions to the tape while the backward // function is running; this effectively adds the backward tape to the active // set (but does not require complicated callbacks to the language bindings). @@ -299,16 +317,26 @@ class ForwardAccumulator { // Not owned; provides operations on Tensors which are currently only // available in language bindings (e.g. Python). const VSpace& vspace_; - // Set temporarily while in the Accumulate method; if backward_tape_ is not - // nullptr then we forward op executions to it so Accumulate can compute a - // backward pass on its backward function. - // - // Not owned by the ForwardAccumulator. The method which sets `backward_tape_` - // keeps ownership. - GradientTape* backward_tape_; - // While the Accumulate method is running (accumulating_ is True), any op - // executions not forwarded to backward_tape_ should be ignored. - bool accumulating_; + + struct AccumulatorCallState { + AccumulatorCallState( + GradientTape* backward_tape, + bool accumulating) + : backward_tape(backward_tape), accumulating(accumulating) {} + // Set temporarily while in the Accumulate method; if backward_tape is not + // nullptr then we forward op executions to it so Accumulate can compute a + // backward pass on its backward function. + // + // Not owned by the ForwardAccumulator. The method which sets + // `backward_tape` keeps ownership. + GradientTape* backward_tape; + // While the Accumulate method is running (accumulating is True), any op + // executions not forwarded to backward_tape should be ignored. + bool accumulating; + }; + // A deque-backed stack, whose element references are not invalidated by + // pushes and pops at the back. + std::stack call_state_; }; // Template instantiations here @@ -841,12 +869,12 @@ template bool ForwardAccumulator::ShouldRecord( gtl::ArraySlice tensor_ids, gtl::ArraySlice dtypes) { - if (backward_tape_ != nullptr) { - // If we're forwarding Accumulate calls to backward_tape_'s RecordOperation, + if (call_state_.top().backward_tape != nullptr) { + // If we're forwarding Accumulate calls to backward_tape's RecordOperation, // we should also delegate ShouldRecord. - return backward_tape_->ShouldRecord(tensor_ids, dtypes); + return call_state_.top().backward_tape->ShouldRecord(tensor_ids, dtypes); } - if (accumulating_) { + if (call_state_.top().accumulating) { return false; } for (int i = 0; i < tensor_ids.size(); ++i) { @@ -878,9 +906,10 @@ ForwardAccumulator::ForwardpropFromTape( */ std::unique_ptr> tape( new GradientTape(false)); - backward_tape_ = tape.get(); + AccumulatorCallState& call_state = call_state_.top(); + call_state.backward_tape = tape.get(); auto pop_backward_tape = - gtl::MakeCleanup([this] { this->backward_tape_ = nullptr; }); + gtl::MakeCleanup([&call_state] { call_state.backward_tape = nullptr; }); std::vector forwardprop_aids; std::vector sources; std::unordered_set sources_set; @@ -955,10 +984,10 @@ Status ForwardAccumulator::Accumulate( const ForwardFunction* forward_function, const std::function& backward_function_getter, const std::function& backward_function_deleter) { - if (backward_tape_ != nullptr) { - // If backward_tape_ is not null, then this call to Accumulate is the result + if (call_state_.top().backward_tape != nullptr) { + // If backward_tape is not null, then this call to Accumulate is the result // of a still-active call to Accumulate which is running operations. 
We - // forward these operations to backward_tape_ so the outer Accumulate call + // forward these operations to backward_tape so the outer Accumulate call // can do its work. // // Rather than re-entering and delegating Accumulate like this, we could @@ -966,9 +995,9 @@ Status ForwardAccumulator::Accumulate( // (so it can deactivate itself and activate its GradientTape). Currently // that is managed by the language binding and would require relatively // messy callbacks. - backward_tape_->RecordOperation(op_type, output_tensors, input_tensor_id, - input_dtypes, backward_function_getter, - backward_function_deleter); + call_state_.top().backward_tape->RecordOperation( + op_type, output_tensors, input_tensor_id, input_dtypes, + backward_function_getter, backward_function_deleter); return Status::OK(); } if (!ShouldRecord(input_tensor_id, input_dtypes)) { @@ -1006,9 +1035,8 @@ Status ForwardAccumulator::Accumulate( // Avoid infinite recursion. Whichever forward function we run, it'll end up // executing ops, and we don't want to watch those with this accumulator. - accumulating_ = true; - auto reset_accumulating = - gtl::MakeCleanup([this] { this->accumulating_ = false; }); + call_state_.emplace(nullptr, true); + auto pop_call_state = gtl::MakeCleanup([this] { this->call_state_.pop(); }); std::vector forward_grads; if (forward_function == nullptr) { diff --git a/tensorflow/c/experimental/rendezvous.cc b/tensorflow/c/experimental/rendezvous.cc index 0ee4907b7a4..7a90bde8fe4 100644 --- a/tensorflow/c/experimental/rendezvous.cc +++ b/tensorflow/c/experimental/rendezvous.cc @@ -45,6 +45,9 @@ CRemoteRendezvous::CRemoteRendezvous(const WorkerEnv* env, int64 step_id, void CRemoteRendezvous::RecvFromRemoteAsync(const Rendezvous::ParsedKey& parsed, const Rendezvous::Args& args, DoneCallback done) { + if (args.cancellation_manager != nullptr) { + VLOG(1) << "WARNING: CRemoteRendezvous does not support cancellation."; + } TF_ParsedKey key; key.src_device = parsed.src_device.data(); key.src_device_len = parsed.src_device.size(); diff --git a/tensorflow/c/generate-pc.sh b/tensorflow/c/generate-pc.sh index 7184ad68fb7..a4d51a1b3b2 100755 --- a/tensorflow/c/generate-pc.sh +++ b/tensorflow/c/generate-pc.sh @@ -63,12 +63,26 @@ cat << EOF > tensorflow.pc prefix=${TF_PREFIX} exec_prefix=\${prefix} libdir=\${exec_prefix}/${LIBDIR} -includedir=\${prefix}/include +includedir=\${prefix}/include/tensorflow Name: TensorFlow Version: ${TF_VERSION} Description: Library for computation using data flow graphs for scalable machine learning Requires: -Libs: -L\${libdir} -ltensorflow +Libs: -L\${libdir} -ltensorflow -ltensorflow_framework +Cflags: -I\${includedir} +EOF + +cat << EOF > tensorflow_cc.pc +prefix=${TF_PREFIX} +exec_prefix=\${prefix} +libdir=\${exec_prefix}/${LIBDIR} +includedir=\${prefix}/include/tensorflow + +Name: TensorFlow +Version: ${TF_VERSION} +Description: Library for computation using data flow graphs for scalable machine learning +Requires: +Libs: -L\${libdir} -ltensorflow_cc -ltensorflow_framework Cflags: -I\${includedir} EOF diff --git a/tensorflow/c/kernels.cc b/tensorflow/c/kernels.cc index 94685c8ffaf..b067176f3be 100644 --- a/tensorflow/c/kernels.cc +++ b/tensorflow/c/kernels.cc @@ -19,6 +19,7 @@ limitations under the License. 
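Stepping back to the tape.h change above before the kernels.cc hunk continues: per-call accumulator state (the backward tape pointer plus the accumulating flag) now lives on an explicit stack, and PushState/PopState expose that stack to callers. A hypothetical RAII wrapper (not part of this patch) shows the intended bracketing:

// Hypothetical guard: gives nested computations a clean accumulator state
// and restores the previous state on scope exit.
template <typename Accumulator>
class ScopedAccumulatorState {
 public:
  explicit ScopedAccumulatorState(Accumulator* acc) : acc_(acc) {
    acc_->PushState();  // pushes {backward_tape = nullptr, accumulating = false}
  }
  ~ScopedAccumulatorState() { acc_->PopState(); }

  ScopedAccumulatorState(const ScopedAccumulatorState&) = delete;
  ScopedAccumulatorState& operator=(const ScopedAccumulatorState&) = delete;

 private:
  Accumulator* const acc_;
};

Because call_state_ is a std::stack (deque-backed), references to existing frames are not invalidated by pushes and pops at the back, which is what the emplace/pop pairs in Accumulate and ForwardpropFromTape rely on.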
#include "tensorflow/c/c_api_internal.h" #include "tensorflow/c/tf_status_helper.h" +#include "tensorflow/c/tf_tensor_internal.h" #include "tensorflow/core/framework/kernel_def_builder.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" @@ -189,8 +190,8 @@ void TF_GetInput(TF_OpKernelContext* ctx, int i, TF_Tensor** tensor, void TF_SetOutput(TF_OpKernelContext* ctx, int i, const TF_Tensor* tensor, TF_Status* status) { auto* cc_ctx = reinterpret_cast<::tensorflow::OpKernelContext*>(ctx); - if (i < 0 || i >= cc_ctx->num_inputs()) { - TF_SetStatus(status, TF_OUT_OF_RANGE, "input index out of range"); + if (i < 0 || i >= cc_ctx->num_outputs()) { + TF_SetStatus(status, TF_OUT_OF_RANGE, "output index out of range"); return; } ::tensorflow::Tensor cc_tensor; @@ -240,3 +241,14 @@ TF_DataType TF_ExpectedOutputDataType(TF_OpKernelContext* ctx, int i) { int64_t TF_StepId(TF_OpKernelContext* ctx) { return reinterpret_cast<::tensorflow::OpKernelContext*>(ctx)->step_id(); } + +TF_Tensor* TF_AllocateOutput(TF_OpKernelContext* context, int index, + TF_DataType dtype, int64_t* dims, int num_dims, + size_t len) { + auto* cc_ctx = reinterpret_cast<::tensorflow::OpKernelContext*>(context); + tensorflow::AllocatorAttributes attr = cc_ctx->output_alloc_attr(index); + auto* allocator = cc_ctx->get_allocator(attr); + void* data = tensorflow::allocate_tensor("TF_AllocateOutput", len, allocator); + return TF_NewTensor(dtype, dims, num_dims, data, len, + tensorflow::deallocate_buffer, allocator); +} diff --git a/tensorflow/c/kernels.h b/tensorflow/c/kernels.h index a192437a52f..8d0518ae170 100644 --- a/tensorflow/c/kernels.h +++ b/tensorflow/c/kernels.h @@ -180,6 +180,16 @@ TF_CAPI_EXPORT extern void TF_OpKernelConstruction_GetAttrInt32( TF_OpKernelConstruction* ctx, const char* attr_name, int32_t* val, TF_Status* status); +// Allocates Tensor for output at given index. Caller takes ownership of +// returned TF_Tensor and should deallocate it using TF_DeleteTensor(tensor). +// +// This function should be used to allocate outputs inside kernel +// compute function. +TF_CAPI_EXPORT TF_Tensor* TF_AllocateOutput(TF_OpKernelContext* context, + int index, TF_DataType dtype, + int64_t* dims, int num_dims, + size_t len); + #ifdef __cplusplus } /* end extern "C" */ #endif diff --git a/tensorflow/c/kernels_test.cc b/tensorflow/c/kernels_test.cc index 0e65d18ec81..05277b6c12c 100644 --- a/tensorflow/c/kernels_test.cc +++ b/tensorflow/c/kernels_test.cc @@ -12,17 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
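Before the kernels_test.cc additions below, a minimal CPU kernel sketch showing the new TF_AllocateOutput in context (hypothetical op and kernel names, not part of this patch; the device tests that follow exercise the same calls on CPU and, when built with CUDA, on GPU):

#include "tensorflow/c/c_api.h"
#include "tensorflow/c/kernels.h"

// Hypothetical compute function: writes a single float 1.0 to output 0.
// (Assumes an op named "FillOne" with one float output has been registered.)
static void FillOneCompute(void* kernel, TF_OpKernelContext* ctx) {
  int64_t dim = 1;
  TF_Tensor* out = TF_AllocateOutput(ctx, /*index=*/0, TF_FLOAT, &dim,
                                     /*num_dims=*/1,
                                     /*len=*/TF_DataTypeSize(TF_FLOAT));
  *static_cast<float*>(TF_TensorData(out)) = 1.0f;  // CPU-only sketch

  TF_Status* s = TF_NewStatus();
  TF_SetOutput(ctx, 0, out, s);
  TF_DeleteStatus(s);
  TF_DeleteTensor(out);  // the caller owns the allocated tensor
}

// Registration, following the pattern used elsewhere in kernels_test.cc.
static void RegisterFillOneKernel() {
  TF_KernelBuilder* builder = TF_NewKernelBuilder(
      "FillOne", "CPU", /*create_func=*/nullptr, &FillOneCompute,
      /*delete_func=*/nullptr);
  TF_Status* status = TF_NewStatus();
  TF_RegisterKernelBuilder("FillOneKernel", builder, status);
  TF_DeleteStatus(status);
}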
==============================================================================*/ +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#define EIGEN_USE_GPU +#endif #include "tensorflow/c/kernels.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/c/c_api.h" #include "tensorflow/core/framework/attr_value.pb.h" #include "tensorflow/core/framework/kernel_def.pb.h" -#include "tensorflow/core/framework/node_def.pb_text.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/node_def_builder.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/kernels/ops_testutil.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/test.h" @@ -309,4 +315,144 @@ TEST(TestKernel, TestHostMemory) { TF_DeleteKernelBuilder(builder); ASSERT_TRUE(delete_called); } + +class DeviceKernelOpTest : public OpsTestBase { + protected: + void SetupOp(const char* op_name, const char* kernel_name, + void (*compute_func)(void*, TF_OpKernelContext*)) { + TF_KernelBuilder* builder = TF_NewKernelBuilder( + op_name, device_name_, nullptr, compute_func, nullptr); + TF_Status* status = TF_NewStatus(); + TF_RegisterKernelBuilder(kernel_name, builder, status); + EXPECT_EQ(TF_OK, TF_GetCode(status)); + TF_DeleteStatus(status); + +#if GOOGLE_CUDA + std::unique_ptr device( + DeviceFactory::NewDevice(device_name_, {}, "/job:a/replica:0/task:0")); + OpsTestBase::SetDevice(DEVICE_GPU, std::move(device)); +#endif + TF_ASSERT_OK(NodeDefBuilder(op_name, op_name).Finalize(node_def())); + TF_ASSERT_OK(InitOp()); + } + +#if GOOGLE_CUDA + const char* device_name_ = tensorflow::DEVICE_GPU; +#else + const char* device_name_ = tensorflow::DEVICE_CPU; +#endif +}; + +REGISTER_OP("AllocateOutputOp1").Output("output1: float"); + +TEST_F(DeviceKernelOpTest, TestAllocateOutputSizeOne) { + auto my_compute_func = [](void* kernel, TF_OpKernelContext* ctx) { + // Allocate output + int64_t dim = 1; + size_t tensor_size_bytes = TF_DataTypeSize(TF_FLOAT); + TF_Tensor* output = TF_AllocateOutput( + /*context=*/ctx, /*index=*/0, /*dtype=*/TF_FLOAT, /*dims=*/&dim, + /*num_dims=*/1, /*len=*/tensor_size_bytes); + EXPECT_EQ(TF_FLOAT, TF_TensorType(output)); + EXPECT_EQ(1, TF_NumDims(output)); + EXPECT_EQ(1, TF_Dim(output, 0)); + + // Set output to 3 + float* data = reinterpret_cast(TF_TensorData(output)); + float value = 3.0f; +#if GOOGLE_CUDA + OpKernelContext* cc_ctx = reinterpret_cast(ctx); + cc_ctx->eigen_gpu_device().memcpyHostToDevice(data, &value, + tensor_size_bytes); +#else + *data = value; +#endif + + TF_Status* s = TF_NewStatus(); + TF_SetOutput(ctx, 0, output, s); + EXPECT_EQ(TF_OK, TF_GetCode(s)); + + TF_DeleteStatus(s); + TF_DeleteTensor(output); + }; + + SetupOp("AllocateOutputOp1", "AllocateOutput1", my_compute_func); + + TF_ASSERT_OK(RunOpKernel()); + Tensor* output = GetOutput(0); + EXPECT_EQ("Tensor", + output->DebugString(100)); +} + +REGISTER_OP("AllocateOutputOp0").Output("output1: float"); + +TEST_F(DeviceKernelOpTest, TestAllocateEmptyOutput) { + auto my_compute_func = [](void* kernel, TF_OpKernelContext* ctx) { + // Allocate empty output + int64_t dim = 0; + TF_Tensor* output = TF_AllocateOutput( + /*context=*/ctx, /*index=*/0, /*dtype=*/TF_FLOAT, /*dims=*/&dim, + /*num_dims=*/1, /*len=*/0); + + EXPECT_EQ(TF_FLOAT, TF_TensorType(output)); + EXPECT_EQ(1, TF_NumDims(output)); + EXPECT_EQ(0, 
TF_Dim(output, 0)); + + TF_Status* s = TF_NewStatus(); + TF_SetOutput(ctx, 0, output, s); + EXPECT_EQ(TF_OK, TF_GetCode(s)); + + TF_DeleteStatus(s); + TF_DeleteTensor(output); + }; + + SetupOp("AllocateOutputOp0", "AllocateOutput0", my_compute_func); + + TF_ASSERT_OK(RunOpKernel()); + Tensor* output = GetOutput(0); + EXPECT_EQ("Tensor", + output->DebugString(100)); +} + +REGISTER_OP("AllocateOutputOp2x3").Output("output1: float"); + +TEST_F(DeviceKernelOpTest, TestAllocateOutputSize2x3) { + auto my_compute_func = [](void* kernel, TF_OpKernelContext* ctx) { + // Allocate 2x3 output + int64_t dim[2] = {2, 3}; + size_t tensor_size_bytes = 6 * TF_DataTypeSize(TF_FLOAT); + TF_Tensor* output = TF_AllocateOutput( + /*context=*/ctx, /*index=*/0, /*dtype=*/TF_FLOAT, /*dims=*/dim, + /*num_dims=*/2, /*len=*/tensor_size_bytes); + EXPECT_EQ(TF_FLOAT, TF_TensorType(output)); + EXPECT_EQ(2, TF_NumDims(output)); + EXPECT_EQ(2, TF_Dim(output, 0)); + EXPECT_EQ(3, TF_Dim(output, 1)); + + // Set output to [1 2 3 4 5 6] + void* data = TF_TensorData(output); + float value[6] = {1, 2, 3, 4, 5, 6}; +#if GOOGLE_CUDA + OpKernelContext* cc_ctx = reinterpret_cast(ctx); + cc_ctx->eigen_gpu_device().memcpyHostToDevice(data, value, + tensor_size_bytes); +#else + memcpy(data, value, tensor_size_bytes); +#endif + + TF_Status* s = TF_NewStatus(); + TF_SetOutput(ctx, 0, output, s); + EXPECT_EQ(TF_OK, TF_GetCode(s)); + + TF_DeleteStatus(s); + TF_DeleteTensor(output); + }; + + SetupOp("AllocateOutputOp2x3", "AllocateOutput2x3", my_compute_func); + + TF_ASSERT_OK(RunOpKernel()); + Tensor* output = GetOutput(0); + EXPECT_EQ("Tensor", + output->DebugString(100)); +} } // namespace tensorflow diff --git a/tensorflow/c/tf_tensor.cc b/tensorflow/c/tf_tensor.cc index deb36166a47..2ad778d6057 100644 --- a/tensorflow/c/tf_tensor.cc +++ b/tensorflow/c/tf_tensor.cc @@ -31,6 +31,37 @@ using tensorflow::TensorBuffer; using tensorflow::errors::FailedPrecondition; using tensorflow::errors::InvalidArgument; +namespace tensorflow { +void* allocate_tensor(const char* operation, size_t len, Allocator* allocator) { + void* data = allocator->AllocateRaw(EIGEN_MAX_ALIGN_BYTES, len); + if (LogMemory::IsEnabled() && data != nullptr) { + LogMemory::RecordRawAllocation( + operation, LogMemory::EXTERNAL_TENSOR_ALLOCATION_STEP_ID, len, data, + allocator); + } + return data; +} + +void* allocate_tensor(const char* operation, size_t len) { + return allocate_tensor(operation, len, cpu_allocator()); +} + +void deallocate_buffer(void* data, size_t len, void* arg) { + Allocator* allocator = nullptr; + if (arg == nullptr) { + allocator = cpu_allocator(); + } else { + allocator = reinterpret_cast(arg); + } + if (LogMemory::IsEnabled() && data != nullptr) { + LogMemory::RecordRawDeallocation( + "TensorFlow C Api", LogMemory::EXTERNAL_TENSOR_ALLOCATION_STEP_ID, data, + allocator, false); + } + allocator->DeallocateRaw(data); +} +} // namespace tensorflow + namespace { class TF_ManagedBuffer : public TensorBuffer { public: @@ -63,36 +94,15 @@ class TF_ManagedBuffer : public TensorBuffer { bool OwnsMemory() const override { return false; } }; -void* allocate_tensor(const char* operation, size_t len) { - void* data = - tensorflow::cpu_allocator()->AllocateRaw(EIGEN_MAX_ALIGN_BYTES, len); - if (tensorflow::LogMemory::IsEnabled() && data != nullptr) { - tensorflow::LogMemory::RecordRawAllocation( - operation, tensorflow::LogMemory::EXTERNAL_TENSOR_ALLOCATION_STEP_ID, - len, data, tensorflow::cpu_allocator()); - } - return data; -} - -void deallocate_buffer(void* data, 
size_t len, void* arg) { - if (tensorflow::LogMemory::IsEnabled() && data != nullptr) { - tensorflow::LogMemory::RecordRawDeallocation( - "TensorFlow C Api", - tensorflow::LogMemory::EXTERNAL_TENSOR_ALLOCATION_STEP_ID, data, - tensorflow::cpu_allocator(), false); - } - tensorflow::cpu_allocator()->DeallocateRaw(data); -} - } // namespace -TF_Tensor::~TF_Tensor() { buffer->Unref(); } - TF_Tensor* TF_AllocateTensor(TF_DataType dtype, const int64_t* dims, int num_dims, size_t len) { - void* data = allocate_tensor("TF_AllocateTensor", len); - return TF_NewTensor(dtype, dims, num_dims, data, len, deallocate_buffer, - nullptr); + void* data = tensorflow::allocate_tensor("TF_AllocateTensor", len, + tensorflow::cpu_allocator()); + return TF_NewTensor(dtype, dims, num_dims, data, len, + tensorflow::deallocate_buffer, + tensorflow::cpu_allocator()); } TF_Tensor* TF_NewTensor(TF_DataType dtype, const int64_t* dims, int num_dims, @@ -117,8 +127,8 @@ TF_Tensor* TF_NewTensor(TF_DataType dtype, const int64_t* dims, int num_dims, // // Other types have the same representation, so copy only if it is safe to // do so. - buf = new TF_ManagedBuffer(allocate_tensor("TF_NewTensor", len), len, - deallocate_buffer, nullptr); + buf = new TF_ManagedBuffer(tensorflow::allocate_tensor("TF_NewTensor", len), + len, tensorflow::deallocate_buffer, nullptr); std::memcpy(buf->data(), data, len); // Free the original buffer. deallocator(data, len, deallocator_arg); @@ -126,9 +136,12 @@ TF_Tensor* TF_NewTensor(TF_DataType dtype, const int64_t* dims, int num_dims, buf = new TF_ManagedBuffer(data, len, deallocator, deallocator_arg); } - TF_Tensor* ret = new TF_Tensor{dtype, tensorflow::TensorShape(dimvec), buf}; + TF_Tensor* ret = + new TF_Tensor{Tensor(static_cast(dtype), + tensorflow::TensorShape(dimvec), buf)}; + buf->Unref(); size_t elem_size = TF_DataTypeSize(dtype); - if (elem_size > 0 && len < (elem_size * ret->shape.num_elements())) { + if (elem_size > 0 && len < (elem_size * ret->tensor.NumElements())) { delete ret; return nullptr; } @@ -139,7 +152,7 @@ TF_Tensor* TF_TensorMaybeMove(TF_Tensor* tensor) { // It is safe to move the Tensor if and only if we own the unique reference to // it. In that case, we might as well not delete and reallocate, but a future // implementation might need to do so. 
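The new TF_AllocateOutput entry point added to tensorflow/c/kernels.h above is meant to be called from a kernel's compute callback. Below is a minimal, hedged sketch of that usage for a hypothetical "MyFill" CPU kernel; the names and shapes are illustrative and error handling is trimmed.

#include "tensorflow/c/c_api.h"
#include "tensorflow/c/kernels.h"
#include "tensorflow/c/tf_tensor.h"

// Sketch only (not part of the diff): allocate output 0, fill it, publish it.
static void MyFillCompute(void* kernel, TF_OpKernelContext* ctx) {
  int64_t dim = 4;
  size_t len = 4 * TF_DataTypeSize(TF_FLOAT);
  TF_Tensor* out = TF_AllocateOutput(ctx, /*index=*/0, TF_FLOAT, &dim,
                                     /*num_dims=*/1, len);
  float* data = static_cast<float*>(TF_TensorData(out));
  for (int i = 0; i < 4; ++i) data[i] = 1.0f;  // fill with ones (CPU only)
  TF_Status* s = TF_NewStatus();
  TF_SetOutput(ctx, 0, out, s);  // publish the tensor as output 0
  TF_DeleteStatus(s);
  TF_DeleteTensor(out);          // the caller owns the returned TF_Tensor
}

On a GPU device the buffer returned by TF_TensorData lives in device memory, so it must be written with a device copy, as the memcpyHostToDevice calls in kernels_test.cc above do, rather than through a host pointer.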
- TensorBuffer* buf = tensor->buffer; + TensorBuffer* buf = tensorflow::TensorCApi::Buffer(tensor->tensor); if (buf->RefCountIsOne() && buf->root_buffer()->RefCountIsOne() && buf->OwnsMemory()) { return tensor; @@ -149,13 +162,23 @@ TF_Tensor* TF_TensorMaybeMove(TF_Tensor* tensor) { void TF_DeleteTensor(TF_Tensor* t) { delete t; } -TF_DataType TF_TensorType(const TF_Tensor* t) { return t->dtype; } -int TF_NumDims(const TF_Tensor* t) { return t->shape.dims(); } -int64_t TF_Dim(const TF_Tensor* t, int dim_index) { - return static_cast(t->shape.dim_size(dim_index)); +TF_DataType TF_TensorType(const TF_Tensor* t) { + return static_cast(t->tensor.dtype()); +} + +int TF_NumDims(const TF_Tensor* t) { return t->tensor.dims(); } + +int64_t TF_Dim(const TF_Tensor* t, int dim_index) { + return static_cast(t->tensor.dim_size(dim_index)); +} + +size_t TF_TensorByteSize(const TF_Tensor* t) { + return tensorflow::TensorCApi::Buffer(t->tensor)->size(); +} + +void* TF_TensorData(const TF_Tensor* t) { + return tensorflow::TensorCApi::Buffer(t->tensor)->data(); } -size_t TF_TensorByteSize(const TF_Tensor* t) { return t->buffer->size(); } -void* TF_TensorData(const TF_Tensor* t) { return t->buffer->data(); } int64_t TF_TensorElementCount(const TF_Tensor* t) { int64_t result = 1; @@ -166,63 +189,17 @@ int64_t TF_TensorElementCount(const TF_Tensor* t) { return result; } -// Returns the number of elements that would be present in a tensor with the -// given shape. -static int64_t ShapeNumElements(const int64_t* dims, int num_dims) { - int64_t result = 1; - for (int dim = 0; dim < num_dims; ++dim) { - result *= dims[dim]; - } - return result; -} - -static void UnrefIfNonNull(::tensorflow::TensorBuffer* buf) { - if (buf != nullptr) { - buf->Unref(); - } -} - -static void RefIfNonNull(::tensorflow::TensorBuffer* buf) { - if (buf != nullptr) { - buf->Ref(); - } -} - void TF_TensorBitcastFrom(const TF_Tensor* from, TF_DataType type, TF_Tensor* to, const int64_t* new_dims, int num_new_dims, TF_Status* status) { TF_SetStatus(status, TF_OK, ""); - size_t in_size = TF_DataTypeSize(TF_TensorType(from)); - if (in_size == 0) { - TF_SetStatus(status, TF_INVALID_ARGUMENT, - "input tensor has a zero-sized data type"); - return; - } - size_t out_size = TF_DataTypeSize(type); - if (out_size == 0) { - TF_SetStatus(status, TF_INVALID_ARGUMENT, - "output tensor has a zero-sized data type"); - return; - } - - if (ShapeNumElements(new_dims, num_new_dims) * out_size != - TF_TensorElementCount(from) * in_size) { - TF_SetStatus(status, TF_INVALID_ARGUMENT, - "input tensor is not compatible with output shape"); - return; - } - - tensorflow::TensorShapeProto p; + tensorflow::TensorShape s; for (int i = 0; i < num_new_dims; ++i) { - p.add_dim()->set_size(new_dims[i]); - } - to->shape = tensorflow::TensorShape(p); - to->dtype = type; - if (to->buffer != from->buffer) { - UnrefIfNonNull(to->buffer); - to->buffer = from->buffer; - RefIfNonNull(to->buffer); + s.AddDim(new_dims[i]); } + Status cc_status(to->tensor.BitcastFrom( + from->tensor, static_cast(type), s)); + Set_TF_Status_from_Status(status, cc_status); } // -------------------------------------------------------------------------- @@ -332,17 +309,19 @@ TF_Tensor* TF_TensorFromTensor(const tensorflow::Tensor& src, return t; } if (src.dtype() != tensorflow::DT_STRING) { - TensorBuffer* buf = tensorflow::TensorCApi::Buffer(src); - buf->Ref(); - return new TF_Tensor{static_cast(src.dtype()), src.shape(), - buf}; + auto* result = new TF_Tensor(); + if (!result->tensor.CopyFrom(src, 
src.shape())) { + delete result; + return nullptr; + } + return result; } // DT_STRING tensors require a copying since TF_Tensor.buffer expects a flatly // encoded sequence of strings. // Compute bytes needed for encoding. size_t size = 0; - const auto& srcarray = src.flat(); + const auto& srcarray = src.flat(); for (int i = 0; i < srcarray.size(); ++i) { const string& s = srcarray(i); // uint64 starting_offset, TF_StringEncode-d string. @@ -393,14 +372,14 @@ TF_Tensor* TF_TensorFromTensor(const tensorflow::Tensor& src, } Status TF_TensorToTensor(const TF_Tensor* src, Tensor* dst) { - if (src->dtype == TF_RESOURCE) { - if (src->shape.dims() != 0) { + if (src->tensor.dtype() == DT_RESOURCE) { + if (src->tensor.dims() != 0) { return InvalidArgument( "Malformed TF_RESOURCE tensor: expected a scalar, got a tensor with " "shape ", - src->shape.DebugString()); + src->tensor.shape().DebugString()); } - *dst = Tensor(tensorflow::DT_RESOURCE, src->shape); + *dst = Tensor(tensorflow::DT_RESOURCE, src->tensor.shape()); if (!dst->scalar()().ParseFromString( string(static_cast(TF_TensorData(src)), TF_TensorByteSize(src)))) { @@ -409,14 +388,13 @@ Status TF_TensorToTensor(const TF_Tensor* src, Tensor* dst) { } return Status::OK(); } - if (src->dtype != TF_STRING) { - *dst = - tensorflow::TensorCApi::MakeTensor(src->dtype, src->shape, src->buffer); + if (src->tensor.dtype() != DT_STRING) { + *dst = src->tensor; return Status::OK(); } // TF_STRING tensors require copying since Tensor class expects a sequence of // string objects. - const tensorflow::int64 num_elements = src->shape.num_elements(); + const tensorflow::int64 num_elements = src->tensor.NumElements(); const char* input = reinterpret_cast(TF_TensorData(src)); const size_t src_size = TF_TensorByteSize(src); if (static_cast(src_size / sizeof(tensorflow::uint64)) < @@ -427,8 +405,8 @@ Status TF_TensorToTensor(const TF_Tensor* src, Tensor* dst) { const char* data_start = input + sizeof(tensorflow::uint64) * num_elements; const char* limit = input + src_size; - *dst = Tensor(static_cast(src->dtype), src->shape); - auto dstarray = dst->flat(); + *dst = Tensor(src->tensor.dtype(), src->tensor.shape()); + auto dstarray = dst->flat(); for (tensorflow::int64 i = 0; i < num_elements; ++i) { tensorflow::uint64 offset = reinterpret_cast(input)[i]; @@ -447,3 +425,7 @@ Status TF_TensorToTensor(const TF_Tensor* src, Tensor* dst) { } } // namespace tensorflow + +bool TF_TensorIsAligned(const TF_Tensor* tensor) { + return tensor->tensor.IsAligned(); +} diff --git a/tensorflow/c/tf_tensor.h b/tensorflow/c/tf_tensor.h index 5d4f70c1b6b..462fdc8b497 100644 --- a/tensorflow/c/tf_tensor.h +++ b/tensorflow/c/tf_tensor.h @@ -16,6 +16,7 @@ limitations under the License. #ifndef TENSORFLOW_C_TF_TENSOR_H_ #define TENSORFLOW_C_TF_TENSOR_H_ +#include #include #include "tensorflow/c/tf_datatype.h" @@ -175,6 +176,9 @@ TF_CAPI_EXPORT extern size_t TF_StringDecode(const char* src, size_t src_len, // TF_STRING tensor. TF_CAPI_EXPORT extern size_t TF_StringEncodedSize(size_t len); +// Returns bool iff this tensor is aligned. +TF_CAPI_EXPORT extern bool TF_TensorIsAligned(const TF_Tensor*); + #ifdef __cplusplus } /* end extern "C" */ #endif diff --git a/tensorflow/c/tf_tensor_internal.h b/tensorflow/c/tf_tensor_internal.h index 6def66c9412..ea7d49b5966 100644 --- a/tensorflow/c/tf_tensor_internal.h +++ b/tensorflow/c/tf_tensor_internal.h @@ -23,13 +23,12 @@ limitations under the License. // Internal structures used by the C API. 
These are likely to change and should // not be depended on. -struct TF_Tensor { - ~TF_Tensor(); - - TF_DataType dtype; - tensorflow::TensorShape shape; - tensorflow::TensorBuffer* buffer; -}; +// This struct forms part of the C API's public interface. It must strictly be +// passed to or returned from C functions *by pointer*. Otherwise, changes to +// its internal structure will break the C API's binary interface. +typedef struct TF_Tensor { + ::tensorflow::Tensor tensor; +} TF_Tensor; namespace tensorflow { @@ -42,5 +41,13 @@ class TensorCApi { } }; +// Allocates tensor data buffer using specified allocator. +// `operation` is a name for this operation. +void* allocate_tensor(const char* operation, size_t len, Allocator* allocator); + +// Deallocates tensor data buffer. +// Defaults to deallocating using CPU allocator. You can pass pointer to +// a different Allocator as `arg`. +void deallocate_buffer(void* data, size_t len, void* arg); } // namespace tensorflow #endif // TENSORFLOW_C_TF_TENSOR_INTERNAL_H_ diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD index 07de89f997e..40b182c8acf 100644 --- a/tensorflow/cc/BUILD +++ b/tensorflow/cc/BUILD @@ -649,7 +649,6 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:op_gen_lib", - "//tensorflow/core:proto_text", "//tensorflow/core:protos_all_cc", "@com_google_absl//absl/strings", ], @@ -667,7 +666,6 @@ tf_cc_test( "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:op_gen_lib", - "//tensorflow/core:proto_text", "//tensorflow/core:protos_all_cc", "//tensorflow/core:test", "//tensorflow/core:test_main", diff --git a/tensorflow/cc/framework/cc_op_gen.cc b/tensorflow/cc/framework/cc_op_gen.cc index a0353bf17a6..919e2dfc638 100644 --- a/tensorflow/cc/framework/cc_op_gen.cc +++ b/tensorflow/cc/framework/cc_op_gen.cc @@ -27,7 +27,7 @@ limitations under the License. #include "tensorflow/core/framework/op_gen_lib.h" #include "tensorflow/core/framework/tensor.pb.h" #include "tensorflow/core/framework/tensor_shape.pb.h" -#include "tensorflow/core/framework/types.pb_text.h" +#include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/lib/gtl/map_util.h" #include "tensorflow/core/lib/gtl/stl_util.h" #include "tensorflow/core/lib/hash/hash.h" @@ -193,12 +193,12 @@ string PrintTensor(const TensorProto& tensor_proto) { string ret; for (int64 i = 0; i < num_elts; ++i) { if (i > 0) strings::StrAppend(&ret, " "); - strings::StrAppend(&ret, absl::CEscape(t.flat()(i))); + strings::StrAppend(&ret, absl::CEscape(t.flat()(i))); } return ret; } default: { - LOG(FATAL) << "Not handling type " << EnumName_DataType(t.dtype()); + LOG(FATAL) << "Not handling type " << DataType_Name(t.dtype()); return string(); } } @@ -223,7 +223,7 @@ string PrintAttrValue(const string& op, const AttrValue& attr_value) { case AttrValue::kB: return attr_value.b() ? 
"true" : "false"; case AttrValue::kType: - return EnumName_DataType(attr_value.type()); + return DataType_Name(attr_value.type()); case AttrValue::kShape: return PrintTensorShape(attr_value.shape()); case AttrValue::kTensor: @@ -254,8 +254,7 @@ string PrintAttrValue(const string& op, const AttrValue& attr_value) { } else if (attr_value.list().type_size() > 0) { for (int i = 0; i < attr_value.list().type_size(); ++i) { if (i > 0) strings::StrAppend(&ret, ", "); - strings::StrAppend(&ret, - EnumName_DataType(attr_value.list().type(i))); + strings::StrAppend(&ret, DataType_Name(attr_value.list().type(i))); } } else if (attr_value.list().shape_size() > 0) { for (int i = 0; i < attr_value.list().shape_size(); ++i) { diff --git a/tensorflow/cc/framework/cc_ops_test.cc b/tensorflow/cc/framework/cc_ops_test.cc index ac05e3cf95b..178b4da972a 100644 --- a/tensorflow/cc/framework/cc_ops_test.cc +++ b/tensorflow/cc/framework/cc_ops_test.cc @@ -200,10 +200,10 @@ TEST(CCOpTest, TemplatedConst) { test::ExpectTensorEqual( out, test::AsTensor({3.f, 2.f, -1.f, 0.f}, {2, 2})); - auto c2 = ops::Const(root, {{"this"}, {"is"}, {"a"}, {"constant"}}); + auto c2 = ops::Const(root, {{"this"}, {"is"}, {"a"}, {"constant"}}); test::GetTensor(root, c2, &out); - test::ExpectTensorEqual( - out, test::AsTensor({"this", "is", "a", "constant"}, {4, 1})); + test::ExpectTensorEqual( + out, test::AsTensor({"this", "is", "a", "constant"}, {4, 1})); } TEST(CCOpTest, EmptyConst) { diff --git a/tensorflow/cc/framework/ops.cc b/tensorflow/cc/framework/ops.cc index 920a8e79556..8516dfd7a29 100644 --- a/tensorflow/cc/framework/ops.cc +++ b/tensorflow/cc/framework/ops.cc @@ -97,7 +97,7 @@ Input::Initializer::Initializer( Tensor elem = e.tensor; if (first.tensor.dtype() == DT_STRING) { for (int i = 0; i < elem.NumElements(); ++i) { - t.flat()(offset + i) = elem.flat()(i); + t.flat()(offset + i) = elem.flat()(i); } offset += elem.NumElements(); } else { diff --git a/tensorflow/cc/framework/ops.h b/tensorflow/cc/framework/ops.h index 0717e7dd4b3..1414e861002 100644 --- a/tensorflow/cc/framework/ops.h +++ b/tensorflow/cc/framework/ops.h @@ -111,7 +111,7 @@ class Input { Initializer(const T& v) { // NOLINT(runtime/explicit) typedef typename RealType::type RealT; Tensor t(DataTypeToEnum::v(), TensorShape()); - t.flat()(0) = RealT(v); + t.flat()(0) = RealT(v); tensor = t; } @@ -125,7 +125,7 @@ class Input { typedef typename RealType::type RealT; Tensor t(DataTypeToEnum::v(), shape); for (int64 i = 0; i < t.NumElements(); ++i) { - t.flat()(i) = RealT(v); + t.flat()(i) = RealT(v); } tensor = t; } @@ -170,7 +170,7 @@ class Input { // START_SKIP_DOXYGEN template ::value> struct RealType { - typedef string type; + typedef tstring type; }; template diff --git a/tensorflow/cc/framework/scope.cc b/tensorflow/cc/framework/scope.cc index e93ca8633e6..b5cac5fec28 100644 --- a/tensorflow/cc/framework/scope.cc +++ b/tensorflow/cc/framework/scope.cc @@ -272,7 +272,7 @@ std::unordered_set Scope::Impl::GetColocationConstraints( std::unordered_set current_constraints(colocation_constraints_); const AttrSlice attrs = colocate_with_op.node()->attrs(); std::vector node_constraints; - if (GetNodeAttr(attrs, kColocationAttrName, &node_constraints).ok()) { + if (TryGetNodeAttr(attrs, kColocationAttrName, &node_constraints)) { for (const string& entry : node_constraints) { StringPiece s(entry); if (absl::ConsumePrefix(&s, kColocationGroupPrefix)) { @@ -299,7 +299,7 @@ const std::vector& Scope::control_deps() const { return impl()->control_deps_; } -void 
Scope::UpdateStatus(const Status s) const { +void Scope::UpdateStatus(const Status& s) const { impl()->status_->Update(s); if (impl()->exit_on_error_ && !ok()) { LOG(FATAL) << *impl()->status_; @@ -318,7 +318,7 @@ Status Scope::ToGraph(Graph* g, GraphConstructorOptions opts) const { if (ok()) { GraphDef graph_def; graph()->ToGraphDef(&graph_def); - UpdateStatus(ConvertGraphDefToGraph(opts, graph_def, g)); + UpdateStatus(ConvertGraphDefToGraph(opts, std::move(graph_def), g)); } return *impl()->status_; } diff --git a/tensorflow/cc/framework/scope.h b/tensorflow/cc/framework/scope.h index ef2daff1357..63a555b7217 100644 --- a/tensorflow/cc/framework/scope.h +++ b/tensorflow/cc/framework/scope.h @@ -177,7 +177,7 @@ class Scope { /// Note: The status object is shared between all children of this scope. /// If the resulting status is not Status::OK() and exit_on_error_ is set on /// this scope, this function exits by calling LOG(FATAL). - void UpdateStatus(const Status s) const; + void UpdateStatus(const Status& s) const; // START_SKIP_DOXYGEN diff --git a/tensorflow/cc/ops/const_op_test.cc b/tensorflow/cc/ops/const_op_test.cc index 69b5d7fd47c..345cd23b9ec 100644 --- a/tensorflow/cc/ops/const_op_test.cc +++ b/tensorflow/cc/ops/const_op_test.cc @@ -97,7 +97,7 @@ TEST(ConstOpTest, WithExplicitShape) { auto d = ops::Const(root, {"1", "2", "3", "4", "5", "6"}, {2, 3}); TF_CHECK_OK(root.status()); EXPECT_EQ(d.op().output_type(0), DT_STRING); - ExpectNodeEqual(d.node(), {"1", "2", "3", "4", "5", "6"}, {2, 3}); + ExpectNodeEqual(d.node(), {"1", "2", "3", "4", "5", "6"}, {2, 3}); } TEST(ConstOpTest, FromProto) { @@ -144,7 +144,7 @@ TEST(ConstOpTest, TemplatedConst) { auto c1 = ops::Const(root, {1, 2}); ExpectTypeAndShape(c1.node(), DT_INT32, {2}); - auto c2 = ops::Const(root, {{"this"}, {"is"}, {"a"}, {"constant"}}); + auto c2 = ops::Const(root, {{"this"}, {"is"}, {"a"}, {"constant"}}); ExpectTypeAndShape(c2.node(), DT_STRING, {4, 1}); } diff --git a/tensorflow/cc/profiler/BUILD b/tensorflow/cc/profiler/BUILD index d18a0bcab0c..5b4a105eb28 100644 --- a/tensorflow/cc/profiler/BUILD +++ b/tensorflow/cc/profiler/BUILD @@ -9,6 +9,7 @@ tf_cuda_cc_test( name = "profiler_test", srcs = ["profiler_test.cc"], tags = [ + "no_rocm", # stream level tracing not supported on ROCm "nogpu", # b/77649654 ], deps = [ diff --git a/tensorflow/cc/saved_model/BUILD b/tensorflow/cc/saved_model/BUILD index 01752b65f2f..39b84922d13 100644 --- a/tensorflow/cc/saved_model/BUILD +++ b/tensorflow/cc/saved_model/BUILD @@ -10,7 +10,7 @@ load( "tf_cc_test", ) load( - "//tensorflow/core:platform/default/build_config_root.bzl", + "//tensorflow/core/platform:default/build_config_root.bzl", "if_static", "if_static_and_not_mobile", ) diff --git a/tensorflow/cc/saved_model/loader.cc b/tensorflow/cc/saved_model/loader.cc index dfc7ccd9542..a3b80fbdba5 100644 --- a/tensorflow/cc/saved_model/loader.cc +++ b/tensorflow/cc/saved_model/loader.cc @@ -75,7 +75,7 @@ Status LoadMetaGraphIntoSession(const MetaGraphDef& meta_graph_def, Tensor CreateStringTensor(const string& value) { Tensor tensor(DT_STRING, TensorShape({})); - tensor.scalar()() = value; + tensor.scalar()() = value; return tensor; } @@ -219,7 +219,7 @@ Status RunRestore(const RunOptions& run_options, const string& export_dir, // Add variables to the graph. 
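A recurring change in the hunks around here is the switch of tensor element accessors from std::string to tensorflow::tstring. A minimal sketch of the new pattern, assuming only the standard Tensor API; the helper names are illustrative and not part of the diff.

#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/types.h"

namespace tensorflow {
// Sketch only: DT_STRING elements are now read and written as tstring.
Tensor MakeScalarStringTensor(const string& value) {
  Tensor t(DT_STRING, TensorShape({}));
  t.scalar<tstring>()() = value;  // write through the tstring accessor
  return t;
}

string FirstElement(const Tensor& t) {
  return string(t.flat<tstring>()(0));  // read back and convert to string
}
}  // namespace tensorflow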
Tensor variables_path_tensor(DT_STRING, TensorShape({})); - variables_path_tensor.scalar()() = variables_path; + variables_path_tensor.scalar()() = variables_path; std::vector> inputs = { {string(variable_filename_const_op_name), variables_path_tensor}}; diff --git a/tensorflow/cc/saved_model/loader_test.cc b/tensorflow/cc/saved_model/loader_test.cc index 422994ba07c..aa2031d17d2 100644 --- a/tensorflow/cc/saved_model/loader_test.cc +++ b/tensorflow/cc/saved_model/loader_test.cc @@ -63,8 +63,8 @@ class LoaderTest : public ::testing::Test { bundle.session->Run({}, {"filename_tensor:0"}, {}, &path_outputs)); ASSERT_EQ(1, path_outputs.size()); - test::ExpectTensorEqual( - test::AsTensor({"foo.txt"}, TensorShape({})), path_outputs[0]); + test::ExpectTensorEqual( + test::AsTensor({"foo.txt"}, TensorShape({})), path_outputs[0]); } void CheckSavedModelBundle(const string& export_dir, @@ -78,14 +78,14 @@ class LoaderTest : public ::testing::Test { const string output_name = signature_def.outputs().at(kRegressOutputs).name(); - std::vector serialized_examples; + std::vector serialized_examples; for (float x : {0, 1, 2, 3}) { serialized_examples.push_back(MakeSerializedExample(x)); } // Validate the half plus two behavior. Tensor input = - test::AsTensor(serialized_examples, TensorShape({4})); + test::AsTensor(serialized_examples, TensorShape({4})); std::vector outputs; TF_ASSERT_OK(bundle.session->Run({{input_name, input}}, {output_name}, {}, &outputs)); diff --git a/tensorflow/cc/saved_model/python/BUILD b/tensorflow/cc/saved_model/python/BUILD index fca45c869fd..b1440655c72 100644 --- a/tensorflow/cc/saved_model/python/BUILD +++ b/tensorflow/cc/saved_model/python/BUILD @@ -1,7 +1,7 @@ # Description: # CLIF wrappers for TensorFlow SavedModels. -load("//tensorflow/core:platform/default/build_config.bzl", "tf_py_clif_cc") +load("//tensorflow/core/platform:default/build_config.bzl", "tf_py_clif_cc") package( default_visibility = ["//visibility:public"], diff --git a/tensorflow/cc/saved_model/reader.cc b/tensorflow/cc/saved_model/reader.cc index 799856f7fd4..d6d99229372 100644 --- a/tensorflow/cc/saved_model/reader.cc +++ b/tensorflow/cc/saved_model/reader.cc @@ -48,12 +48,12 @@ Status ReadSavedModel(const string& export_dir, SavedModel* saved_model_proto) { export_dir); } -Status FindMetaGraphDef(const SavedModel& saved_model_proto, - const std::unordered_set& tags, +Status FindMetaGraphDef(const std::unordered_set& tags, + SavedModel* saved_model_proto, MetaGraphDef* meta_graph_def) { LOG(INFO) << "Reading meta graph with tags { " << absl::StrJoin(tags, " ") << " }"; - for (const MetaGraphDef& graph_def : saved_model_proto.meta_graphs()) { + for (MetaGraphDef& graph_def : *saved_model_proto->mutable_meta_graphs()) { // Get tags from the graph_def. std::unordered_set graph_tags; for (const string& tag : graph_def.meta_info_def().tags()) { @@ -61,7 +61,7 @@ Status FindMetaGraphDef(const SavedModel& saved_model_proto, } // Match with the set of tags provided. 
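The reworked FindMetaGraphDef above now receives the SavedModel proto through a mutable pointer so the matching MetaGraphDef can be moved out rather than deep-copied, which matters for large graphs. Callers of the public API are unaffected; a hedged sketch (the wrapper function here is illustrative, not from the diff):

#include "tensorflow/cc/saved_model/reader.h"
#include "tensorflow/cc/saved_model/tag_constants.h"

// Sketch only (not part of the diff): load the MetaGraphDef tagged "serve".
tensorflow::Status LoadServeMetaGraph(const std::string& export_dir,
                                      tensorflow::MetaGraphDef* meta_graph_def) {
  return tensorflow::ReadMetaGraphDefFromSavedModel(
      export_dir, {tensorflow::kSavedModelTagServe}, meta_graph_def);
}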
if (graph_tags == tags) { - *meta_graph_def = graph_def; + *meta_graph_def = std::move(graph_def); return Status::OK(); } } @@ -81,7 +81,8 @@ Status ReadMetaGraphDefFromSavedModel(const string& export_dir, MetaGraphDef* const meta_graph_def) { SavedModel saved_model_proto; TF_RETURN_IF_ERROR(ReadSavedModel(export_dir, &saved_model_proto)); - TF_RETURN_IF_ERROR(FindMetaGraphDef(saved_model_proto, tags, meta_graph_def)); + TF_RETURN_IF_ERROR( + FindMetaGraphDef(tags, &saved_model_proto, meta_graph_def)); return Status::OK(); } diff --git a/tensorflow/cc/tools/freeze_saved_model.cc b/tensorflow/cc/tools/freeze_saved_model.cc index eeb91017890..0ec48ec9357 100644 --- a/tensorflow/cc/tools/freeze_saved_model.cc +++ b/tensorflow/cc/tools/freeze_saved_model.cc @@ -42,6 +42,10 @@ void GetTensorNamesFromTensorInfo(const TensorInfo& tensor_info, tensor_names->insert(coo_sparse.values_tensor_name()); tensor_names->insert(coo_sparse.indices_tensor_name()); tensor_names->insert(coo_sparse.dense_shape_tensor_name()); + } else if (tensor_info.has_composite_tensor()) { + for (const auto& component : tensor_info.composite_tensor().components()) { + tensor_names->insert(component.name()); + } } else { tensor_names->insert(tensor_info.name()); } diff --git a/tensorflow/cc/tools/freeze_saved_model_test.cc b/tensorflow/cc/tools/freeze_saved_model_test.cc index 979b23c3fc5..274a1630a05 100644 --- a/tensorflow/cc/tools/freeze_saved_model_test.cc +++ b/tensorflow/cc/tools/freeze_saved_model_test.cc @@ -425,5 +425,63 @@ TEST_F(FreezeTest, GraphDefWithAndWithoutDependentResourceVariables) { TestFreezeGraphWithAndWithoutDependentVariables(true); } +TEST_F(FreezeTest, InputsAndOutputsCompositeTensorSignatureDef) { + // Test that inputs and outputs get correctly populated for a + // SignatureDef containing composite tensor inputs and outputs. + SavedModelBundle saved_model_bundle; + SignatureDef signature_def; + + TensorInfo& in = (*signature_def.mutable_inputs())["input_arg"]; + in.mutable_composite_tensor()->add_components()->set_name("input1:0"); + in.mutable_composite_tensor()->add_components()->set_name("input2:0"); + + TensorInfo& out = (*signature_def.mutable_outputs())["output_arg"]; + out.mutable_composite_tensor()->add_components()->set_name("output2:0"); + out.mutable_composite_tensor()->add_components()->set_name("output1:0"); + + AddSignatureDefToSavedModelBundle(signature_def, "signature_def", + &saved_model_bundle); + GraphDef frozen_graph_def; + std::unordered_set inputs; + std::unordered_set outputs; + TF_ASSERT_OK(FreezeSavedModel(saved_model_bundle, &frozen_graph_def, &inputs, + &outputs)); + std::unordered_set expected_inputs = {"input1:0", "input2:0"}; + std::unordered_set expected_outputs = {"output1:0", "output2:0"}; + EXPECT_EQ(expected_inputs, inputs); + EXPECT_EQ(expected_outputs, outputs); +} + +TEST_F(FreezeTest, InputsAndOutputsSparseCooSignatureDef) { + // Test that inputs and outputs get correctly populated for a + // SignatureDef containing composite tensor inputs and outputs. 
+ SavedModelBundle saved_model_bundle; + SignatureDef signature_def; + + TensorInfo& in = (*signature_def.mutable_inputs())["input_arg"]; + in.mutable_coo_sparse()->set_values_tensor_name("input1:0"); + in.mutable_coo_sparse()->set_indices_tensor_name("input2:0"); + in.mutable_coo_sparse()->set_dense_shape_tensor_name("input3:0"); + + TensorInfo& out = (*signature_def.mutable_outputs())["output_arg"]; + out.mutable_coo_sparse()->set_values_tensor_name("output1:0"); + out.mutable_coo_sparse()->set_indices_tensor_name("output2:0"); + out.mutable_coo_sparse()->set_dense_shape_tensor_name("output3:0"); + + AddSignatureDefToSavedModelBundle(signature_def, "signature_def", + &saved_model_bundle); + GraphDef frozen_graph_def; + std::unordered_set inputs; + std::unordered_set outputs; + TF_ASSERT_OK(FreezeSavedModel(saved_model_bundle, &frozen_graph_def, &inputs, + &outputs)); + std::unordered_set expected_inputs = {"input1:0", "input2:0", + "input3:0"}; + std::unordered_set expected_outputs = {"output1:0", "output2:0", + "output3:0"}; + EXPECT_EQ(expected_inputs, inputs); + EXPECT_EQ(expected_outputs, outputs); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD index 88b00cb2eea..bff56bdda89 100644 --- a/tensorflow/compiler/jit/BUILD +++ b/tensorflow/compiler/jit/BUILD @@ -1,7 +1,7 @@ load("//tensorflow:tensorflow.bzl", "tf_cc_test", "cc_header_only_library") load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda") load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library", "tf_jit_compilation_passes_extra_deps") -load("//tensorflow/core:platform/default/build_config.bzl", "tf_additional_all_protos", "tf_proto_library") +load("//tensorflow/core/platform:default/build_config.bzl", "tf_additional_all_protos", "tf_proto_library") package( default_visibility = [ @@ -144,8 +144,57 @@ cc_library( ], ) +XLA_DEVICE_DEPS = [ + ":common", + ":xla_launch_util", + ":xla_tensor", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/synchronization", + "@com_google_absl//absl/types:optional", + "//tensorflow/compiler/jit/ops:xla_ops", + "//tensorflow/compiler/tf2xla:common", + "//tensorflow/compiler/tf2xla:tf2xla_util", + "//tensorflow/compiler/tf2xla:xla_compiler", + "//tensorflow/compiler/tf2xla/kernels:xla_ops", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla/client:client_library", + "//tensorflow/compiler/xla/client:global_data", + "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/service:stream_pool", + "//tensorflow/core:array_ops_op_lib", + "//tensorflow/core:control_flow_ops_op_lib", + "//tensorflow/core:core_cpu", + "//tensorflow/core:core_cpu_internal", + "//tensorflow/core:dataset_ops_op_lib", + "//tensorflow/core:framework", + "//tensorflow/core:functional_ops_op_lib", + "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", + "//tensorflow/core:math_ops_op_lib", + "//tensorflow/core:nn_ops_op_lib", + "//tensorflow/core:no_op_op_lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:resource_variable_ops_op_lib", + "//tensorflow/core:sendrecv_ops_op_lib", + "//tensorflow/core:state_ops_op_lib", + "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/kernels:constant_op", + "//tensorflow/core/kernels:fifo_queue", + "//tensorflow/core/kernels:function_ops", + "//tensorflow/core/kernels:identity_op", + "//tensorflow/core/kernels:resource_variable_ops", + "//tensorflow/core/kernels:shape_ops", + "//tensorflow/core/kernels:variable_ops", + 
"//tensorflow/core/kernels/data:generator_dataset_op", + "//tensorflow/core/kernels/data:iterator_ops", + "//tensorflow/core/kernels/data:optional_ops", + "//tensorflow/core/kernels/data:prefetch_dataset_op", + "//tensorflow/core/profiler/lib:traceme", + "//tensorflow/stream_executor/platform", +] + cc_library( - name = "xla_device", + name = "xla_device_no_jit_rewrite_registration", srcs = [ "xla_compile_on_demand_op.cc", "xla_device.cc", @@ -158,56 +207,22 @@ cc_library( "xla_device_context.h", "xla_device_ops.h", ], + deps = XLA_DEVICE_DEPS, +) + +cc_library( + name = "xla_device", + hdrs = [ + "xla_compile_on_demand_op.h", + "xla_device.h", + "xla_device_context.h", + "xla_device_ops.h", + ], # Public visibility is needed for external TF/XLA backends. visibility = ["//visibility:public"], - deps = [ - ":common", + deps = XLA_DEVICE_DEPS + [ ":jit_compilation_passes", - ":xla_launch_util", - ":xla_tensor", - "//tensorflow/compiler/jit/ops:xla_ops", - "//tensorflow/compiler/tf2xla:common", - "//tensorflow/compiler/tf2xla:tf2xla_util", - "//tensorflow/compiler/tf2xla:xla_compiler", - "//tensorflow/compiler/tf2xla/kernels:xla_ops", - "//tensorflow/compiler/xla:util", - "//tensorflow/compiler/xla/client:client_library", - "//tensorflow/compiler/xla/client:global_data", - "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/service:stream_pool", - "//tensorflow/core:array_ops_op_lib", - "//tensorflow/core:control_flow_ops_op_lib", - "//tensorflow/core:core_cpu", - "//tensorflow/core:core_cpu_internal", - "//tensorflow/core:dataset_ops_op_lib", - "//tensorflow/core:framework", - "//tensorflow/core:functional_ops_op_lib", - "//tensorflow/core:lib", - "//tensorflow/core:lib_internal", - "//tensorflow/core:math_ops_op_lib", - "//tensorflow/core:nn_ops_op_lib", - "//tensorflow/core:no_op_op_lib", - "//tensorflow/core:protos_all_cc", - "//tensorflow/core:resource_variable_ops_op_lib", - "//tensorflow/core:sendrecv_ops_op_lib", - "//tensorflow/core:state_ops_op_lib", - "//tensorflow/core:stream_executor_no_cuda", - "//tensorflow/core/kernels:constant_op", - "//tensorflow/core/kernels:fifo_queue", - "//tensorflow/core/kernels:function_ops", - "//tensorflow/core/kernels:identity_op", - "//tensorflow/core/kernels:resource_variable_ops", - "//tensorflow/core/kernels:shape_ops", - "//tensorflow/core/kernels:variable_ops", - "//tensorflow/core/kernels/data:generator_dataset_op", - "//tensorflow/core/kernels/data:iterator_ops", - "//tensorflow/core/kernels/data:optional_ops", - "//tensorflow/core/kernels/data:prefetch_dataset_op", - "//tensorflow/core/profiler/lib:traceme", - "//tensorflow/stream_executor/platform", - "@com_google_absl//absl/memory", - "@com_google_absl//absl/synchronization", - "@com_google_absl//absl/types:optional", + ":xla_device_no_jit_rewrite_registration", ], ) @@ -281,6 +296,7 @@ cc_library( hdrs = ["xla_compilation_cache.h"], deps = [ ":xla_activity_listener", + ":xla_activity_proto_cc", "//tensorflow/compiler/tf2xla:common", "//tensorflow/compiler/tf2xla:xla_compiler", "//tensorflow/compiler/xla:statusor", @@ -292,6 +308,8 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:protos_all_cc", + "//tensorflow/core/platform:logging", + "@com_google_absl//absl/base", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/strings", "@com_google_absl//absl/types:optional", @@ -324,17 +342,21 @@ cc_library( alwayslink = 1, ) +# Linked by tensorflow core, without registration of jit compilation passes +# 
which is not necessary to create and run a XlaLocalLaunchBase kernel. +# Linking jit compilation passes could cause programs stuck right now (b/140069592). cc_library( - name = "xla_kernel_creator", + name = "xla_kernel_creator_util", srcs = [ - "xla_kernel_creator.cc", - "xla_kernel_creator.h", + "xla_kernel_creator_util.cc", ], + hdrs = ["xla_kernel_creator_util.h"], + visibility = ["//tensorflow/core/common_runtime/eager:__pkg__"], deps = [ ":common", ":compilability_check_util", ":compilation_passes", - "//tensorflow/compiler/jit/kernels:xla_ops", + "//tensorflow/compiler/jit/kernels:xla_ops_no_jit_rewrite_registration", "//tensorflow/compiler/tf2xla:xla_compiler", "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", @@ -347,6 +369,23 @@ cc_library( alwayslink = 1, ) +cc_library( + name = "xla_kernel_creator", + srcs = [ + "xla_kernel_creator.cc", + "xla_kernel_creator.h", + ], + deps = [ + ":jit_compilation_passes", + ":xla_kernel_creator_util", + "//tensorflow/core:core_cpu_internal", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + ], + alwayslink = 1, +) + tf_cc_test( name = "xla_kernel_creator_test", srcs = [ @@ -498,6 +537,7 @@ cc_library( srcs = [ "build_xla_ops_pass.cc", "clone_constants_for_better_clustering.cc", + "cluster_scoping_pass.cc", "deadness_analysis.cc", "deadness_analysis_internal.h", "encapsulate_subgraphs_pass.cc", @@ -513,6 +553,7 @@ cc_library( hdrs = [ "build_xla_ops_pass.h", "clone_constants_for_better_clustering.h", + "cluster_scoping_pass.h", "deadness_analysis.h", "encapsulate_subgraphs_pass.h", "encapsulate_xla_computations_pass.h", @@ -677,6 +718,7 @@ tf_cc_test( srcs = [ "build_xla_ops_pass_test.cc", "clone_constants_for_better_clustering_test.cc", + "cluster_scoping_pass_test.cc", "encapsulate_subgraphs_pass_test.cc", "encapsulate_xla_computations_pass_test.cc", "extract_outside_compilation_pass_test.cc", @@ -800,6 +842,8 @@ cc_library( ":flags", ":resource_operation_safety_analysis", ":union_find", + ":xla_activity_listener", + ":xla_activity_proto_cc", ":xla_cluster_util", "//tensorflow/compiler/jit/graphcycles", "//tensorflow/compiler/tf2xla:resource_operation_table", @@ -837,6 +881,7 @@ tf_cc_test( "//tensorflow/core:core_cpu", "//tensorflow/core:framework", "//tensorflow/core:ops", + "//tensorflow/core:protos_all_proto_cc", "//tensorflow/core:test", "//tensorflow/core:test_main", "@com_google_absl//absl/memory", @@ -901,6 +946,7 @@ cc_library( srcs = ["xla_activity_logging_listener.cc"], deps = [ ":xla_activity_listener", + ":xla_activity_proto_cc", "//tensorflow/core:logger", "@com_google_absl//absl/memory", ], diff --git a/tensorflow/compiler/jit/build_xla_ops_pass.cc b/tensorflow/compiler/jit/build_xla_ops_pass.cc index 1265ff9138a..61695d532d1 100644 --- a/tensorflow/compiler/jit/build_xla_ops_pass.cc +++ b/tensorflow/compiler/jit/build_xla_ops_pass.cc @@ -48,6 +48,19 @@ limitations under the License. namespace tensorflow { namespace { +struct DebuggingOpts { + // If true, insert Print nodes to print every output from an XLA cluster. + bool print_outputs; + + // If true, insert CheckNumerics nodes for every floating point typed input to + // an XLA cluster. + bool check_input_numerics; + + // If true, insert CheckNumerics nodes for every floating point typed output + // from an XLA cluster. 
+ bool check_output_numerics; +}; + void MoveOutgoingEdges(Graph* g, Node* old_node, Node* new_node) { std::vector out_edges(old_node->out_edges().begin(), old_node->out_edges().end()); @@ -78,7 +91,8 @@ Operation DataToControl(const Scope& scope, Output data) { // Replaces each outgoing edge from `old_node` with a merge node that merges in // the corresponding output from `new_node`. void MergeOutgoingDataEdges(const Scope& s, Node* old_node, Node* new_node, - bool insert_print_nodes) { + absl::string_view cluster_name, + const DebuggingOpts& debugging_opts) { if (!s.status().ok()) { return; } @@ -93,23 +107,36 @@ void MergeOutgoingDataEdges(const Scope& s, Node* old_node, Node* new_node, int oidx = e->src_output(); Output merged_output = merged_outputs[oidx]; if (merged_output.node() == nullptr) { - ops::Merge merge_op(s.WithOpName(absl::StrCat("merge_oidx_", oidx)), - {Output(old_node, oidx), Output(new_node, oidx)}); - if (insert_print_nodes) { + Output new_output(new_node, oidx); + if (debugging_opts.print_outputs) { string cpu_device = "/job:localhost/replica:0/task:0/device:CPU:0"; - ops::Print print_op(s.WithOpName(absl::StrCat("print_", oidx)) + ops::Print print_op(s.WithOpName("print_", oidx) .WithDevice(cpu_device) .WithAssignedDevice(cpu_device), - merge_op.output, {merge_op.output}, + new_output, {new_output}, ops::Print::Attrs{} .Message(absl::StrCat("output ", oidx, " from ", old_node->name(), " is ")) .FirstN(1000) .Summarize(-1)); - merged_output = merged_outputs[oidx] = print_op; - } else { - merged_output = merged_outputs[oidx] = merge_op.output; + new_output = print_op; } + + if (debugging_opts.check_output_numerics && + DataTypeIsFloating(new_output.type())) { + ops::CheckNumerics check_numerics_op( + s.WithOpName("check_output_", oidx) + .WithDevice(new_node->requested_device()) + .WithAssignedDevice(new_node->assigned_device_name()), + new_output, + absl::StrCat("CheckNumerics failed for output ", oidx, "(", + new_output.name(), ") from cluster ", cluster_name)); + new_output = check_numerics_op; + } + + ops::Merge merge_op(s.WithOpName("merge_oidx_", oidx), + {Output(old_node, oidx), new_output}); + merged_output = merged_outputs[oidx] = merge_op.output; } Node* dst = e->dst(); @@ -324,11 +351,34 @@ xla::StatusOr InferDeviceForCluster( return result; } +std::vector GetXlaRunArgs(const Scope& s, + const XlaClusterInfo& cluster_info, + const DebuggingOpts& debugging_opts) { + std::vector xla_run_args; + xla_run_args.reserve(cluster_info.non_constant_inputs.size() + + cluster_info.resource_inputs.size()); + int input_idx = 0; + for (const Output& o : cluster_info.non_constant_inputs) { + if (debugging_opts.check_input_numerics && DataTypeIsFloating(o.type())) { + ops::CheckNumerics check_numerics_op( + s.WithOpName("check_input_", input_idx), o, + absl::StrCat("CheckNumerics failed for input ", input_idx, "(", + o.name(), ") into ", cluster_info.function.name())); + xla_run_args.push_back(check_numerics_op); + } else { + xla_run_args.push_back(o); + } + input_idx++; + } + absl::c_copy(cluster_info.resource_inputs, std::back_inserter(xla_run_args)); + return xla_run_args; +} + Status ReplaceNodeWithXlaCompileAndXlaRun( jit::DeviceInfoCache* device_info_cache, const GraphOptimizationPassOptions& options, const FunctionLibraryDefinition& flib_def, bool lazy_compilation_enabled, - bool insert_print_nodes, Graph* g, Node* n) { + const DebuggingOpts& debugging_opts, Graph* g, Node* n) { XlaClusterInfo cluster_info; TF_RETURN_IF_ERROR(GetXlaClusterInfo(n, &cluster_info)); @@ 
-361,12 +411,12 @@ Status ReplaceNodeWithXlaCompileAndXlaRun( TF_RETURN_IF_ERROR( CopyIncomingControlEdges(g, /*from=*/n, /*to=*/xla_compile.key.node())); + std::vector xla_run_args = + GetXlaRunArgs(root, cluster_info, debugging_opts); + if (requires_compilation) { // "Strict" compilation: every _XlaCompile invocation must compile the // cluster. - std::vector xla_run_args = cluster_info.non_constant_inputs; - absl::c_copy(cluster_info.resource_inputs, - std::back_inserter(xla_run_args)); ops::_XlaRun xla_run(root.WithOpName("xla_run"), xla_run_args, xla_compile.key, n->output_types()); @@ -391,9 +441,6 @@ Status ReplaceNodeWithXlaCompileAndXlaRun( Output predicated_compilation_key = s.output_true; Output inverse_predicated_compilation_key = s.output_false; - std::vector xla_run_args = cluster_info.non_constant_inputs; - absl::c_copy(cluster_info.resource_inputs, - std::back_inserter(xla_run_args)); ops::_XlaRun xla_run(root.WithOpName("xla_run"), xla_run_args, predicated_compilation_key, n->output_types()); @@ -402,7 +449,7 @@ Status ReplaceNodeWithXlaCompileAndXlaRun( MergeOutgoingDataEdges(root, /*old_node=*/n, /*new_node=*/xla_run.operation.node(), - insert_print_nodes); + cluster_info.function.name(), debugging_opts); TF_RETURN_IF_ERROR(root.status()); @@ -443,15 +490,25 @@ Status BuildXlaOpsPass::Run(const GraphOptimizationPassOptions& options) { enable_lazy_compilation_ ? *enable_lazy_compilation_ : GetBuildXlaOpsPassFlags()->tf_xla_enable_lazy_compilation; - bool insert_print_nodes = - GetBuildXlaOpsPassFlags()->tf_xla_print_cluster_outputs; jit::DeviceInfoCache device_info_cache; + const BuildXlaOpsPassFlags& flags = *GetBuildXlaOpsPassFlags(); + + DebuggingOpts debugging_opts; + debugging_opts.print_outputs = flags.tf_xla_print_cluster_outputs; + debugging_opts.check_input_numerics = + flags.tf_xla_check_cluster_input_numerics; + debugging_opts.check_output_numerics = + flags.tf_xla_check_cluster_output_numerics; + + VLOG(1) << "print_outputs = " << debugging_opts.print_outputs; + VLOG(1) << "check_input_numerics = " << debugging_opts.check_input_numerics; + VLOG(1) << "check_output_numerics = " << debugging_opts.check_output_numerics; for (Node* n : xla_compiled_kernels) { TF_RETURN_IF_ERROR(ReplaceNodeWithXlaCompileAndXlaRun( &device_info_cache, options, *options.flib_def, - lazy_compilation_enabled, insert_print_nodes, graph, n)); + lazy_compilation_enabled, debugging_opts, graph, n)); } if (VLOG_IS_ON(1)) { diff --git a/tensorflow/compiler/jit/cluster_scoping_pass.cc b/tensorflow/compiler/jit/cluster_scoping_pass.cc new file mode 100644 index 00000000000..f4b9f93c616 --- /dev/null +++ b/tensorflow/compiler/jit/cluster_scoping_pass.cc @@ -0,0 +1,163 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/jit/cluster_scoping_pass.h" + +#include "absl/algorithm/container.h" +#include "absl/container/flat_hash_set.h" +#include "absl/strings/str_cat.h" +#include "tensorflow/compiler/jit/defs.h" +#include "tensorflow/compiler/jit/xla_cluster_util.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/graph/algorithm.h" + +namespace tensorflow { +namespace { + +class ClusterScopingPassImpl { + public: + ClusterScopingPassImpl(Graph* graph, + OptimizerOptions::GlobalJitLevel global_jit_level) + : graph_(graph), + global_jit_level_(global_jit_level), + unique_scope_id_(0) {} + + Status Run(); + + private: + Status ScopingForPipelineStages(); + + size_t GetUniqueScopeId() { return unique_scope_id_++; } + + void AddScopeToAllTransitivePredecessors(Node* start); + + void AddScopeToAllTransitiveSuccessors(Node* start); + + private: + Graph* graph_; + OptimizerOptions::GlobalJitLevel global_jit_level_; + size_t unique_scope_id_; +}; + +absl::optional GetXlaInternalScope(Node* node) { + string scope; + if (GetNodeAttr(node->attrs(), kXlaInternalScopeAttr, &scope).ok()) { + return scope; + } + + return absl::nullopt; +} + +void SetXlaInternalScope(Node* node, StringPiece scope) { + node->AddAttr(kXlaInternalScopeAttr, scope); +} + +// NB! We append a new scope as suffix to the _XlaInternalScope attribute +// instead of overriding the old value. In other words, appending scope B to +// scope A creates the conjunction of the scopes A and B (i.e, A & B) and, +// in effect, the node gets both the old and new scopes. As a unique scope +// disallows a node being merged with nodes in other scopes, the scope +// conjunction preserves the semantic of the old scope (i.e., the node still +// cannot be merged with the previously incompatible nodes.) +// +// For example, the below case should be rare in practice but can serve for the +// purpose of discussion. After adding scopes for both Stage and Unstage, +// Node_Y will receive both scopes "unstage" and "stage", while Node_X receives +// only scope "stage". The semantic of scope "unstage" is preserved although +// scope "stage" is later appended. As a result, Node_X and Node_Y will be put +// into different clusters. 
+// +// Unstage -> Node_Y (scope "unstage & stage") +// | +// V +// Node_X (scope "stage") -> Stage +// +void AddOrAppendXlaInternalScope(Node* node, absl::string_view suffix) { + string updated_scope; + absl::optional cur_scope = GetXlaInternalScope(node); + if (cur_scope == absl::nullopt) { + updated_scope = std::string(suffix); + } else { + updated_scope = absl::StrCat(cur_scope.value(), "&", suffix); + } + SetXlaInternalScope(node, updated_scope); +} + +void ClusterScopingPassImpl::AddScopeToAllTransitivePredecessors(Node* start) { + const string unique_suffix = absl::StrCat("_", GetUniqueScopeId()); + + std::vector starts; + starts.push_back(start); + auto enter = [&](Node* n) { AddOrAppendXlaInternalScope(n, unique_suffix); }; + ReverseDFSFrom(*graph_, starts, enter, /*leave=*/nullptr, + /*stable_comparator=*/NodeComparatorName()); +} + +void ClusterScopingPassImpl::AddScopeToAllTransitiveSuccessors(Node* start) { + const string unique_suffix = absl::StrCat("_", GetUniqueScopeId()); + + std::vector starts; + starts.push_back(start); + auto enter = [&](Node* n) { AddOrAppendXlaInternalScope(n, unique_suffix); }; + DFSFrom(*graph_, starts, enter, /*leave=*/nullptr, + /*stable_comparator=*/NodeComparatorName(), + // Do not filter any edges to better capture the semantics of + // transitive closure of successors. We may revisit this when + // we see more cases needing cluster scoping in the future. + /*edge_filter=*/nullptr); +} + +// This preserves the parallelism between pipeline stages. For example, below +// is a typical pattern of input pipelining in Tensorflow and this heuristic +// ensures Node_X and Node_Y are put into different clusters. Without the +// heuristic, they may be put into the same cluster and it can introduce +// artificial dependencies and incur great performance loss. In this example, +// Node_Y becomes dependent on IteratorGetNext and the latencies add up if +// Node_X and Node_Y are in the same cluster. +// +// IteratorGetNext -> Node_X -> Stage +// +// Unstage -> Node_Y +// +Status ClusterScopingPassImpl::ScopingForPipelineStages() { + for (Node* n : graph_->nodes()) { + DCHECK(n); + if (n->type_string() == "Unstage") { + AddScopeToAllTransitiveSuccessors(n); + } + if (n->type_string() == "Stage") { + AddScopeToAllTransitivePredecessors(n); + } + } + + return Status::OK(); +} + +Status ClusterScopingPassImpl::Run() { + if (global_jit_level_ == OptimizerOptions::OFF) { + return Status::OK(); + } + + return ScopingForPipelineStages(); +} +} // namespace + +Status ClusterScopingPass::Run(const GraphOptimizationPassOptions& options) { + Graph* graph = options.graph->get(); + + return ClusterScopingPassImpl{graph, GetGlobalJitLevelForGraph(options)} + .Run(); +} +} // namespace tensorflow diff --git a/tensorflow/compiler/jit/cluster_scoping_pass.h b/tensorflow/compiler/jit/cluster_scoping_pass.h new file mode 100644 index 00000000000..9651c3f878c --- /dev/null +++ b/tensorflow/compiler/jit/cluster_scoping_pass.h @@ -0,0 +1,38 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_JIT_CLUSTER_SCOPING_PASS_H_ +#define TENSORFLOW_COMPILER_JIT_CLUSTER_SCOPING_PASS_H_ + +#include "tensorflow/core/common_runtime/optimization_registry.h" + +namespace tensorflow { + +// This pass adds scopes to nodes via the _XlaInternalScope attribute to guide +// the later clustering passes. A major reason for doing this is to prevent +// clustering from destroying critical parallelism in the TensorFlow graph, +// which can cause severe performance degradation. +// +// This pass must be run before MarkForCompilationPass, as it records the +// scoping information that MarkForCompilationPass must respect when making its +// clustering decisions. +class ClusterScopingPass : public GraphOptimizationPass { + public: + Status Run(const GraphOptimizationPassOptions& options) override; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_CLUSTER_SCOPING_PASS_H_ diff --git a/tensorflow/compiler/jit/cluster_scoping_pass_test.cc b/tensorflow/compiler/jit/cluster_scoping_pass_test.cc new file mode 100644 index 00000000000..b3e63b8c298 --- /dev/null +++ b/tensorflow/compiler/jit/cluster_scoping_pass_test.cc @@ -0,0 +1,183 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/jit/cluster_scoping_pass.h" + +#include "absl/container/flat_hash_map.h" +#include "tensorflow/compiler/jit/defs.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/graph/algorithm.h" +#include "tensorflow/core/graph/graph_constructor.h" +#include "tensorflow/core/graph/graph_def_builder.h" +#include "tensorflow/core/graph/graph_def_builder_util.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/public/session_options.h" + +namespace tensorflow { +namespace { + +Status ClusterScoping(std::unique_ptr* graph) { + FixupSourceAndSinkEdges(graph->get()); + + GraphOptimizationPassOptions opt_options; + opt_options.graph = graph; + FunctionDefLibrary fdef_lib; + FunctionLibraryDefinition flib_def(OpRegistry::Global(), fdef_lib); + opt_options.flib_def = &flib_def; + SessionOptions session_options; + session_options.env = Env::Default(); + session_options.config.mutable_graph_options() + ->mutable_optimizer_options() + ->set_global_jit_level(OptimizerOptions::ON_2); + opt_options.session_options = &session_options; + + ClusterScopingPass pass; + return pass.Run(opt_options); +} + +absl::flat_hash_map GetXlaInternalScopes(const Graph& graph) { + absl::flat_hash_map scopes; + for (Node* node : graph.nodes()) { + string scope; + if (GetNodeAttr(node->attrs(), kXlaInternalScopeAttr, &scope).ok()) { + scopes[node->name()] = scope; + } + } + + if (VLOG_IS_ON(2)) { + VLOG(2) << "_XlaInternalScopes:"; + for (const auto& p : scopes) { + VLOG(2) << " " << p.first << " -> " << p.second; + } + } + return scopes; +} + +Node* BuildStageNode(GraphDefBuilder& builder, string name, + std::initializer_list dtypes, + absl::Span values) { + auto opts = builder.opts() + .WithName(std::move(name)) + .WithAttr("dtypes", std::move(dtypes)); + if (opts.HaveError()) { + return nullptr; + } + + NodeBuilder node_builder(name, "Stage", opts.op_registry()); + node_builder.Input(values); + return opts.FinalizeBuilder(&node_builder); +} + +TEST(XlaCompilationTest, StagePipelinePreserved) { + std::unique_ptr graph(new Graph(OpRegistry::Global())); + { + // Graph: + // b + // | + // v + // a -> add0 (ClusterX) -> relu0 (ClusterX) -> stage + // + // b + // | + // v + // unstage -> add1 (ClusterY) -> relu1 (ClusterY) + GraphDefBuilder builder(GraphDefBuilder::kFailImmediately); + Node* a = ops::SourceOp("Const", builder.opts() + .WithName("a") + .WithAttr("dtype", DT_FLOAT) + .WithAttr("value", Tensor())); + Node* b = ops::SourceOp("Const", builder.opts() + .WithName("b") + .WithAttr("dtype", DT_FLOAT) + .WithAttr("value", Tensor())); + Node* unstage = ops::SourceOp( + "Unstage", + builder.opts().WithName("unstage").WithAttr("dtypes", {DT_FLOAT})); + + Node* add0 = ops::BinaryOp("Add", a, b, builder.opts().WithName("add0")); + Node* add1 = + ops::BinaryOp("Add", unstage, b, builder.opts().WithName("add1")); + Node* relu0 = ops::UnaryOp("Relu", add0, builder.opts().WithName("relu0")); + ops::UnaryOp("Relu", add1, builder.opts().WithName("relu1")); + BuildStageNode(builder, "stage", {DT_FLOAT}, {relu0}); + + TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get())); + } + + TF_ASSERT_OK(ClusterScoping(&graph)); + + auto scopes = GetXlaInternalScopes(*graph); + EXPECT_NE(scopes["add0"], scopes["add1"]); + EXPECT_EQ(scopes["add0"], scopes["relu0"]); + 
EXPECT_EQ(scopes["add1"], scopes["relu1"]); +} + +TEST(XlaCompilationTest, StagePipelinePreservedAndInitialScopesRespected) { + std::unique_ptr graph(new Graph(OpRegistry::Global())); + { + // Graph: + // b + // | + // v + // a -> add0 (ClusterA) -> relu0 (ClusterB) -> stage + // + // b + // | + // v + // unstage -> add1 (ClusterC) -> relu1 (ClusterD) + GraphDefBuilder builder(GraphDefBuilder::kFailImmediately); + Node* a = ops::SourceOp("Const", builder.opts() + .WithName("a") + .WithAttr("dtype", DT_FLOAT) + .WithAttr("value", Tensor())); + Node* b = ops::SourceOp("Const", builder.opts() + .WithName("b") + .WithAttr("dtype", DT_FLOAT) + .WithAttr("value", Tensor())); + Node* unstage = ops::SourceOp( + "Unstage", + builder.opts().WithName("unstage").WithAttr("dtypes", {DT_FLOAT})); + + // Intentionally give add0 and add1 the same initial scope but they should + // be separated by the ClusterScopingPass. + Node* add0 = ops::BinaryOp("Add", a, b, + builder.opts().WithName("add0").WithAttr( + kXlaInternalScopeAttr, "ClusterA")); + Node* add1 = ops::BinaryOp("Add", unstage, b, + builder.opts().WithName("add1").WithAttr( + kXlaInternalScopeAttr, "ClusterA")); + Node* relu0 = ops::UnaryOp("Relu", add0, + builder.opts().WithName("relu0").WithAttr( + kXlaInternalScopeAttr, "ClusterB")); + ops::UnaryOp("Relu", add1, + builder.opts().WithName("relu1").WithAttr( + kXlaInternalScopeAttr, "ClusterD")); + BuildStageNode(builder, "stage", {DT_FLOAT}, {relu0}); + + TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get())); + } + + TF_ASSERT_OK(ClusterScoping(&graph)); + + auto scopes = GetXlaInternalScopes(*graph); + EXPECT_NE(scopes["add0"], scopes["add1"]); + EXPECT_NE(scopes["add0"], scopes["relu0"]); + EXPECT_NE(scopes["add1"], scopes["relu1"]); +} + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/jit/compilability_check_util.cc b/tensorflow/compiler/jit/compilability_check_util.cc index 5e3b93d30e5..6498436fbd9 100644 --- a/tensorflow/compiler/jit/compilability_check_util.cc +++ b/tensorflow/compiler/jit/compilability_check_util.cc @@ -37,6 +37,8 @@ limitations under the License. #include "tensorflow/compiler/jit/graphcycles/graphcycles.h" #include "tensorflow/compiler/jit/resource_operation_safety_analysis.h" #include "tensorflow/compiler/jit/union_find.h" +#include "tensorflow/compiler/jit/xla_activity.pb.h" +#include "tensorflow/compiler/jit/xla_activity_listener.h" #include "tensorflow/compiler/jit/xla_cluster_util.h" #include "tensorflow/compiler/tf2xla/const_analysis.h" #include "tensorflow/compiler/tf2xla/resource_operation_table.h" @@ -44,6 +46,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/core/framework/attr_value.pb.h" #include "tensorflow/core/framework/bounds_check.h" #include "tensorflow/core/framework/graph_def_util.h" #include "tensorflow/core/framework/memory_types.h" @@ -83,7 +86,7 @@ Status MakeCallNodeFromAttribute(const Node& node, const std::string& attr_name, } // anonymous namespace -std::vector +RecursiveCompilabilityChecker::UncompilableNodesMap RecursiveCompilabilityChecker::FindUncompilableNodes( const Node& node, FunctionLibraryRuntime* lib_runtime, const std::vector* @@ -98,12 +101,14 @@ RecursiveCompilabilityChecker::FindUncompilableNodes( } } stack_trace.emplace_back(StackFrameView{node.name(), ""}); - std::vector uncompilable_nodes; - IsCompilableNode(node, lib_runtime, &stack_trace, &uncompilable_nodes); + + RecursiveCompilabilityChecker::UncompilableNodesMap uncompilable_nodes; + IsCompilableNode(node, lib_runtime, &stack_trace, + /*encapsulating_function=*/nullptr, &uncompilable_nodes); return uncompilable_nodes; } -std::vector +RecursiveCompilabilityChecker::UncompilableNodesMap RecursiveCompilabilityChecker::FindUncompilableNodes( const NodeDef& call_def, FunctionLibraryRuntime* lib_runtime, const std::vector* @@ -118,8 +123,10 @@ RecursiveCompilabilityChecker::FindUncompilableNodes( } } stack_trace.emplace_back(StackFrameView{call_def.name(), ""}); - std::vector uncompilable_nodes; - IsCompilableCall(call_def, lib_runtime, &stack_trace, &uncompilable_nodes); + + RecursiveCompilabilityChecker::UncompilableNodesMap uncompilable_nodes; + IsCompilableCall(call_def, lib_runtime, &stack_trace, + /*encapsulating_function=*/nullptr, &uncompilable_nodes); return uncompilable_nodes; } @@ -154,16 +161,18 @@ bool RecursiveCompilabilityChecker::HasXLAKernel(const Node& node) const { bool RecursiveCompilabilityChecker::IsCompilableIf( const Node& if_node, FunctionLibraryRuntime* lib_runtime, std::vector* stack_trace, - std::vector* uncompilable_nodes) const { + NameAttrList* encapsulating_function, + RecursiveCompilabilityChecker::UncompilableNodesMap* uncompilable_nodes) + const { bool is_compilable = true; is_compilable &= ExtractNodeDefAndCheckCompilability( - if_node, "then_branch", "if_then", lib_runtime, stack_trace, - uncompilable_nodes); + if_node, "then_branch", "if_then", encapsulating_function, lib_runtime, + stack_trace, uncompilable_nodes); if (!uncompilable_nodes && !is_compilable) return is_compilable; is_compilable &= ExtractNodeDefAndCheckCompilability( - if_node, "else_branch", "if_else", lib_runtime, stack_trace, - uncompilable_nodes); + if_node, "else_branch", "if_else", encapsulating_function, lib_runtime, + stack_trace, uncompilable_nodes); return is_compilable; } @@ -174,37 +183,43 @@ bool RecursiveCompilabilityChecker::IsCompilableIf( bool RecursiveCompilabilityChecker::IsCompilableWhile( const Node& while_node, FunctionLibraryRuntime* lib_runtime, std::vector* stack_trace, - std::vector* uncompilable_nodes) const { + NameAttrList* encapsulating_function, + RecursiveCompilabilityChecker::UncompilableNodesMap* uncompilable_nodes) + const { bool is_compilable = true; is_compilable &= ExtractNodeDefAndCheckCompilability( - while_node, "cond", "while_cond", lib_runtime, stack_trace, - uncompilable_nodes); + while_node, "cond", "while_cond", encapsulating_function, lib_runtime, + stack_trace, uncompilable_nodes); + if (!uncompilable_nodes && !is_compilable) return is_compilable; 
is_compilable &= ExtractNodeDefAndCheckCompilability( - while_node, "body", "while_body", lib_runtime, stack_trace, - uncompilable_nodes); + while_node, "body", "while_body", encapsulating_function, lib_runtime, + stack_trace, uncompilable_nodes); return is_compilable; } bool RecursiveCompilabilityChecker::ExtractNodeDefAndCheckCompilability( const Node& node, const std::string& attr_name, - const std::string& call_name, FunctionLibraryRuntime* lib_runtime, + const std::string& call_name, NameAttrList* encapsulating_function, + FunctionLibraryRuntime* lib_runtime, std::vector* stack_trace, - std::vector* uncompilable_nodes) const { + RecursiveCompilabilityChecker::UncompilableNodesMap* uncompilable_nodes) + const { NodeDef call; call.set_name(call_name); if (!MakeCallNodeFromAttribute(node, attr_name, &call).ok()) { const auto uncompilable_reason = absl::StrCat( "missing '", attr_name, "' attribute from node", node.name()); MaybeMarkUncompilableNode(uncompilable_reason, *stack_trace, - uncompilable_nodes); + encapsulating_function, uncompilable_nodes); VLOG(2) << "Rejecting node " << node.name() << ": " << uncompilable_reason << "."; return false; } - if (!IsCompilableCall(call, lib_runtime, stack_trace, uncompilable_nodes)) { + if (!IsCompilableCall(call, lib_runtime, stack_trace, encapsulating_function, + uncompilable_nodes)) { VLOG(2) << "Rejecting node " << node.name() << ": can't compile : " << call.op(); return false; @@ -218,24 +233,33 @@ bool RecursiveCompilabilityChecker::ExtractNodeDefAndCheckCompilability( bool RecursiveCompilabilityChecker::IsCompilableCall( const NodeDef& call_def, FunctionLibraryRuntime* lib_runtime, std::vector* stack_trace, - std::vector* uncompilable_nodes) const { + NameAttrList* encapsulating_function, + RecursiveCompilabilityChecker::UncompilableNodesMap* uncompilable_nodes) + const { if (stack_trace->size() > kMaxRecursionDepth) { std::string uncompilable_reason = "function depth limit exceeded"; MaybeMarkUncompilableNode(uncompilable_reason, *stack_trace, - uncompilable_nodes); + encapsulating_function, uncompilable_nodes); VLOG(2) << "Rejecting " << call_def.op() << ": " << uncompilable_reason << "."; return false; } FunctionLibraryRuntime::Handle handle; - Status status = InstantiateFunctionCall(call_def, lib_runtime, &handle); - if (!status.ok()) { + Status s; + NameAttrList function; + s = NameAndAttrsFromFunctionCall(call_def, &function); + if (s.ok()) { + s = lib_runtime->Instantiate(function.name(), AttrSlice(&function.attr()), + &handle); + } + + if (!s.ok()) { std::string uncompilable_reason = "could not instantiate call"; MaybeMarkUncompilableNode(uncompilable_reason, *stack_trace, - uncompilable_nodes); + encapsulating_function, uncompilable_nodes); VLOG(2) << "Rejecting " << call_def.DebugString() << ": " - << uncompilable_reason << " : " << status; + << uncompilable_reason << " : " << s; return false; } @@ -244,9 +268,9 @@ bool RecursiveCompilabilityChecker::IsCompilableCall( const FunctionBody* fbody = lib_runtime->GetFunctionBody(handle); bool is_compilable = true; for (const Node* node : fbody->graph->op_nodes()) { - stack_trace->emplace_back(StackFrameView{node->name(), call_def.op()}); - is_compilable &= - IsCompilableNode(*node, lib_runtime, stack_trace, uncompilable_nodes); + stack_trace->emplace_back(StackFrameView{node->name(), function.name()}); + is_compilable &= IsCompilableNode(*node, lib_runtime, stack_trace, + &function, uncompilable_nodes); stack_trace->pop_back(); if (!uncompilable_nodes && !is_compilable) return 
is_compilable; } @@ -263,20 +287,28 @@ bool RecursiveCompilabilityChecker::OpIsInaccurate(const Node& node) const { bool RecursiveCompilabilityChecker::OpIsSlow(const Node& node) const { // b/128001705: SelfAdjointEigV2 and Svd performance issues. // b/135640736: MatrixInverse performance issues. + // https://github.com/tensorflow/tensorflow/pull/31012: + // ResizeNearestNeighbor, ResizeBilinear, and ResizeBilinearGrad sometimes + // create convolutions too large for CuDNN to handle. return node.type_string() == "SelfAdjointEigV2" || node.type_string() == "Svd" || node.type_string() == "Qr" || - node.type_string() == "MatrixInverse"; + node.type_string() == "MatrixInverse" || + node.type_string() == "ResizeNearestNeighbor" || + node.type_string() == "ResizeBilinear" || + node.type_string() == "ResizeBilinearGrad"; } bool RecursiveCompilabilityChecker::IsCompilableNode( const Node& node, FunctionLibraryRuntime* lib_runtime, std::vector* stack_trace, - std::vector* uncompilable_nodes) const { + NameAttrList* encapsulating_function, + RecursiveCompilabilityChecker::UncompilableNodesMap* uncompilable_nodes) + const { auto stack_depth = stack_trace->size(); if (node.IsSource() || node.IsSink()) { absl::string_view uncompilable_reason = "source or sink node"; MaybeMarkUncompilableNode(uncompilable_reason, *stack_trace, - uncompilable_nodes); + encapsulating_function, uncompilable_nodes); LogNotCompilable(node, uncompilable_reason); return false; } @@ -287,7 +319,7 @@ bool RecursiveCompilabilityChecker::IsCompilableNode( (node.type_string() == "_Arg" || node.type_string() == "_Retval")) { absl::string_view uncompilable_reason = "top level _Arg or _Retval"; MaybeMarkUncompilableNode(uncompilable_reason, *stack_trace, - uncompilable_nodes); + encapsulating_function, uncompilable_nodes); LogNotCompilable(node, uncompilable_reason); return false; } @@ -299,33 +331,35 @@ bool RecursiveCompilabilityChecker::IsCompilableNode( absl::string_view uncompilable_reason = "_scoped_allocator or _forward_from attribute"; MaybeMarkUncompilableNode(uncompilable_reason, *stack_trace, - uncompilable_nodes); + encapsulating_function, uncompilable_nodes); LogNotCompilable(node, uncompilable_reason); return false; } if (IsFunctionCall(*lib_runtime->GetFunctionLibraryDefinition(), node)) { if (!IsCompilableCall(node.def(), lib_runtime, stack_trace, - uncompilable_nodes)) { + encapsulating_function, uncompilable_nodes)) { LogNotCompilable(node, "unsupported function"); return false; } } else if (!HasXLAKernel(node)) { absl::string_view uncompilable_reason = "unsupported op"; MaybeMarkUncompilableNode(uncompilable_reason, *stack_trace, - uncompilable_nodes); + encapsulating_function, uncompilable_nodes); LogNotCompilable(node, uncompilable_reason); return false; } - if (node.type_string() == "While" && - !IsCompilableWhile(node, lib_runtime, stack_trace, uncompilable_nodes)) { + if (node.IsWhileNode() && + !IsCompilableWhile(node, lib_runtime, stack_trace, encapsulating_function, + uncompilable_nodes)) { LogNotCompilable(node, "unsupported while"); return false; } if (node.IsIfNode() && - !IsCompilableIf(node, lib_runtime, stack_trace, uncompilable_nodes)) { + !IsCompilableIf(node, lib_runtime, stack_trace, encapsulating_function, + uncompilable_nodes)) { LogNotCompilable(node, "unsupported if"); return false; } @@ -334,7 +368,7 @@ bool RecursiveCompilabilityChecker::IsCompilableNode( IsStatefulRandomOp(node.type_string())) { absl::string_view uncompilable_reason = "stateful random op"; 
MaybeMarkUncompilableNode(uncompilable_reason, *stack_trace, - uncompilable_nodes); + encapsulating_function, uncompilable_nodes); LogNotCompilable(node, uncompilable_reason); return false; } @@ -342,7 +376,7 @@ bool RecursiveCompilabilityChecker::IsCompilableNode( if (!op_filter_.allow_control_trigger && node.IsControlTrigger()) { absl::string_view uncompilable_reason = "not allowed control trigger"; MaybeMarkUncompilableNode(uncompilable_reason, *stack_trace, - uncompilable_nodes); + encapsulating_function, uncompilable_nodes); LogNotCompilable(node, uncompilable_reason); return false; } @@ -351,7 +385,7 @@ bool RecursiveCompilabilityChecker::IsCompilableNode( IsAssertOrCheckNumerics(node.type_string())) { absl::string_view uncompilable_reason = "Assert or CheckNumerics"; MaybeMarkUncompilableNode(uncompilable_reason, *stack_trace, - uncompilable_nodes); + encapsulating_function, uncompilable_nodes); LogNotCompilable(node, uncompilable_reason); return false; } @@ -360,7 +394,7 @@ bool RecursiveCompilabilityChecker::IsCompilableNode( OpProducesOrConsumesVariant(node)) { absl::string_view uncompilable_reason = "DT_VARIANT producer/consumer"; MaybeMarkUncompilableNode(uncompilable_reason, *stack_trace, - uncompilable_nodes); + encapsulating_function, uncompilable_nodes); LogNotCompilable(node, uncompilable_reason); return false; } @@ -368,7 +402,7 @@ bool RecursiveCompilabilityChecker::IsCompilableNode( if (!op_filter_.allow_stack_ops && IsStackOp(node)) { absl::string_view uncompilable_reason = "Stack op"; MaybeMarkUncompilableNode(uncompilable_reason, *stack_trace, - uncompilable_nodes); + encapsulating_function, uncompilable_nodes); LogNotCompilable(node, uncompilable_reason); return false; } @@ -376,7 +410,7 @@ bool RecursiveCompilabilityChecker::IsCompilableNode( if (!op_filter_.allow_tensor_array_ops && IsTensorArrayOp(node)) { absl::string_view uncompilable_reason = "TensorArray op"; MaybeMarkUncompilableNode(uncompilable_reason, *stack_trace, - uncompilable_nodes); + encapsulating_function, uncompilable_nodes); LogNotCompilable(node, uncompilable_reason); return false; } @@ -386,7 +420,7 @@ bool RecursiveCompilabilityChecker::IsCompilableNode( absl::string_view uncompilable_reason = "resource variable op in called function"; MaybeMarkUncompilableNode(uncompilable_reason, *stack_trace, - uncompilable_nodes); + encapsulating_function, uncompilable_nodes); LogNotCompilable(node, uncompilable_reason); return false; } @@ -394,16 +428,22 @@ bool RecursiveCompilabilityChecker::IsCompilableNode( if (!op_filter_.allow_inaccurate_ops && OpIsInaccurate(node)) { absl::string_view uncompilable_reason = "operation with numerical accuracy issues"; + BroadcastOptimizationRemark(XlaOptimizationRemark::INACCURATE_OPERATION, + node.DebugString()) + .IgnoreError(); MaybeMarkUncompilableNode(uncompilable_reason, *stack_trace, - uncompilable_nodes); + encapsulating_function, uncompilable_nodes); LogNotCompilable(node, uncompilable_reason); return false; } if (!op_filter_.allow_slow_ops && OpIsSlow(node)) { absl::string_view uncompilable_reason = "slow operation"; + BroadcastOptimizationRemark(XlaOptimizationRemark::SLOW_OPERATION, + node.DebugString()) + .IgnoreError(); MaybeMarkUncompilableNode(uncompilable_reason, *stack_trace, - uncompilable_nodes); + encapsulating_function, uncompilable_nodes); LogNotCompilable(node, uncompilable_reason); return false; } @@ -432,8 +472,9 @@ RecursiveCompilabilityChecker::OperationFilter CreateOperationFilter( /*static*/ void 
RecursiveCompilabilityChecker::MaybeMarkUncompilableNode( const absl::string_view reason, const std::vector& stack_trace, - std::vector* uncompilable_node_list) { - if (!uncompilable_node_list) return; + NameAttrList* encapsulating_function, + RecursiveCompilabilityChecker::UncompilableNodesMap* uncompilable_nodes) { + if (!uncompilable_nodes) return; UncompilableNodeInfo node_info; node_info.uncompilable_reason = std::string(reason); @@ -445,7 +486,20 @@ RecursiveCompilabilityChecker::OperationFilter CreateOperationFilter( }); node_info.name = std::string(stack_trace.back().name); - (*uncompilable_node_list).push_back(std::move(node_info)); + auto function = + encapsulating_function ? *encapsulating_function : NameAttrList(); + auto function_identifier = function.ShortDebugString(); + + auto it = uncompilable_nodes->find(function_identifier); + if (it == uncompilable_nodes->end()) { + std::vector + uncompileable_node_info{std::move(node_info)}; + uncompilable_nodes->emplace( + std::move(function_identifier), + std::make_pair(function, std::move(uncompileable_node_info))); + } else { + it->second.second.emplace_back(std::move(node_info)); + } } } // namespace tensorflow diff --git a/tensorflow/compiler/jit/compilability_check_util.h b/tensorflow/compiler/jit/compilability_check_util.h index 2ad3496bb7c..04639df14a1 100644 --- a/tensorflow/compiler/jit/compilability_check_util.h +++ b/tensorflow/compiler/jit/compilability_check_util.h @@ -33,6 +33,7 @@ limitations under the License. #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/core/framework/attr_value.pb.h" #include "tensorflow/core/framework/bounds_check.h" #include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/graph_def_util.h" @@ -129,19 +130,35 @@ class RecursiveCompilabilityChecker { const DeviceType* jit_device_type) : op_filter_(*op_filter), jit_device_type_(*jit_device_type) {} - // Returns a list of uncompilable nodes. When `node` is inside a function - // body, users can set `node_stack_trace` to provide an additional - // context for `node`'s placement within the outer most graph. - std::vector FindUncompilableNodes( + using UncompilableNodesMap = + std::map>>; + + // Returns a map where the key is the function identifier(short debug + // string) of the function encapsulating the uncompilable nodes, and the + // value is a pair of NameAttrList of the function and a vector of + // uncompilable node info. When uncompilable node is not inside any + // function call nodes, then key is a ShortDebugString() of an empty + // NameAttrList. + // + // Also, when `node` is inside a function body, users can set + // `node_stack_trace` to provide an additional context for `node`'s + // placement within the outer most graph. + UncompilableNodesMap FindUncompilableNodes( const Node& node, FunctionLibraryRuntime* lib_runtime, const std::vector* node_stack_trace = nullptr) const; - // Returns a list of uncompilable nodes in `call_def` that cannot be - // compiled by XLA. It is assumed that `call_def` is a call operation. - // When `node` is inside a function body, users can set + // Returns a map where the key is the function identifier(short debug + // string) of the function encapsulating the uncompilable nodes, and the + // value is a pair of NameAttrList of the function and a vector of + // uncompilable node info. 
When uncompilable node is not inside any + // function call nodes, then key is a ShortDebugString() of an empty + // NameAttrList. + // + // Also, when `node` is inside a function body, users can set // `node_stack_trace` to provide an additional context for `node`'s // placement within the outer most graph. - std::vector FindUncompilableNodes( + UncompilableNodesMap FindUncompilableNodes( const NodeDef& call_def, FunctionLibraryRuntime* lib_runtime, const std::vector* node_stack_trace = nullptr) const; @@ -176,27 +193,31 @@ class RecursiveCompilabilityChecker { bool IsCompilableNode( const Node& node, FunctionLibraryRuntime* lib_runtime, std::vector* stack_trace, - std::vector* uncompilable_nodes = nullptr) const; + NameAttrList* encapsulating_function = nullptr, + UncompilableNodesMap* uncompilable_nodes = nullptr) const; bool IsCompilableCall( const NodeDef& call_def, FunctionLibraryRuntime* lib_runtime, std::vector* stack_trace, - std::vector* uncompilable_nodes = nullptr) const; - bool IsCompilableIf( - const Node& if_node, FunctionLibraryRuntime* lib_runtime, - std::vector* stack_trace, - std::vector* uncompilable_nodes) const; - bool IsCompilableWhile( - const Node& while_node, FunctionLibraryRuntime* lib_runtime, - std::vector* stack_trace, - std::vector* uncompilable_nodes) const; + NameAttrList* encapsulating_function = nullptr, + UncompilableNodesMap* uncompilable_nodes = nullptr) const; + bool IsCompilableIf(const Node& if_node, FunctionLibraryRuntime* lib_runtime, + std::vector* stack_trace, + NameAttrList* encapsulating_function, + UncompilableNodesMap* uncompilable_nodes) const; + bool IsCompilableWhile(const Node& while_node, + FunctionLibraryRuntime* lib_runtime, + std::vector* stack_trace, + NameAttrList* encapsulating_function, + UncompilableNodesMap* uncompilable_nodes) const; // Returns compilability of node def retrieved from `node`'s attribute with // name `attr_name`. bool ExtractNodeDefAndCheckCompilability( const Node& node, const std::string& attr_name, - const std::string& call_name, FunctionLibraryRuntime* lib_runtime, + const std::string& call_name, NameAttrList* encapsulating_function, + FunctionLibraryRuntime* lib_runtime, std::vector* stack_trace, - std::vector* uncompilable_nodes) const; + UncompilableNodesMap* uncompilable_nodes) const; bool IsStackOp(const Node& node) const { const XlaResourceOpInfo* op_info = @@ -231,7 +252,8 @@ class RecursiveCompilabilityChecker { static void MaybeMarkUncompilableNode( const absl::string_view reason, const std::vector& stack_trace, - std::vector* uncompilable_node_list); + NameAttrList* encapsulating_function, + UncompilableNodesMap* uncompilable_nodes_map); // Make sure we don't recurse infinitely on recursive functions. const int kMaxRecursionDepth = 10; diff --git a/tensorflow/compiler/jit/compilability_check_util_test.cc b/tensorflow/compiler/jit/compilability_check_util_test.cc index 90d69680514..0dd3b8141c9 100644 --- a/tensorflow/compiler/jit/compilability_check_util_test.cc +++ b/tensorflow/compiler/jit/compilability_check_util_test.cc @@ -21,8 +21,10 @@ limitations under the License. 
#include "tensorflow/cc/ops/standard_ops.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/core/framework/attr_value.pb.h" #include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/graph_to_functiondef.h" +#include "tensorflow/core/framework/node_def_util.h" #include "tensorflow/core/graph/graph_def_builder.h" #include "tensorflow/core/graph/graph_def_builder_util.h" #include "tensorflow/core/lib/core/status_test_util.h" @@ -117,10 +119,15 @@ TEST_F(CompilabilityCheckUtilTest, CheckNonFunctionalNodes) { const auto uncompilable_nodes = checker_->FindUncompilableNodes(*uncompilable_op, flib_runtime); ASSERT_EQ(1, uncompilable_nodes.size()); - const auto& node_info = uncompilable_nodes.at(0); - EXPECT_EQ("unsupported op", node_info.uncompilable_reason); - ASSERT_EQ(1, node_info.stack_trace.size()); - ASSERT_EQ("", node_info.stack_trace.at(0).function_name); + auto node_info_it = + uncompilable_nodes.find(NameAttrList().ShortDebugString()); + ASSERT_NE(uncompilable_nodes.end(), node_info_it); + const auto& uncompilable_nodes_inside_function = node_info_it->second.second; + ASSERT_EQ(1, uncompilable_nodes_inside_function.size()); + const auto& uncompilable_node_info = uncompilable_nodes_inside_function.at(0); + EXPECT_EQ("unsupported op", uncompilable_node_info.uncompilable_reason); + ASSERT_EQ(1, uncompilable_node_info.stack_trace.size()); + ASSERT_EQ("", uncompilable_node_info.stack_trace.at(0).function_name); } TEST_F(CompilabilityCheckUtilTest, CheckSimpleFunctionNode) { @@ -147,12 +154,18 @@ TEST_F(CompilabilityCheckUtilTest, CheckSimpleFunctionNode) { checker_->FindUncompilableNodes(*functional_node, flib_runtime); EXPECT_EQ(1, uncompilable_nodes.size()); - const auto& node_info = uncompilable_nodes.at(0); + NameAttrList function; + function.set_name(kUncompilableFunctionName); + const auto node_info_it = + uncompilable_nodes.find(function.ShortDebugString()); + ASSERT_NE(uncompilable_nodes.end(), node_info_it); + const auto& uncompilable_node_list = node_info_it->second.second; + ASSERT_EQ(1, uncompilable_node_list.size()); + const auto& node_info = uncompilable_node_list.at(0); const auto& node_stack = node_info.stack_trace; ASSERT_EQ(2, node_stack.size()); EXPECT_EQ("D", node_stack.at(0).name); EXPECT_EQ(kUncompilableFunctionNodeName, node_stack.at(1).name); - EXPECT_EQ(kUncompilableFunctionNodeName, node_info.name); EXPECT_EQ("unsupported op", node_info.uncompilable_reason); } @@ -212,7 +225,15 @@ TEST_F(CompilabilityCheckUtilTest, CheckFunctionalWhileNode) { checker_->FindUncompilableNodes(**while_node_it, flib_runtime); ASSERT_EQ(1, uncompilable_nodes.size()); - const auto& node_info = uncompilable_nodes.at(0); + NameAttrList function; + function.set_name(kUncompilableFunctionName); + const auto node_info_it = + uncompilable_nodes.find(function.ShortDebugString()); + ASSERT_NE(uncompilable_nodes.end(), node_info_it); + const auto& uncompilable_node_list = node_info_it->second.second; + ASSERT_EQ(1, uncompilable_node_list.size()); + const auto& node_info = uncompilable_node_list.at(0); + const auto& node_stack = node_info.stack_trace; ASSERT_EQ(2, node_stack.size()); const auto& stacktrace_first_node_info = node_stack.at(0); @@ -280,7 +301,14 @@ TEST_F(CompilabilityCheckUtilTest, CheckFunctionalIfNode) { checker_->FindUncompilableNodes(**if_node_it, flib_runtime); ASSERT_EQ(2, uncompilable_nodes.size()); - const auto& uncompilable_node_one = uncompilable_nodes.at(0); + 
NameAttrList function_one; + function_one.set_name(kUncompilableFunctionName); + auto it = uncompilable_nodes.find(function_one.ShortDebugString()); + ASSERT_NE(uncompilable_nodes.end(), it); + + const auto& uncompilable_node_list = it->second.second; + ASSERT_EQ(1, uncompilable_node_list.size()); + const auto& uncompilable_node_one = uncompilable_node_list.at(0); const auto& node_one_stack = uncompilable_node_one.stack_trace; ASSERT_EQ(2, node_one_stack.size()); @@ -296,7 +324,14 @@ TEST_F(CompilabilityCheckUtilTest, CheckFunctionalIfNode) { EXPECT_EQ(kUncompilableFunctionNodeName, uncompilable_node_one.name); EXPECT_EQ("unsupported op", uncompilable_node_one.uncompilable_reason); - const auto& uncompilable_node_two = uncompilable_nodes.at(1); + NameAttrList function_two; + function_two.set_name(kUncompilableFunctionTwoName); + it = uncompilable_nodes.find(function_two.ShortDebugString()); + ASSERT_NE(uncompilable_nodes.end(), it); + + const auto& uncompilable_node_two_list = it->second.second; + ASSERT_EQ(1, uncompilable_node_two_list.size()); + const auto& uncompilable_node_two = uncompilable_node_two_list.at(0); const auto& node_two_stack = uncompilable_node_two.stack_trace; ASSERT_EQ(2, node_two_stack.size()); const auto& node_two_stacktrace_first_node = node_two_stack.at(0); diff --git a/tensorflow/compiler/jit/defs.cc b/tensorflow/compiler/jit/defs.cc index f847d66f3c6..b23f6ec35f5 100644 --- a/tensorflow/compiler/jit/defs.cc +++ b/tensorflow/compiler/jit/defs.cc @@ -18,6 +18,12 @@ limitations under the License. namespace tensorflow { const char* const kXlaCompileAttr = "_XlaCompile"; + +// User-provided through jit_scope APIs. Effective only when auto_jit is OFF. const char* const kXlaScopeAttr = "_XlaScope"; +// Automatically inserted by auto_jit to guide clustering results. Effective +// only when auto_jit is ON. +const char* const kXlaInternalScopeAttr = "_XlaInternalScope"; + } // namespace tensorflow diff --git a/tensorflow/compiler/jit/defs.h b/tensorflow/compiler/jit/defs.h index a3aabc949db..bf8009344df 100644 --- a/tensorflow/compiler/jit/defs.h +++ b/tensorflow/compiler/jit/defs.h @@ -24,6 +24,7 @@ namespace tensorflow { // Name of attribute used to tag operators for compilation with XLA extern const char* const kXlaCompileAttr; // "_XlaCompile" extern const char* const kXlaScopeAttr; // "_XlaScope" +extern const char* const kXlaInternalScopeAttr; // "_XlaInternalScope" } // namespace tensorflow diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc index 6992a0165d4..e0c0c0b18cc 100644 --- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc +++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc @@ -1317,7 +1317,7 @@ Status EncapsulateSubgraphsPass::Run( bool IsXlaCompiledKernel(const Node& node) { bool is_compiled = false; bool has_compilation_attr = - GetNodeAttr(node.attrs(), kXlaCompiledKernelAttr, &is_compiled).ok() && + TryGetNodeAttr(node.attrs(), kXlaCompiledKernelAttr, &is_compiled) && is_compiled; return has_compilation_attr ? is_compiled : false; } diff --git a/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc b/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc index 2c2cd094133..b9889988cc0 100644 --- a/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc +++ b/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc @@ -245,8 +245,8 @@ Status RewriteSubgraph(const std::vector& arg_source_tensors, // while iterating. 
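The hunk below, like several others in this patch (for example in encapsulate_subgraphs_pass.cc above), replaces the GetNodeAttr(...).ok() idiom with the lighter attribute helpers GetNodeAttrString and TryGetNodeAttr from tensorflow/core/framework/node_def_util.h. A minimal sketch of the two styles follows; it is not part of the patch, and the helper name HasNonEmptyClusterAttr is made up purely for illustration:

#include "tensorflow/core/framework/node_def_util.h"

namespace tensorflow {

// Hypothetical helper, for illustration only.
bool HasNonEmptyClusterAttr(const Node& n, absl::string_view attr_name) {
  // Old idiom: copy the value and construct a Status just to test presence:
  //   string name;
  //   if (GetNodeAttr(n.attrs(), attr_name, &name).ok()) { ... }
  // Idiom adopted by this patch: GetNodeAttrString returns a reference to the
  // stored string, or an empty string when the attribute is absent, so no
  // Status or copy is needed. TryGetNodeAttr plays the analogous role for
  // non-string attributes.
  const string& name = GetNodeAttrString(n.attrs(), attr_name);
  return !name.empty();
}

}  // namespace tensorflow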
std::vector launch_nodes; for (Node* n : graph->nodes()) { - string name; - if (GetNodeAttr(n->attrs(), kXlaClusterAttr, &name).ok()) { + const string& name = GetNodeAttrString(n->attrs(), kXlaClusterAttr); + if (!name.empty()) { launch_nodes.push_back(n); } } diff --git a/tensorflow/compiler/jit/extract_outside_compilation_pass.cc b/tensorflow/compiler/jit/extract_outside_compilation_pass.cc index 8935cdfc240..b35e08fb1f0 100644 --- a/tensorflow/compiler/jit/extract_outside_compilation_pass.cc +++ b/tensorflow/compiler/jit/extract_outside_compilation_pass.cc @@ -24,7 +24,6 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/tf2xla_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/core/common_runtime/function.h" -#include "tensorflow/core/common_runtime/lower_functional_ops.h" #include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/graph_to_functiondef.h" #include "tensorflow/core/framework/node_def_builder.h" @@ -33,6 +32,7 @@ limitations under the License. #include "tensorflow/core/graph/algorithm.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/gtl/cleanup.h" +#include "tensorflow/core/platform/macros.h" #include "tensorflow/core/util/dump_graph.h" #include "tensorflow/stream_executor/lib/statusor.h" @@ -369,7 +369,8 @@ xla::StatusOr BuildXlaHostComputeNodeDef( return new_def; } -Status ValidateOutsideCompilationCallNode(Node* call_node) { +TF_ATTRIBUTE_NOINLINE Status +ValidateOutsideCompilationCallNode(Node* call_node) { // DT_INT64 as input/output for outside compilation is not supported yet: // b/120809951. for (const Edge* e : call_node->in_edges()) { @@ -402,7 +403,7 @@ Status ValidateOutsideCompilationCallNode(Node* call_node) { } // Replace outside compilation function call node with XlaHostCompute node. -xla::StatusOr ReplaceOutsideCompilationCallNode( +TF_ATTRIBUTE_NOINLINE xla::StatusOr ReplaceOutsideCompilationCallNode( Graph* g, Node* call_node, const std::map& host_compute_core, const absl::flat_hash_map>& cluster_deps) { // Build XlaHostCompute NodeDef. @@ -440,7 +441,7 @@ Status ResetDeviceOrdinalToPlaceholderValue(Graph* g) { n->ClearAttr(attr_name); n->AddAttr(attr_name, branch_func); } - } else if (n->type_string() == "While") { + } else if (n->IsWhileNode()) { for (const string& attr_name : std::vector{"cond", "body"}) { NameAttrList branch_func; TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), attr_name, &branch_func)); @@ -523,16 +524,14 @@ xla::StatusOr> UpdateTypesAttribute( // Add edges from lifted outside compilation argument nodes to `n` in Graph `g`. void AddEdgesFromOutsideCompilationNodes( - const int original_arg_count, const std::vector& data_types, - const std::vector>& - lifted_arg_nodes_and_outside_compilation_nodes, - Graph* g, Node* n) { + const int original_arg_count, const int arg_to_input_edge_offset, + const std::vector& data_types, + const std::vector& outside_compilation_nodes, Graph* g, Node* n) { // Add edges from outside compilation nodes to While node. 
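+  // The offset accounts for call-node inputs that do not correspond to _Arg
+  // nodes of the callee function: it is 0 for While and function-call nodes,
+  // and 1 for If nodes, whose input #0 is the predicate, so _Arg #i maps to
+  // input #i + 1.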
for (int i = original_arg_count; i < data_types.size(); i++) { Node* outside_compilation_node = - lifted_arg_nodes_and_outside_compilation_nodes[i - original_arg_count] - .second; - g->AddEdge(outside_compilation_node, 0, n, i); + outside_compilation_nodes[i - original_arg_count]; + g->AddEdge(outside_compilation_node, 0, n, i + arg_to_input_edge_offset); } } @@ -573,14 +572,15 @@ Status AddMatchingRetvalNode(const FunctionBody& function_body, void ReplaceLiftedArgNodePlaceholderWithArg( const FunctionBody& function_body, const int original_arg_count, - const int arg_idx, - const std::vector>& - lifted_arg_nodes_and_outside_compilation_nodes, + const int arg_idx, const std::vector& lifted_arg_nodes, Node* arg_node) { - Node* lifted_arg_node = - lifted_arg_nodes_and_outside_compilation_nodes[arg_idx - - original_arg_count] - .first; + Node* lifted_arg_node = lifted_arg_nodes[arg_idx - original_arg_count]; + // This might happen because lifted_arg_node only exists in one branch of an + // If node, and we are handling the other branch. + if (!lifted_arg_node) { + return; + } + for (const Edge* e : lifted_arg_node->out_edges()) { if (e->IsControlEdge()) { function_body.graph->AddControlEdge(arg_node, e->dst()); @@ -588,7 +588,6 @@ void ReplaceLiftedArgNodePlaceholderWithArg( function_body.graph->AddEdge(arg_node, 0, e->dst(), e->dst_input()); } } - function_body.graph->RemoveNode(lifted_arg_node); } @@ -597,7 +596,7 @@ void ReplaceLiftedArgNodePlaceholderWithArg( Status PostprocessLiftedArgsForWhile( const std::unordered_map& outside_compilation_attr_to_node, Graph* g, Node* n, FunctionLibraryDefinition* fld) { - TF_RET_CHECK(n->type_string() == "While"); + TF_RET_CHECK(n->IsWhileNode()); // Check if there is any lifted args in body function. NameAttrList body_func; @@ -629,12 +628,25 @@ Status PostprocessLiftedArgsForWhile( n)); // Add edges from outside compilation nodes to While node. - AddEdgesFromOutsideCompilationNodes( - original_arg_count, data_types, - lifted_arg_nodes_and_outside_compilation_nodes, g, n); + std::vector outside_compilation_nodes; + std::transform( + lifted_arg_nodes_and_outside_compilation_nodes.begin(), + lifted_arg_nodes_and_outside_compilation_nodes.end(), + std::back_inserter(outside_compilation_nodes), + [](const std::pair& pair) { return pair.second; }); + AddEdgesFromOutsideCompilationNodes(original_arg_count, + /*arg_to_input_edge_offset=*/0, + data_types, outside_compilation_nodes, g, + n); // In body_graph, create new _Arg/_Retval nodes, and replace lifted arg // nodes with the new _Arg nodes. 
+ std::vector lifted_arg_nodes; + std::transform( + lifted_arg_nodes_and_outside_compilation_nodes.begin(), + lifted_arg_nodes_and_outside_compilation_nodes.end(), + std::back_inserter(lifted_arg_nodes), + [](const std::pair& pair) { return pair.first; }); for (int i = original_arg_count; i < data_types.size(); i++) { TF_ASSIGN_OR_RETURN(Node * arg_node, AddOutsideCompilationInputArgToFunctionBody( @@ -644,8 +656,7 @@ Status PostprocessLiftedArgsForWhile( AddMatchingRetvalNode(*body_function_body, i, data_types[i], arg_node)); ReplaceLiftedArgNodePlaceholderWithArg( - *body_function_body, original_arg_count, i, - lifted_arg_nodes_and_outside_compilation_nodes, arg_node); + *body_function_body, original_arg_count, i, lifted_arg_nodes, arg_node); } FunctionDef rewritten_body_function_def; @@ -682,6 +693,219 @@ Status PostprocessLiftedArgsForWhile( return Status::OK(); } +Status PostprocessLiftedArgsForIf( + const std::unordered_map& outside_compilation_attr_to_node, + Graph* g, Node* n, FunctionLibraryDefinition* fld) { + TF_RET_CHECK(n->IsIfNode()); + + NameAttrList then_branch_func; + TF_RETURN_IF_ERROR(GetNodeAttr(n->def(), "then_branch", &then_branch_func)); + const FunctionDef* then_branch_function_def = + fld->Find(then_branch_func.name()); + TF_RET_CHECK(then_branch_function_def); + + NameAttrList else_branch_func; + TF_RETURN_IF_ERROR(GetNodeAttr(n->def(), "else_branch", &else_branch_func)); + const FunctionDef* else_branch_function_def = + fld->Find(else_branch_func.name()); + TF_RET_CHECK(else_branch_function_def); + + // Nothing to do if neither branch contains any lifted arguments. + if (!HasLiftedArgs(*then_branch_function_def) && + !HasLiftedArgs(*else_branch_function_def)) { + return Status::OK(); + } + + std::unique_ptr then_branch_function_body; + TF_RETURN_IF_ERROR(FunctionDefToBodyHelper( + *then_branch_function_def, AttrSlice(&then_branch_func.attr()), fld, + &then_branch_function_body)); + + std::unique_ptr else_branch_function_body; + TF_RETURN_IF_ERROR(FunctionDefToBodyHelper( + *else_branch_function_def, AttrSlice(&else_branch_func.attr()), fld, + &else_branch_function_body)); + + // Then and else branches have same argument count and argument data types. + int original_arg_count = then_branch_function_body->arg_nodes.size(); + + TF_ASSIGN_OR_RETURN( + auto then_branch_lifted_arg_nodes_and_outside_compilation_nodes, + LiftedArgsAndOutsideCompilationNodesInFunctionBody( + *then_branch_function_body, outside_compilation_attr_to_node)); + + TF_ASSIGN_OR_RETURN( + auto else_branch_lifted_arg_nodes_and_outside_compilation_nodes, + LiftedArgsAndOutsideCompilationNodesInFunctionBody( + *else_branch_function_body, outside_compilation_attr_to_node)); + + // Merge lifted args from then and else branches. + std::vector outside_compilation_nodes; + std::vector then_branch_lifted_arg_nodes; + for (const auto& pair : + then_branch_lifted_arg_nodes_and_outside_compilation_nodes) { + outside_compilation_nodes.push_back(pair.second); + then_branch_lifted_arg_nodes.push_back(pair.first); + } + for (const auto& pair : + else_branch_lifted_arg_nodes_and_outside_compilation_nodes) { + if (std::find(outside_compilation_nodes.begin(), + outside_compilation_nodes.end(), + pair.second) == outside_compilation_nodes.end()) { + outside_compilation_nodes.push_back(pair.second); + // Then branch does not contain this lifted arg. Add an empty item to + // then_branch_lifted_arg_nodes. 
+ then_branch_lifted_arg_nodes.push_back(nullptr); + } + } + // Reorder else_branch_lifted_arg_nodes_and_outside_compilation_nodes. + std::vector else_branch_lifted_arg_nodes( + outside_compilation_nodes.size()); + for (const auto& pair : + else_branch_lifted_arg_nodes_and_outside_compilation_nodes) { + auto iter = std::find(outside_compilation_nodes.begin(), + outside_compilation_nodes.end(), pair.second); + TF_RET_CHECK(iter != outside_compilation_nodes.end()); + int index = iter - outside_compilation_nodes.begin(); + else_branch_lifted_arg_nodes[index] = pair.first; + } + + // Append lifted args' types to If node's Tin attribute. + std::vector data_types; + TF_RETURN_IF_ERROR(GetNodeAttr(n->def(), "Tin", &data_types)); + for (Node* n : outside_compilation_nodes) { + data_types.push_back(n->output_type(0)); + } + n->ClearAttr("Tin"); + n->AddAttr("Tin", data_types); + + // Add edges from outside compilation nodes to If node. If node's input #0 + // is predicate input, input #1 maps to _Arg #0 of branch functions, thus + // arg_to_input_edge_offset is set to 1. + AddEdgesFromOutsideCompilationNodes(original_arg_count, + /*arg_to_input_edge_offset=*/1, + data_types, outside_compilation_nodes, g, + n); + + for (int i = original_arg_count; i < data_types.size(); ++i) { + TF_ASSIGN_OR_RETURN(Node * then_branch_arg_node, + AddOutsideCompilationInputArgToFunctionBody( + *then_branch_function_body, i, data_types[i])); + + ReplaceLiftedArgNodePlaceholderWithArg( + *then_branch_function_body, original_arg_count, i, + then_branch_lifted_arg_nodes, then_branch_arg_node); + + TF_ASSIGN_OR_RETURN(Node * else_branch_arg_node, + AddOutsideCompilationInputArgToFunctionBody( + *else_branch_function_body, i, data_types[i])); + + ReplaceLiftedArgNodePlaceholderWithArg( + *else_branch_function_body, original_arg_count, i, + else_branch_lifted_arg_nodes, else_branch_arg_node); + } + + FunctionDef rewritten_then_branch_function_def; + TF_RETURN_IF_ERROR(GraphToFunctionDef( + *then_branch_function_body->graph, then_branch_func.name(), + HostGraphControlRetMapping, &rewritten_then_branch_function_def)); + TF_RETURN_IF_ERROR(fld->ReplaceFunction(then_branch_func.name(), + rewritten_then_branch_function_def)); + + FunctionDef rewritten_else_branch_function_def; + TF_RETURN_IF_ERROR(GraphToFunctionDef( + *else_branch_function_body->graph, else_branch_func.name(), + HostGraphControlRetMapping, &rewritten_else_branch_function_def)); + TF_RETURN_IF_ERROR(fld->ReplaceFunction(else_branch_func.name(), + rewritten_else_branch_function_def)); + return Status::OK(); +} + +Status PostprocessLiftedArgsForCall( + const std::unordered_map& outside_compilation_attr_to_node, + Graph* g, Node* n, FunctionLibraryDefinition* fld) { + const FunctionDef* fdef = fld->Find(n->type_string()); + TF_RET_CHECK(fdef); + + // Nothing to do if the function does not contain any lifted arguments. + if (!HasLiftedArgs(*fdef)) { + return Status::OK(); + } + + std::unique_ptr fbody; + TF_RETURN_IF_ERROR(FunctionDefToBodyHelper(*fdef, n->attrs(), fld, &fbody)); + + int original_arg_count = fbody->arg_nodes.size(); + + TF_ASSIGN_OR_RETURN(auto lifted_arg_nodes_and_outside_compilation_nodes, + LiftedArgsAndOutsideCompilationNodesInFunctionBody( + *fbody, outside_compilation_attr_to_node)); + + // Append lifted args' types to call node's input data types. 
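+  // Each appended type is read off the outside compilation node itself:
+  // Identity nodes carry it in their "T" attribute and Placeholder nodes in
+  // "dtype", as checked below.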
+ std::vector data_types(n->input_types().begin(), + n->input_types().end()); + for (auto pair : lifted_arg_nodes_and_outside_compilation_nodes) { + Node* outside_compilation_node = pair.second; + DataType data_type; + TF_RET_CHECK(outside_compilation_node->IsIdentity() || + outside_compilation_node->type_string() == "Placeholder"); + if (outside_compilation_node->IsIdentity()) { + TF_RETURN_IF_ERROR( + GetNodeAttr(outside_compilation_node->def(), "T", &data_type)); + } else { + TF_RETURN_IF_ERROR( + GetNodeAttr(outside_compilation_node->def(), "dtype", &data_type)); + } + data_types.push_back(data_type); + } + + std::vector lifted_arg_nodes; + std::transform( + lifted_arg_nodes_and_outside_compilation_nodes.begin(), + lifted_arg_nodes_and_outside_compilation_nodes.end(), + std::back_inserter(lifted_arg_nodes), + [](const std::pair& pair) { return pair.first; }); + for (int i = original_arg_count; i < data_types.size(); ++i) { + TF_ASSIGN_OR_RETURN( + Node * arg_node, + AddOutsideCompilationInputArgToFunctionBody(*fbody, i, data_types[i])); + + ReplaceLiftedArgNodePlaceholderWithArg(*fbody, original_arg_count, i, + lifted_arg_nodes, arg_node); + } + + FunctionDef rewritten_fdef; + TF_RETURN_IF_ERROR(GraphToFunctionDef(*fbody->graph, n->type_string(), + HostGraphControlRetMapping, + &rewritten_fdef)); + TF_RETURN_IF_ERROR(fld->ReplaceFunction(n->type_string(), rewritten_fdef)); + + // We need to recreate the node. Otherwise TF will not know n->num_inputs() + // has increased. + NodeDef node_def = n->def(); + for (int i = original_arg_count; i < data_types.size(); i++) { + Node* outside_compilation_node = + lifted_arg_nodes_and_outside_compilation_nodes[i - original_arg_count] + .second; + node_def.add_input(absl::StrCat(outside_compilation_node->name(), ":", 0)); + } + TF_ASSIGN_OR_RETURN(n, ReplaceNode(g, n, node_def)); + + // Add edges from outside compilation nodes to call node. + std::vector outside_compilation_nodes; + std::transform( + lifted_arg_nodes_and_outside_compilation_nodes.begin(), + lifted_arg_nodes_and_outside_compilation_nodes.end(), + std::back_inserter(outside_compilation_nodes), + [](const std::pair& pair) { return pair.second; }); + AddEdgesFromOutsideCompilationNodes(original_arg_count, + /*arg_to_input_edge_offset=*/0, + data_types, outside_compilation_nodes, g, + n); + + return Status::OK(); +} + // Creates a mapping from outside compilation cluster name to lifted argument // placeholder. 
xla::StatusOr> OutsideCompilationAttrToNode( @@ -690,10 +914,9 @@ xla::StatusOr> OutsideCompilationAttrToNode( for (Node* n : g.op_nodes()) { bool is_lifted_arg; string outside_compilation_attr; - if (GetNodeAttr(n->def(), kXlaIsLiftedArgAttrName, &is_lifted_arg).ok() && - GetNodeAttr(n->def(), "_xla_outside_compilation", - &outside_compilation_attr) - .ok()) { + if (TryGetNodeAttr(n->def(), kXlaIsLiftedArgAttrName, &is_lifted_arg) && + TryGetNodeAttr(n->def(), "_xla_outside_compilation", + &outside_compilation_attr)) { TF_RET_CHECK(is_lifted_arg); TF_RET_CHECK(n->IsIdentity() || n->type_string() == "Placeholder"); outside_compilation_attr_to_node[outside_compilation_attr] = n; @@ -707,15 +930,34 @@ Status PostprocessLiftedArgs(Graph* g, FunctionLibraryDefinition* fld) { TF_ASSIGN_OR_RETURN(auto outside_compilation_attr_to_node, OutsideCompilationAttrToNode(*g)); + std::vector call_nodes; for (Node* n : g->op_nodes()) { if (!HasNodeAttr(n->def(), kXlaHasHostTransferAttrName)) { continue; } - if (n->type_string() == "While") { + if (n->IsWhileNode()) { TF_RETURN_IF_ERROR(PostprocessLiftedArgsForWhile( outside_compilation_attr_to_node, g, n, fld)); } + + if (n->IsIfNode()) { + TF_RETURN_IF_ERROR(PostprocessLiftedArgsForIf( + outside_compilation_attr_to_node, g, n, fld)); + } + + // Outside compilation host side function call will always be direct + // function call nodes. + // Function call nodes need to be handled separately because we rewrite + // nodes in `PostprocessLiftedArgsForCall`. + if (fld->Contains(n->type_string())) { + call_nodes.push_back(n); + } + } + + for (Node* n : call_nodes) { + TF_RETURN_IF_ERROR(PostprocessLiftedArgsForCall( + outside_compilation_attr_to_node, g, n, fld)); } return Status::OK(); @@ -1065,9 +1307,9 @@ Status RewriteShapeInferenceGraph(const string& shape_inference_graph_name, } // Builds XlaSendToHost node which sends cond predicate to host. -xla::StatusOr BuildSendIfPredNode(const string& name, - const string& host_transfer_key, - Node* pred_node, Graph* g) { +TF_ATTRIBUTE_NOINLINE xla::StatusOr BuildSendIfPredNode( + const string& name, const string& host_transfer_key, Node* pred_node, + Graph* g) { NodeDefBuilder send_pred_builder(name, "XlaSendToHost"); send_pred_builder.Attr("Tinput", DT_BOOL); send_pred_builder.Attr("key", absl::StrCat(host_transfer_key, "_dtoh_0")); @@ -1130,15 +1372,13 @@ Status ReplaceKeyPlaceholderWithArgNode(const string& xla_cluster_name, } // Builds host side graph for If node. -Status BuildHostGraphForIfNode(const string& xla_cluster_attr_name, - const string& outside_compilation_attr_name, - const string& xla_cluster_name, - const string& if_node_name, - const string& host_transfer_key, - const string& host_graph_func_name, - FunctionLibraryDefinition* fld, - const string& then_branch_host_func_name, - const string& else_branch_host_func_name) { +TF_ATTRIBUTE_NOINLINE Status BuildHostGraphForIfNode( + const string& xla_cluster_attr_name, + const string& outside_compilation_attr_name, const string& xla_cluster_name, + const string& if_node_name, const string& host_transfer_key, + const string& host_graph_func_name, FunctionLibraryDefinition* fld, + const string& then_branch_host_func_name, + const string& else_branch_host_func_name) { Graph host_graph(fld); string outside_compilation_name = absl::StrCat("oc_if_", if_node_name); AttrValue device_ordinal_value; @@ -1215,10 +1455,9 @@ Status BuildHostGraphForIfNode(const string& xla_cluster_attr_name, } // Rewrites loop cond to add a node which sends loop cond to host. 
-Status AddSendLoopPredToLoopCond(FunctionLibraryDefinition* fld, - const NameAttrList& loop_cond_func, - const string& while_node_name, - const string& host_transfer_key) { +TF_ATTRIBUTE_NOINLINE Status AddSendLoopPredToLoopCond( + FunctionLibraryDefinition* fld, const NameAttrList& loop_cond_func, + const string& while_node_name, const string& host_transfer_key) { // Instantiate the loop cond function. std::unique_ptr fbody; TF_RETURN_IF_ERROR(FunctionDefToBodyHelper(*fld->Find(loop_cond_func.name()), @@ -1406,7 +1645,7 @@ Status RewriteHostWhileLoopBody( } // Builds host side graph for while node. -Status BuildHostGraphForWhileNode( +TF_ATTRIBUTE_NOINLINE Status BuildHostGraphForWhileNode( const string& xla_cluster_attr_name, const string& outside_compilation_attr_name, const string& xla_cluster_name, const string& while_node_name, const string& host_transfer_key, @@ -1503,10 +1742,6 @@ Status BuildHostGraphForFuncCallNode( call_builder.Attr(kXlaHasHostTransferAttrName, true); call_builder.Attr(xla_cluster_attr_name, xla_cluster_name); call_builder.Attr(outside_compilation_attr_name, call_builder.node_name()); - // Make sure control outputs of this function call node will be respected when - // this node is lowered. - call_builder.Attr(LowerFunctionalOpsPass::kLowerAsMultiDeviceFunctionAttr, - true); NodeDef call_def; TF_RETURN_IF_ERROR(call_builder.Finalize(&call_def)); Status s; @@ -1529,6 +1764,221 @@ Status BuildHostGraphForFuncCallNode( return Status::OK(); } +TF_ATTRIBUTE_NOINLINE Status ExtractOutsideCompilationForFuncCallNode( + const string& xla_cluster_attr_name, + const string& outside_compilation_attr_name, const string& xla_cluster_name, + const std::map& host_compute_core, Graph* g, Node* n, + FunctionLibraryRuntime* flr, FunctionLibraryDefinition* fld, + std::vector* host_graphs, + std::vector* shape_inference_graphs, + bool* has_outside_compilation) { + bool func_has_outside_compilation = false; + NameAttrList func; + if (fld->Contains(n->type_string())) { + func.set_name(n->type_string()); + typedef protobuf::Map AttrMap; + *func.mutable_attr() = AttrMap(n->attrs().begin(), n->attrs().end()); + } else if (n->IsPartitionedCall()) { + TF_RETURN_IF_ERROR(GetNodeAttr(n->def(), "f", &func)); + } else { + TF_RET_CHECK(n->type_string() == FunctionLibraryDefinition::kGradientOp); + func.set_name(FunctionLibraryDefinition::kGradientOp); + *func.mutable_attr() = n->def().attr(); + } + string new_func_name = absl::StrCat(n->name(), "_oc"); + string host_func_name = absl::StrCat("oc_func_call_host_", n->name()); + TF_RETURN_IF_ERROR(ExtractOutsideCompilationForFunction( + xla_cluster_attr_name, outside_compilation_attr_name, xla_cluster_name, + func, new_func_name, host_func_name, host_compute_core, flr, fld, + shape_inference_graphs, &func_has_outside_compilation)); + + // If the function call does not have outside compilation, nothing to do. + if (!func_has_outside_compilation) { + return Status::OK(); + } + + *has_outside_compilation = true; + + // Change `n` to call the new function directly. 
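+  // Inputs are first collected into a vector indexed by dst_input() and only
+  // then fed to the builder, so the rebuilt node keeps the original input
+  // order even though in_edges() does not necessarily yield edges in that
+  // order.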
+ auto replace_builder = + absl::make_unique(n->name(), new_func_name, fld); + std::vector inputs(n->num_inputs()); + for (const Edge* e : n->in_edges()) { + if (e->IsControlEdge()) { + continue; + } + + TF_RET_CHECK(e->dst_input() >= 0 && e->dst_input() < inputs.size()); + inputs[e->dst_input()] = + NodeDefBuilder::NodeOut{e->src()->name(), e->src_output(), + e->src()->output_type(e->src_output())}; + } + for (const auto& input : inputs) { + replace_builder->Input(input); + } + for (const auto& attr : n->attrs()) { + replace_builder->Attr(attr.first, attr.second); + } + auto replace_def = absl::make_unique(); + TF_RETURN_IF_ERROR(replace_builder->Finalize(replace_def.get())); + TF_ASSIGN_OR_RETURN(Node * replace, ReplaceNode(g, n, *replace_def)); + replace->AddAttr(kXlaTokenInputNodesAttrName, + std::vector{kXlaTokenArgNodeName}); + + // Build host side graph for the function call. + string oc_host_graph_name = + absl::StrCat("oc_func_host_graph_", replace->name()); + TF_RETURN_IF_ERROR(BuildHostGraphForFuncCallNode( + xla_cluster_attr_name, xla_cluster_name, outside_compilation_attr_name, + replace->name(), host_func_name, oc_host_graph_name, fld)); + + // Record the host graph. + host_graphs->push_back(oc_host_graph_name); + + return Status::OK(); +} + +Status ExtractOutsideCompilationForIfNode( + const string& xla_cluster_attr_name, + const string& outside_compilation_attr_name, const string& xla_cluster_name, + const std::map& host_compute_core, Graph* g, Node* n, + FunctionLibraryRuntime* flr, FunctionLibraryDefinition* fld, + std::vector* host_graphs, + std::vector* shape_inference_graphs, + bool* has_outside_compilation) { + // Instantiate "then_branch" and "else_branch". + NameAttrList then_branch, else_branch; + TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "then_branch", &then_branch)); + TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "else_branch", &else_branch)); + + // Extract outside compilation for then_branch and else_branch. + bool then_branch_has_outside_compilation = false; + bool else_branch_has_outside_compilation = false; + string then_branch_host_func_name = + absl::StrCat("oc_then_branch_host_if_", n->name()), + else_branch_host_func_name = + absl::StrCat("oc_else_branch_host_if_", n->name()); + string then_branch_xla_func_name = absl::StrCat(then_branch.name(), "_oc"), + else_branch_xla_func_name = absl::StrCat(else_branch.name(), "_oc"); + TF_RETURN_IF_ERROR(ExtractOutsideCompilationForFunction( + xla_cluster_attr_name, outside_compilation_attr_name, xla_cluster_name, + then_branch, then_branch_xla_func_name, then_branch_host_func_name, + host_compute_core, flr, fld, shape_inference_graphs, + &then_branch_has_outside_compilation)); + TF_RETURN_IF_ERROR(ExtractOutsideCompilationForFunction( + xla_cluster_attr_name, outside_compilation_attr_name, xla_cluster_name, + else_branch, else_branch_xla_func_name, else_branch_host_func_name, + host_compute_core, flr, fld, shape_inference_graphs, + &else_branch_has_outside_compilation)); + + // If then/else branch do not have outside compilation, nothing to do. + if (!then_branch_has_outside_compilation && + !else_branch_has_outside_compilation) { + return Status::OK(); + } + + *has_outside_compilation = true; + + // Change If node to call the new functions. 
+ then_branch.set_name(then_branch_xla_func_name); + n->ClearAttr("then_branch"); + n->AddAttr("then_branch", then_branch); + else_branch.set_name(else_branch_xla_func_name); + n->ClearAttr("else_branch"); + n->AddAttr("else_branch", else_branch); + + string host_transfer_key = absl::StrCat("oc_if_pred_", n->name()); + + // XLA computation: add a SendToHost node to send cond predicate. + Node* pred_node; + TF_RETURN_IF_ERROR(n->input_node(0, &pred_node)); + TF_ASSIGN_OR_RETURN( + Node * send_pred_node, + BuildSendIfPredNode(absl::StrCat("send_oc_if_pred_", n->name()), + host_transfer_key, pred_node, g)); + n->AddAttr(kXlaTokenInputNodesAttrName, + std::vector{send_pred_node->name()}); + + // Add a control edge from `send_pred_node` to If node, so XlaCompiler will + // visit If node after `send_pred_node`, thus the token output for + // `send_pred_node` has been generated. + g->AddControlEdge(send_pred_node, n); + + // Build host side graph for the "If" node. + string oc_host_graph_name = absl::StrCat("oc_if_host_graph_", n->name()); + TF_RETURN_IF_ERROR(BuildHostGraphForIfNode( + xla_cluster_attr_name, outside_compilation_attr_name, xla_cluster_name, + n->name(), host_transfer_key, oc_host_graph_name, fld, + then_branch_host_func_name, else_branch_host_func_name)); + host_graphs->push_back(oc_host_graph_name); + + return Status::OK(); +} + +Status ExtractOutsideCompilationForWhileNode( + const string& xla_cluster_attr_name, + const string& outside_compilation_attr_name, const string& xla_cluster_name, + const std::map& host_compute_core, Graph* g, Node* n, + FunctionLibraryRuntime* flr, FunctionLibraryDefinition* fld, + std::vector* host_graphs, + std::vector* shape_inference_graphs, + bool* has_outside_compilation) { + // Instantiate "cond" and "body". + NameAttrList cond, body; + TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "cond", &cond)); + TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "body", &body)); + + // Extract outside compilation for cond and body. + bool cond_has_outside_compilation = false; + bool body_has_outside_compilation = false; + string cond_host_func_name = absl::StrCat("oc_cond_host_while_", n->name()), + body_host_func_name = absl::StrCat("oc_body_host_while_", n->name()); + string cond_xla_func_name = absl::StrCat(cond.name(), "_oc"), + body_xla_func_name = absl::StrCat(body.name(), "_oc"); + TF_RETURN_IF_ERROR(ExtractOutsideCompilationForFunction( + xla_cluster_attr_name, outside_compilation_attr_name, xla_cluster_name, + cond, cond_xla_func_name, cond_host_func_name, host_compute_core, flr, + fld, shape_inference_graphs, &cond_has_outside_compilation)); + TF_RETURN_IF_ERROR(ExtractOutsideCompilationForFunction( + xla_cluster_attr_name, outside_compilation_attr_name, xla_cluster_name, + body, body_xla_func_name, body_host_func_name, host_compute_core, flr, + fld, shape_inference_graphs, &body_has_outside_compilation)); + + // If cond/body do not have outside compilation, nothing to do. + if (!cond_has_outside_compilation && !body_has_outside_compilation) { + return Status::OK(); + } + + *has_outside_compilation = true; + + // Change While node to call the new functions. + cond.set_name(cond_xla_func_name); + n->ClearAttr("cond"); + n->AddAttr("cond", cond); + body.set_name(body_xla_func_name); + n->ClearAttr("body"); + n->AddAttr("body", body); + + string host_transfer_key = absl::StrCat("oc_while_pred_", n->name()); + + // XLA computation: rewrite cond function to add a SendToHost node to send + // loop predicate. 
+  TF_RETURN_IF_ERROR(
+      AddSendLoopPredToLoopCond(fld, cond, n->name(), host_transfer_key));
+  n->AddAttr(kXlaTokenInputNodesAttrName,
+             std::vector<string>{kXlaTokenArgNodeName});
+
+  // Build host side graph for the "While" node.
+  string oc_host_graph_name = absl::StrCat("oc_while_host_graph_", n->name());
+  TF_RETURN_IF_ERROR(BuildHostGraphForWhileNode(
+      xla_cluster_attr_name, outside_compilation_attr_name, xla_cluster_name,
+      n->name(), host_transfer_key, oc_host_graph_name, fld,
+      cond_host_func_name, body_host_func_name));
+  host_graphs->push_back(oc_host_graph_name);
+
+  return Status::OK();
+}
+
 Status ExtractOutsideCompilationForNodesWithAssociatedFunctions(
     Graph* g, const string& xla_cluster_attr_name,
     const string& outside_compilation_attr_name, const string& xla_cluster_name,
@@ -1540,193 +1990,32 @@ Status ExtractOutsideCompilationForNodesWithAssociatedFunctions(
   for (Node* n : g->nodes()) {
     if (n->IsIfNode()) {
       if_nodes.push_back(n);
-    } else if (n->type_string() == "While") {
+    } else if (n->IsWhileNode()) {
       while_nodes.push_back(n);
-    } else if (fld->Contains(n->type_string())) {
+    } else if (IsFunctionCall(*fld, *n)) {
       func_call_nodes.push_back(n);
-    } else if (n->type_string() == FunctionLibraryDefinition::kGradientOp) {
-      // Only gradient for user-defined function should be considered as
-      // function call node.
-      NameAttrList original_func;
-      TF_RETURN_IF_ERROR(GetNodeAttr(
-          n->def(), FunctionLibraryDefinition::kFuncAttr, &original_func));
-      if (fld->Contains(original_func.name())) {
-        func_call_nodes.push_back(n);
-      }
     }
   }
 
   for (Node* n : func_call_nodes) {
-    // Extract outside compilation for the function call.
-    bool func_has_outside_compilation = false;
-    NameAttrList func;
-    func.set_name(n->type_string());
-    typedef protobuf::Map<string, AttrValue> AttrMap;
-    *func.mutable_attr() = AttrMap(n->attrs().begin(), n->attrs().end());
-    string new_func_name = absl::StrCat(n->name(), "_oc");
-    string host_func_name = absl::StrCat("oc_func_call_host_", n->name());
-    TF_RETURN_IF_ERROR(ExtractOutsideCompilationForFunction(
+    TF_RETURN_IF_ERROR(ExtractOutsideCompilationForFuncCallNode(
         xla_cluster_attr_name, outside_compilation_attr_name, xla_cluster_name,
-        func, new_func_name, host_func_name, host_compute_core, flr, fld,
-        shape_inference_graphs, &func_has_outside_compilation));
-
-    // If the function call does not have outside compilation, nothing to do.
-    if (!func_has_outside_compilation) {
-      continue;
-    }
-
-    *has_outside_compilation = true;
-
-    // Change `n` to call the new function directly.
-    NodeDefBuilder replace_builder(n->name(), new_func_name, fld);
-    for (const Edge* e : n->in_edges()) {
-      if (e->IsControlEdge()) {
-        continue;
-      }
-      replace_builder.Input(e->src()->name(), e->src_output(),
-                            e->src()->output_type(e->src_output()));
-    }
-    for (const auto& attr : n->attrs()) {
-      replace_builder.Attr(attr.first, attr.second);
-    }
-    NodeDef replace_def;
-    TF_RETURN_IF_ERROR(replace_builder.Finalize(&replace_def));
-    TF_ASSIGN_OR_RETURN(Node * replace, ReplaceNode(g, n, replace_def));
-    replace->AddAttr(kXlaTokenInputNodesAttrName,
-                     std::vector<string>{kXlaTokenArgNodeName});
-
-    // Build host side graph for the function call.
-    string oc_host_graph_name =
-        absl::StrCat("oc_func_host_graph_", replace->name());
-    TF_RETURN_IF_ERROR(BuildHostGraphForFuncCallNode(
-        xla_cluster_attr_name, xla_cluster_name, outside_compilation_attr_name,
-        replace->name(), host_func_name, oc_host_graph_name, fld));
-
-    // Record the host graph.
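// [Editor's note] Illustrative only, not part of the patch: the token-input
// attribute set by these helpers is a plain list-of-strings attr, so a later
// pass can recover the ordering constraint with an ordinary attribute read.
std::vector<string> token_input_nodes;
TF_CHECK_OK(
    GetNodeAttr(n->attrs(), kXlaTokenInputNodesAttrName, &token_input_nodes));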
- host_graphs->push_back(oc_host_graph_name); + host_compute_core, g, n, flr, fld, host_graphs, shape_inference_graphs, + has_outside_compilation)); } for (Node* n : if_nodes) { - // Instantiate "then_branch" and "else_branch". - NameAttrList then_branch, else_branch; - TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "then_branch", &then_branch)); - TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "else_branch", &else_branch)); - - // Extract outside compilation for then_branch and else_branch. - bool then_branch_has_outside_compilation = false; - bool else_branch_has_outside_compilation = false; - string then_branch_host_func_name = - absl::StrCat("oc_then_branch_host_if_", n->name()), - else_branch_host_func_name = - absl::StrCat("oc_else_branch_host_if_", n->name()); - string then_branch_xla_func_name = absl::StrCat(then_branch.name(), "_oc"), - else_branch_xla_func_name = absl::StrCat(else_branch.name(), "_oc"); - TF_RETURN_IF_ERROR(ExtractOutsideCompilationForFunction( + TF_RETURN_IF_ERROR(ExtractOutsideCompilationForIfNode( xla_cluster_attr_name, outside_compilation_attr_name, xla_cluster_name, - then_branch, then_branch_xla_func_name, then_branch_host_func_name, - host_compute_core, flr, fld, shape_inference_graphs, - &then_branch_has_outside_compilation)); - TF_RETURN_IF_ERROR(ExtractOutsideCompilationForFunction( - xla_cluster_attr_name, outside_compilation_attr_name, xla_cluster_name, - else_branch, else_branch_xla_func_name, else_branch_host_func_name, - host_compute_core, flr, fld, shape_inference_graphs, - &else_branch_has_outside_compilation)); - - // If then/else branch do not have outside compilation, nothing to do. - if (!then_branch_has_outside_compilation && - !else_branch_has_outside_compilation) { - continue; - } - - *has_outside_compilation = true; - - // Change If node to call the new functions. - then_branch.set_name(then_branch_xla_func_name); - n->ClearAttr("then_branch"); - n->AddAttr("then_branch", then_branch); - else_branch.set_name(else_branch_xla_func_name); - n->ClearAttr("else_branch"); - n->AddAttr("else_branch", else_branch); - - string host_transfer_key = absl::StrCat("oc_if_pred_", n->name()); - - // XLA computation: add a SendToHost node to send cond predicate. - Node* pred_node; - TF_RETURN_IF_ERROR(n->input_node(0, &pred_node)); - TF_ASSIGN_OR_RETURN( - Node * send_pred_node, - BuildSendIfPredNode(absl::StrCat("send_oc_if_pred_", n->name()), - host_transfer_key, pred_node, g)); - n->AddAttr(kXlaTokenInputNodesAttrName, - std::vector{send_pred_node->name()}); - - // Add a control edge from `send_pred_node` to If node, so XlaCompiler will - // visit If node after `send_pred_node`, thus the token output for - // `send_pred_node` has been generated. - g->AddControlEdge(send_pred_node, n); - - // Build host side graph for the "If" node. - string oc_host_graph_name = absl::StrCat("oc_if_host_graph_", n->name()); - TF_RETURN_IF_ERROR(BuildHostGraphForIfNode( - xla_cluster_attr_name, outside_compilation_attr_name, xla_cluster_name, - n->name(), host_transfer_key, oc_host_graph_name, fld, - then_branch_host_func_name, else_branch_host_func_name)); - host_graphs->push_back(oc_host_graph_name); + host_compute_core, g, n, flr, fld, host_graphs, shape_inference_graphs, + has_outside_compilation)); } for (Node* n : while_nodes) { - // Instantiate "cond" and "body". - NameAttrList cond, body; - TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "cond", &cond)); - TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "body", &body)); - - // Extract outside compilation for cond and body. 
- bool cond_has_outside_compilation = false; - bool body_has_outside_compilation = false; - string cond_host_func_name = absl::StrCat("oc_cond_host_while_", n->name()), - body_host_func_name = absl::StrCat("oc_body_host_while_", n->name()); - string cond_xla_func_name = absl::StrCat(cond.name(), "_oc"), - body_xla_func_name = absl::StrCat(body.name(), "_oc"); - TF_RETURN_IF_ERROR(ExtractOutsideCompilationForFunction( + TF_RETURN_IF_ERROR(ExtractOutsideCompilationForWhileNode( xla_cluster_attr_name, outside_compilation_attr_name, xla_cluster_name, - cond, cond_xla_func_name, cond_host_func_name, host_compute_core, flr, - fld, shape_inference_graphs, &cond_has_outside_compilation)); - TF_RETURN_IF_ERROR(ExtractOutsideCompilationForFunction( - xla_cluster_attr_name, outside_compilation_attr_name, xla_cluster_name, - body, body_xla_func_name, body_host_func_name, host_compute_core, flr, - fld, shape_inference_graphs, &body_has_outside_compilation)); - - // If cond/body do not have outside compilation, nothing to do. - if (!cond_has_outside_compilation && !body_has_outside_compilation) { - continue; - } - - *has_outside_compilation = true; - - // Change While node to call the new functions. - cond.set_name(cond_xla_func_name); - n->ClearAttr("cond"); - n->AddAttr("cond", cond); - body.set_name(body_xla_func_name); - n->ClearAttr("body"); - n->AddAttr("body", body); - - string host_transfer_key = absl::StrCat("oc_while_pred_", n->name()); - - // XLA computation: rewrite cond function to add a SendToHost node to send - // loop predicate. - TF_RETURN_IF_ERROR( - AddSendLoopPredToLoopCond(fld, cond, n->name(), host_transfer_key)); - n->AddAttr(kXlaTokenInputNodesAttrName, - std::vector{kXlaTokenArgNodeName}); - - // Build host side graph for the "While" node. - string oc_host_graph_name = absl::StrCat("oc_while_host_graph_", n->name()); - TF_RETURN_IF_ERROR(BuildHostGraphForWhileNode( - xla_cluster_attr_name, outside_compilation_attr_name, xla_cluster_name, - n->name(), host_transfer_key, oc_host_graph_name, fld, - cond_host_func_name, body_host_func_name)); - host_graphs->push_back(oc_host_graph_name); + host_compute_core, g, n, flr, fld, host_graphs, shape_inference_graphs, + has_outside_compilation)); } return Status::OK(); @@ -1889,11 +2178,11 @@ Status ExtractOutsideCompilationForFunction( // Encapsulate outside_compilation cluster into function call node. std::unique_ptr graph_out; - RewriteOutsideCompilationSubgraphFn rewrite_fn( + auto rewrite_fn = absl::make_unique( xla_cluster_attr_name, outside_compilation_attr_name, xla_cluster_name, new_func_name); TF_RETURN_IF_ERROR(EncapsulateSubgraphsInFunctions( - outside_compilation_attr_name, *fbody->graph, rewrite_fn, + outside_compilation_attr_name, *fbody->graph, *rewrite_fn, /*reuse_existing_functions=*/true, &graph_out, fld)); // Replace outside_compilation function nodes with HostCompute ops. @@ -1908,26 +2197,26 @@ Status ExtractOutsideCompilationForFunction( // If we could not infer shapes for XlaSendFromHost inputs statically, we // will set the "shape_inference_graph" attribute. In that case, copy // outside compilation subgraph as shape inference graph in `fld`. 
- NameAttrList shape_inference_graph; + auto shape_inference_graph = absl::make_unique(); TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "shape_inference_graph", - &shape_inference_graph)); - if (!shape_inference_graph.name().empty()) { - shape_inference_graphs->push_back(shape_inference_graph.name()); + shape_inference_graph.get())); + if (!shape_inference_graph->name().empty()) { + shape_inference_graphs->push_back(shape_inference_graph->name()); shape_inference_graphs_to_rewrite.push_back( - shape_inference_graph.name()); + shape_inference_graph->name()); const FunctionDef* xla_fdef = fld->Find(n->name()); if (!xla_fdef) { return errors::Internal("Cannot find XLA function ", n->name()); } - FunctionDef shape_inference_fdef = *xla_fdef; - shape_inference_fdef.mutable_signature()->set_name( - shape_inference_graph.name()); - if (fld->Find(shape_inference_graph.name())) { - TF_RETURN_IF_ERROR(fld->ReplaceFunction(shape_inference_graph.name(), - shape_inference_fdef)); + auto shape_inference_fdef = absl::make_unique(*xla_fdef); + shape_inference_fdef->mutable_signature()->set_name( + shape_inference_graph->name()); + if (fld->Find(shape_inference_graph->name())) { + TF_RETURN_IF_ERROR(fld->ReplaceFunction(shape_inference_graph->name(), + *shape_inference_fdef)); } else { - TF_RETURN_IF_ERROR(fld->AddFunctionDef(shape_inference_fdef)); + TF_RETURN_IF_ERROR(fld->AddFunctionDef(*shape_inference_fdef)); } } } @@ -1972,15 +2261,15 @@ Status ExtractOutsideCompilationForFunction( TF_RETURN_IF_ERROR( ConstructHostGraph(xla_cluster_name, outside_compilation_attr_name, outside_compilation_host_graphs, fld, &host_graph)); - FunctionDef host_graph_fdef; + auto host_graph_fdef = absl::make_unique(); TF_RETURN_IF_ERROR(GraphToFunctionDef(*host_graph, host_graph_func_name, HostGraphControlRetMapping, - &host_graph_fdef)); + host_graph_fdef.get())); if (fld->Find(host_graph_func_name)) { TF_RETURN_IF_ERROR( - fld->ReplaceFunction(host_graph_func_name, host_graph_fdef)); + fld->ReplaceFunction(host_graph_func_name, *host_graph_fdef)); } else { - TF_RETURN_IF_ERROR(fld->AddFunctionDef(host_graph_fdef)); + TF_RETURN_IF_ERROR(fld->AddFunctionDef(*host_graph_fdef)); } // Shape inference graphs might contain Placeholder nodes for outside @@ -1999,19 +2288,19 @@ Status ExtractOutsideCompilationForFunction( } // Replace original function. 
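// [Editor's note] A minimal sketch (not part of the patch) of the pattern this
// hunk and the ones above apply throughout the pass: large protos such as
// FunctionDef are moved off the stack and heap-allocated, then passed by
// pointer to builders and dereferenced where the existing by-value API is
// kept. Names (graph_out, new_func_name, fld) are taken from the surrounding
// code.
auto fdef = absl::make_unique<FunctionDef>();  // heap, not stack
TF_CHECK_OK(GraphToFunctionDef(*graph_out, new_func_name, fdef.get()));
TF_CHECK_OK(fld->AddFunctionDef(*fdef));       // dereference at the call site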
- FunctionDef updated_fdef; + auto updated_fdef = absl::make_unique(); TF_RETURN_IF_ERROR( - GraphToFunctionDef(*graph_out, new_func_name, &updated_fdef)); + GraphToFunctionDef(*graph_out, new_func_name, updated_fdef.get())); const FunctionDef* original_fdef = fld->Find(func_name); if (original_fdef) { for (const auto& attr : original_fdef->attr()) { - (*updated_fdef.mutable_attr())[attr.first] = attr.second; + (*updated_fdef->mutable_attr())[attr.first] = attr.second; } } if (fld->Find(new_func_name)) { - TF_RETURN_IF_ERROR(fld->ReplaceFunction(new_func_name, updated_fdef)); + TF_RETURN_IF_ERROR(fld->ReplaceFunction(new_func_name, *updated_fdef)); } else { - TF_RETURN_IF_ERROR(fld->AddFunctionDef(updated_fdef)); + TF_RETURN_IF_ERROR(fld->AddFunctionDef(*updated_fdef)); } if (VLOG_IS_ON(4)) { DumpGraphToFile( diff --git a/tensorflow/compiler/jit/flags.cc b/tensorflow/compiler/jit/flags.cc index f69a28b71b3..53f9b70c876 100644 --- a/tensorflow/compiler/jit/flags.cc +++ b/tensorflow/compiler/jit/flags.cc @@ -105,6 +105,8 @@ void AllocateAndParseFlags() { build_ops_flags = new BuildXlaOpsPassFlags; build_ops_flags->tf_xla_enable_lazy_compilation = true; build_ops_flags->tf_xla_print_cluster_outputs = false; + build_ops_flags->tf_xla_check_cluster_input_numerics = false; + build_ops_flags->tf_xla_check_cluster_output_numerics = false; build_ops_flags->tf_xla_disable_constant_folding = false; mark_for_compilation_flags = new MarkForCompilationPassFlags; @@ -144,6 +146,14 @@ void AllocateAndParseFlags() { &build_ops_flags->tf_xla_print_cluster_outputs, "If true then insert Print nodes to print out values produced by " "XLA clusters."), + Flag("tf_xla_check_cluster_input_numerics", + &build_ops_flags->tf_xla_check_cluster_input_numerics, + "If true then insert CheckNumerics nodes to to check all cluster " + "inputs."), + Flag("tf_xla_check_cluster_output_numerics", + &build_ops_flags->tf_xla_check_cluster_output_numerics, + "If true then insert CheckNumerics nodes to to check all cluster " + "outputs."), Flag("tf_xla_compile_on_demand", &device_flags->tf_xla_compile_on_demand, "Switch a device into 'on-demand' mode, where instead of " diff --git a/tensorflow/compiler/jit/flags.h b/tensorflow/compiler/jit/flags.h index 91e93f30d11..9307874133c 100644 --- a/tensorflow/compiler/jit/flags.h +++ b/tensorflow/compiler/jit/flags.h @@ -103,6 +103,14 @@ struct BuildXlaOpsPassFlags { // clusters. Useful for debugging. bool tf_xla_print_cluster_outputs; + // If true, insert CheckNumerics nodes for every floating point typed input to + // an XLA cluster. + bool tf_xla_check_cluster_input_numerics; + + // If true, insert CheckNumerics nodes for every floating point typed output + // from an XLA cluster. + bool tf_xla_check_cluster_output_numerics; + // Disables all constant folding. The primary use for this is for testing to // guarantee that tests are run on XLA and not on TF's CPU implementation. bool tf_xla_disable_constant_folding; diff --git a/tensorflow/compiler/jit/jit_compilation_pass_registration.cc b/tensorflow/compiler/jit/jit_compilation_pass_registration.cc index 127f0d4a82e..4773e8dc562 100644 --- a/tensorflow/compiler/jit/jit_compilation_pass_registration.cc +++ b/tensorflow/compiler/jit/jit_compilation_pass_registration.cc @@ -15,6 +15,7 @@ limitations under the License. 
#include "tensorflow/compiler/jit/build_xla_ops_pass.h" #include "tensorflow/compiler/jit/clone_constants_for_better_clustering.h" +#include "tensorflow/compiler/jit/cluster_scoping_pass.h" #include "tensorflow/compiler/jit/encapsulate_subgraphs_pass.h" #include "tensorflow/compiler/jit/encapsulate_xla_computations_pass.h" #include "tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.h" @@ -50,6 +51,9 @@ REGISTER_OPTIMIZATION(OptimizationPassRegistry::PRE_PLACEMENT, 25, REGISTER_OPTIMIZATION(OptimizationPassRegistry::POST_REWRITE_FOR_EXEC, 5, CloneConstantsForBetterClusteringPass); +REGISTER_OPTIMIZATION(OptimizationPassRegistry::POST_REWRITE_FOR_EXEC, 9, + ClusterScopingPass); + REGISTER_OPTIMIZATION(OptimizationPassRegistry::POST_REWRITE_FOR_EXEC, 10, MarkForCompilationPass); diff --git a/tensorflow/compiler/jit/kernels/BUILD b/tensorflow/compiler/jit/kernels/BUILD index 49b8731ca0b..e09dfd2b49c 100644 --- a/tensorflow/compiler/jit/kernels/BUILD +++ b/tensorflow/compiler/jit/kernels/BUILD @@ -5,33 +5,48 @@ package( licenses = ["notice"], # Apache 2.0 ) +XLA_OPS_DEPS = [ + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/memory", + "//tensorflow/compiler/jit:common", + "//tensorflow/compiler/jit:flags", + "//tensorflow/compiler/jit:xla_activity_listener", + "//tensorflow/compiler/jit:xla_activity_proto_cc", + "//tensorflow/compiler/jit:xla_compilation_cache", + "//tensorflow/compiler/jit:xla_device_no_jit_rewrite_registration", + "//tensorflow/compiler/jit:xla_launch_util", + "//tensorflow/compiler/tf2xla:common", + "//tensorflow/compiler/tf2xla:tf2xla_util", + "//tensorflow/compiler/tf2xla:xla_compiler", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla/client:client_library", + "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/service:compiler", + "//tensorflow/core:core_cpu_internal", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:state_ops_op_lib", + "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/profiler/lib:traceme", + "//tensorflow/stream_executor:tf_allocator_adapter", +] + +# Linked by tensorflow core, without registration of jit compilation passes. 
cc_library( - name = "xla_ops", + name = "xla_ops_no_jit_rewrite_registration", srcs = ["xla_ops.cc"], hdrs = ["xla_ops.h"], - deps = [ - "//tensorflow/compiler/jit:common", - "//tensorflow/compiler/jit:flags", - "//tensorflow/compiler/jit:xla_compilation_cache", - "//tensorflow/compiler/jit:xla_device", - "//tensorflow/compiler/jit:xla_launch_util", - "//tensorflow/compiler/tf2xla:common", - "//tensorflow/compiler/tf2xla:tf2xla_util", - "//tensorflow/compiler/tf2xla:xla_compiler", - "//tensorflow/compiler/xla:status_macros", - "//tensorflow/compiler/xla:statusor", - "//tensorflow/compiler/xla/client:client_library", - "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/service:compiler", - "//tensorflow/core:core_cpu_internal", - "//tensorflow/core:framework", - "//tensorflow/core:lib", - "//tensorflow/core:state_ops_op_lib", - "//tensorflow/core:stream_executor_no_cuda", - "//tensorflow/core/profiler/lib:traceme", - "//tensorflow/stream_executor:tf_allocator_adapter", - "@com_google_absl//absl/container:flat_hash_map", - "@com_google_absl//absl/memory", + deps = XLA_OPS_DEPS, + alwayslink = 1, +) + +cc_library( + name = "xla_ops", + hdrs = ["xla_ops.h"], + deps = XLA_OPS_DEPS + [ + ":xla_ops_no_jit_rewrite_registration", + "//tensorflow/compiler/jit:jit_compilation_passes", ], alwayslink = 1, ) diff --git a/tensorflow/compiler/jit/kernels/xla_ops.cc b/tensorflow/compiler/jit/kernels/xla_ops.cc index 788e90ffe99..fabd0374013 100644 --- a/tensorflow/compiler/jit/kernels/xla_ops.cc +++ b/tensorflow/compiler/jit/kernels/xla_ops.cc @@ -19,6 +19,7 @@ limitations under the License. #include "absl/memory/memory.h" #include "tensorflow/compiler/jit/defs.h" #include "tensorflow/compiler/jit/flags.h" +#include "tensorflow/compiler/jit/xla_activity_listener.h" #include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/tf2xla/tf2xla_util.h" #include "tensorflow/compiler/tf2xla/xla_compiler.h" @@ -62,8 +63,7 @@ XlaPlatformInfo PlatformInfoFromContext(OpKernelConstruction* ctx) { DeviceType device_type = ctx->device_type(); se::Platform::Id platform_id = nullptr; const XlaDevice::Metadata* xla_device_metadata = nullptr; - std::unique_ptr xla_allocator; - se::DeviceMemoryAllocator* device_allocator = nullptr; + se::DeviceMemoryAllocator* custom_allocator = nullptr; if (ctx->device_type() == DeviceType(DEVICE_CPU)) { platform_id = se::host::kHostPlatformId; @@ -83,23 +83,13 @@ XlaPlatformInfo PlatformInfoFromContext(OpKernelConstruction* ctx) { // (which xla_allocator above uses) as on an XlaDevice, this is a dummy // allocator that returns XlaTensor objects. The XlaCompiler needs a real // allocator to allocate real buffers. - platform_id = xla_device_metadata->platform()->id(); - device_allocator = + custom_allocator = xla_device_metadata->client()->backend().memory_allocator(); } - if (!device_allocator) { - xla::StatusOr maybe_platform = - se::MultiPlatformManager::PlatformWithId(platform_id); - OP_REQUIRES_OK_RETURN(ctx, XlaPlatformInfo(), maybe_platform.status()); - - xla_allocator = absl::make_unique( - maybe_platform.ValueOrDie(), ctx->device()->GetAllocator({})); - } - return XlaPlatformInfo(device_type, platform_id, xla_device_metadata, - std::move(xla_allocator), device_allocator); + custom_allocator); } // A closure describing how to run a compiled version of a TensorFlow function. 
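// [Editor's note] Illustrative sketch only, not part of the patch: after this
// change XlaPlatformInfo no longer owns an se::TfAllocatorAdapter. Regular
// CPU/GPU devices leave custom_allocator() null, and the adapter is built
// lazily at execution time from the device allocator, as the GetAllocator
// helper added in the next hunk does. `ctx` and `platform` are assumed to be
// in scope.
absl::optional<se::TfAllocatorAdapter> adapter;
adapter.emplace(ctx->device()->GetAllocator({}), platform);
se::DeviceMemoryAllocator* allocator = &adapter.value();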
@@ -184,6 +174,31 @@ class XlaExecutableClosureStore { TF_DISALLOW_COPY_AND_ASSIGN(XlaExecutableClosureStore); }; +// Return allocator from platform info if non-null, or populate and return a +// pointer to the allocator adapter with allocator from context. +// +// This is necessary because for XLA devices the underlying TF allocator returns +// dummy tensors. +se::DeviceMemoryAllocator* GetAllocator( + absl::optional* tf_allocator_adapter, + OpKernelContext* ctx, const XlaPlatformInfo& platform_info) { + if (platform_info.custom_allocator()) { + return platform_info.custom_allocator(); + } + if (!ctx->op_device_context()) { + // Stream is not set for the host platform. + se::Platform* platform = + se::MultiPlatformManager::PlatformWithId(platform_info.platform_id()) + .ValueOrDie(); + tf_allocator_adapter->emplace(ctx->device()->GetAllocator({}), platform); + return &tf_allocator_adapter->value(); + } + // platform_info. + tf_allocator_adapter->emplace(ctx->device()->GetAllocator({}), + ctx->op_device_context()->stream()); + return &tf_allocator_adapter->value(); +} + } // namespace XlaLocalLaunchBase::XlaLocalLaunchBase(OpKernelConstruction* ctx, @@ -280,6 +295,7 @@ static Status CompileToLocalExecutable( TF_RETURN_IF_ERROR(SnapshotResourceVariables(ctx, resources, variables)); *client = static_cast(cache->client()); + absl::optional tf_allocator_adapter; XlaCompiler::Options options; options.client = *client; if (ctx->op_device_context() != nullptr) { @@ -291,7 +307,8 @@ static Status CompileToLocalExecutable( options.graph_def_version = ctx->function_library()->graph_def_version(); options.allow_cpu_custom_calls = (platform_info.platform_id() == se::host::kHostPlatformId); - options.device_allocator = platform_info.allocator(); + options.device_allocator = + GetAllocator(&tf_allocator_adapter, ctx, platform_info); if (platform_info.xla_device_metadata()) { options.shape_representation_fn = platform_info.xla_device_metadata()->shape_representation_fn(); @@ -349,8 +366,11 @@ void XlaLocalLaunchBase::Compute(OpKernelContext* ctx) { VLOG(1) << "Executing XLA Computation..."; + absl::optional tf_allocator_adapter; + se::DeviceMemoryAllocator* allocator = + GetAllocator(&tf_allocator_adapter, ctx, platform_info_); XlaComputationLaunchContext launch_context( - client, platform_info_.allocator(), + client, allocator, /*allocate_xla_tensors=*/platform_info_.is_on_xla_device(), platform_info_.UseMultipleStreams()); launch_context.PopulateInputs(ctx, kernel, variables, @@ -360,21 +380,28 @@ void XlaLocalLaunchBase::Compute(OpKernelContext* ctx) { VLOG(2) << "Executing computation."; xla::ExecutableRunOptions run_options; run_options.set_stream(stream); - run_options.set_allocator(platform_info_.allocator()); + run_options.set_allocator(allocator); run_options.set_intra_op_thread_pool(&ctx->eigen_cpu_device()); run_options.set_rng_seed(GetXLARandomSeed()); Env* env = Env::Default(); auto start_time = env->NowMicros(); - auto run_result = executable->Run(launch_context.arguments(), run_options); + xla::StatusOr run_result; + if (!stream || platform_info_.platform_id() == se::host::kHostPlatformId) { + run_result = executable->Run(launch_context.arguments(), run_options); + } else { + run_result = executable->RunAsync(launch_context.arguments(), run_options); + } OP_REQUIRES(ctx, run_result.ok(), run_result.status()); auto elapsed = env->NowMicros() - start_time; VLOG(2) << "Elapsed time: " << elapsed << "us"; + const xla::HloInputOutputAliasConfig& input_output_alias = + 
executable->executable()->module().input_output_alias_config(); OP_REQUIRES_OK(ctx, launch_context.PopulateOutputs( ctx, kernel, run_result.ConsumeValueOrDie(), - /*missing_ctx_input_prefix=*/0)); + /*missing_ctx_input_prefix=*/0, input_output_alias)); VLOG(1) << "Done"; } @@ -467,6 +494,10 @@ void XlaCompileOp::Compute(OpKernelContext* ctx) { if (status.code() == error::UNIMPLEMENTED) { LOG(WARNING) << "Compilation failed:" << status.ToString() << ". Falling back to TF function call."; + + BroadcastOptimizationRemark( + XlaOptimizationRemark::UNIMPLEMENTED_OPERATION, status.ToString()) + .IgnoreError(); executable = nullptr; mutex_lock guard(cannot_compile_cluster_mu_); cannot_compile_cluster_ = true; @@ -498,7 +529,7 @@ void XlaCompileOp::Compute(OpKernelContext* ctx) { client, executable, kernel, std::move(variables), constants_.size())); Tensor compilation_key(cpu_allocator, DT_STRING, TensorShape({})); - compilation_key.flat()(0) = key; + compilation_key.flat()(0) = key; Tensor compilation_successful(cpu_allocator, DT_BOOL, TensorShape({})); compilation_successful.flat()(0) = true; @@ -513,13 +544,16 @@ XlaRunOp::XlaRunOp(OpKernelConstruction* ctx) void XlaRunOp::Compute(OpKernelContext* ctx) { VLOG(3) << "XlaRunOp " << def().name(); Tensor key_tensor = ctx->input(ctx->num_inputs() - 1); - const XlaExecutableClosureStore::KeyT& key = key_tensor.flat()(0); + const XlaExecutableClosureStore::KeyT& key = key_tensor.flat()(0); XlaExecutableClosure closure = XlaExecutableClosureStore::Global()->Consume(key); + absl::optional tf_allocator_adapter; + se::DeviceMemoryAllocator* allocator = + GetAllocator(&tf_allocator_adapter, ctx, platform_info_); XlaComputationLaunchContext launch_context( - closure.client(), platform_info_.allocator(), + closure.client(), allocator, /*allocate_xla_tensors=*/platform_info_.is_on_xla_device(), /*use_multiple_streams=*/platform_info_.UseMultipleStreams()); @@ -544,19 +578,28 @@ void XlaRunOp::Compute(OpKernelContext* ctx) { ctx->op_device_context() ? 
ctx->op_device_context()->stream() : nullptr; xla::ExecutableRunOptions run_options; run_options.set_stream(stream); - run_options.set_allocator(platform_info_.allocator()); + run_options.set_allocator(allocator); run_options.set_intra_op_thread_pool(&ctx->eigen_cpu_device()); run_options.set_rng_seed(GetXLARandomSeed()); Env* env = Env::Default(); auto start_time = env->NowMicros(); - auto run_result = - closure.executable()->Run(launch_context.arguments(), run_options); + xla::StatusOr run_result; + if (!stream || platform_info_.platform_id() == se::host::kHostPlatformId) { + run_result = + closure.executable()->Run(launch_context.arguments(), run_options); + } else { + run_result = + closure.executable()->RunAsync(launch_context.arguments(), run_options); + } OP_REQUIRES(ctx, run_result.ok(), run_result.status()); auto elapsed = env->NowMicros() - start_time; VLOG(2) << "Elapsed time in computation: " << elapsed << "us"; + const xla::HloInputOutputAliasConfig& input_output_alias = + closure.executable()->executable()->module().input_output_alias_config(); + tensorflow::profiler::TraceMe hlo_module_activity( [&] { return absl::StrCat("Populate Outputs (", ctx->num_outputs(), ")"); @@ -567,7 +610,8 @@ void XlaRunOp::Compute(OpKernelContext* ctx) { ctx, launch_context.PopulateOutputs( ctx, closure.compilation_result(), run_result.ConsumeValueOrDie(), - /*missing_ctx_input_prefix=*/closure.num_constant_args())); + /*missing_ctx_input_prefix=*/closure.num_constant_args(), + input_output_alias)); } REGISTER_KERNEL_BUILDER(Name("XlaLaunch").Device(DEVICE_CPU), XlaLocalLaunchOp); diff --git a/tensorflow/compiler/jit/kernels/xla_ops.h b/tensorflow/compiler/jit/kernels/xla_ops.h index 3a1009ec8a7..bc6829a6c77 100644 --- a/tensorflow/compiler/jit/kernels/xla_ops.h +++ b/tensorflow/compiler/jit/kernels/xla_ops.h @@ -37,18 +37,14 @@ class XlaPlatformInfo { public: XlaPlatformInfo() : device_type_("") {} XlaPlatformInfo(XlaPlatformInfo&&) = default; - explicit XlaPlatformInfo( - const DeviceType device_type, se::Platform::Id platform_id, - const XlaDevice::Metadata* xla_device_metadata, - std::unique_ptr xla_allocator, - se::DeviceMemoryAllocator* device_allocator) + explicit XlaPlatformInfo(const DeviceType device_type, + se::Platform::Id platform_id, + const XlaDevice::Metadata* xla_device_metadata, + se::DeviceMemoryAllocator* device_allocator) : device_type_(device_type), platform_id_(platform_id), xla_device_metadata_(xla_device_metadata), - xla_allocator_(std::move(xla_allocator)), - device_allocator_(device_allocator) { - CHECK((device_allocator_ != nullptr) ^ (xla_allocator_.get() != nullptr)); - } + device_allocator_(device_allocator) {} XlaPlatformInfo& operator=(XlaPlatformInfo&& other) = default; @@ -56,9 +52,11 @@ class XlaPlatformInfo { return xla_device_metadata_ && xla_device_metadata_->UseMultipleStreams(); } - se::DeviceMemoryAllocator* allocator() const { - return device_allocator_ ? device_allocator_ : xla_allocator_.get(); + // Non-null only when run on an XLA device. + se::DeviceMemoryAllocator* custom_allocator() const { + return device_allocator_; } + DeviceType device_type() const { return device_type_; } // This is equal to xla_device_metadata()->platform()->id() if @@ -82,11 +80,8 @@ class XlaPlatformInfo { const XlaDevice::Metadata* xla_device_metadata_; // If the op associated with this XlaPlatformInfo is placed on an XLA device - // then device_allocator_ is the xla::Backend's memory allocator and - // xla_allocator_ is null. 
If the op is placed on a regular CPU or GPU device - // then device_allocator_ is null and xla_allocator_ points to an appropriate - // se::TfAllocatorAdapter instance. - std::unique_ptr xla_allocator_; + // then device_allocator_ is the xla::Backend's memory allocator. If the op + // is placed on a regular CPU or GPU device then device_allocator_ is null. se::DeviceMemoryAllocator* device_allocator_; TF_DISALLOW_COPY_AND_ASSIGN(XlaPlatformInfo); diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc index b819998bdc7..90755a1cb70 100644 --- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc +++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc @@ -677,8 +677,7 @@ bool MarkForCompilationPassImpl::IsScalarIntegerResourceOperation( } DataType dtype; - if (!GetNodeAttr(n->def(), "dtype", &dtype).ok() || - !DataTypeIsInteger(dtype)) { + if (!TryGetNodeAttr(n->def(), "dtype", &dtype) || !DataTypeIsInteger(dtype)) { return false; } @@ -695,7 +694,7 @@ bool MarkForCompilationPassImpl::IsScalarIntegerResourceOperation( } const TensorProto* proto = nullptr; - if (!GetNodeAttr(const_input->def(), "value", &proto).ok()) { + if (!TryGetNodeAttr(const_input->def(), "value", &proto)) { return false; } @@ -924,20 +923,35 @@ MarkForCompilationPassImpl::ClusteringWillIntroduceInterDeviceDependency( } absl::optional MarkForCompilationPassImpl::GetXlaScope(Node* node) { - // Look for an _XlaScope on both nodes. If both nodes have a scope and the - // scopes do not match, do not cluster along this edge. This restriction is - // overridden if the global_jit_level_ is ON. If even one of the nodes lacks - // an _XlaScope attribute, then it is treated as a "bridge" and a cluster may - // be created along it. We may want to restrict this behavior to require all - // nodes marked with _XlaCompile=true to also have a _XlaScope property set - // (and raise an error otherwise); but for now we don't do this. - if (global_jit_level_ != OptimizerOptions::OFF) { - return absl::nullopt; - } + // Look for either _XlaScope or _XlaInternalScope on both nodes to guide + // clustering. If both nodes have a scope and the scopes do not match, do + // not cluster along this edge. If even one of the nodes lacks a scope + // attribute, then it is treated as a "bridge" and a cluster may be created + // along it. + // + // The difference between _XlaScope and _XlaInternalScope is that _XlaScope is + // provided by users through jit_scope APIs, while _XlaInternalScope is + // automatically generated by the ClusterScopingPass when auto_jit is on. As + // such, we respect _XlaScope only when auto_jit is off, while respecting + // _XlaInternalScope only when auto_jit is on. + // + // We may want to restrict the _XlaScope behavior to require all nodes marked + // with _XlaCompile=true to also have a _XlaScope property set (and raise an + // error otherwise); but for now we don't do this. - string scope; - if (GetNodeAttr(node->attrs(), kXlaScopeAttr, &scope).ok()) { - return scope; + if (global_jit_level_ != OptimizerOptions::OFF) { + // If global_jit_level_ is ON, respect only _XlaInternalScope. + const string& scope = + GetNodeAttrString(node->attrs(), kXlaInternalScopeAttr); + if (!scope.empty()) { + return scope; + } + } else { + // If global_jit_level_ is OFF, respect only _XlaScope. 
+ const string& scope = GetNodeAttrString(node->attrs(), kXlaScopeAttr); + if (!scope.empty()) { + return scope; + } } return absl::nullopt; @@ -970,8 +984,7 @@ Status MarkForCompilationPassImpl::BuildInitialClusterSet() { int effective_cluster_size = (node->IsIdentity() || node->IsConstant()) ? 0 : 1; - bool has_functional_control_flow = - node->type_string() == "While" || node->IsIfNode(); + bool has_functional_control_flow = node->IsWhileNode() || node->IsIfNode(); absl::optional deadness_predicate; if (deadness_analysis_) { @@ -1000,7 +1013,7 @@ Status MarkForCompilationPassImpl::BuildInitialClusterSet() { bool is_xla_compile_attr_true = false; bool xla_compile_attr; - if (GetNodeAttr(node->attrs(), kXlaCompileAttr, &xla_compile_attr).ok()) { + if (TryGetNodeAttr(node->attrs(), kXlaCompileAttr, &xla_compile_attr)) { is_xla_compile_attr_true |= xla_compile_attr; } @@ -1549,9 +1562,7 @@ StatusOr MarkForCompilationPassImpl::ShouldCompileClusterImpl( XlaOpRegistry::AutoclusteringPolicy::kIfEnabledGlobally && global_jit_level_ != OptimizerOptions::OFF); - if (!should_compile && - registration->autoclustering_policy == - XlaOpRegistry::AutoclusteringPolicy::kIfExplicitlyRequested && + if (!should_compile && global_jit_level_ != OptimizerOptions::OFF && device_type.type_string() == DEVICE_CPU) { static std::once_flag once; std::call_once(once, [] { @@ -1628,10 +1639,9 @@ std::atomic* GetPointerToFuel(int64 initial_value) { } } // anonymous namespace -bool IsCompilable( - FunctionLibraryRuntime* flr, const NodeDef& ndef, - std::vector* - uncompilable_node_info) { +bool IsCompilable(FunctionLibraryRuntime* flr, const NodeDef& ndef, + RecursiveCompilabilityChecker::UncompilableNodesMap* + uncompilable_node_info) { Device* device = flr->device(); const XlaOpRegistry::DeviceRegistration* registration; CHECK(XlaOpRegistry::GetCompilationDevice(device->device_type(), @@ -1657,8 +1667,8 @@ bool IsCompilable( return checker.IsCompilableCall(ndef, flr); } - std::vector - uncompilable_node_result = checker.FindUncompilableNodes(ndef, flr); + RecursiveCompilabilityChecker::UncompilableNodesMap uncompilable_node_result = + checker.FindUncompilableNodes(ndef, flr); uncompilable_node_info->swap(uncompilable_node_result); return uncompilable_node_info->empty(); } diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.h b/tensorflow/compiler/jit/mark_for_compilation_pass.h index e186763b5e4..7adfc1419bf 100644 --- a/tensorflow/compiler/jit/mark_for_compilation_pass.h +++ b/tensorflow/compiler/jit/mark_for_compilation_pass.h @@ -52,10 +52,9 @@ class MarkForCompilationPass : public GraphOptimizationPass { // function is compilable iff every operator in the function body is // compilable. If 'ndef' is not compilable and 'uncompilable_node_info' is not // null, we will populate 'uncompilable_node_info' with uncompilable node info. -bool IsCompilable( - FunctionLibraryRuntime* flr, const NodeDef& ndef, - std::vector* - uncompilable_node_info = nullptr); +bool IsCompilable(FunctionLibraryRuntime* flr, const NodeDef& ndef, + RecursiveCompilabilityChecker::UncompilableNodesMap* + uncompilable_node_info = nullptr); namespace testing { // DO NOT USE IN PRODUCTION. 
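// [Editor's note] A hedged sketch (not part of the patch) of the scope rule
// described in GetXlaScope above: with autoclustering ON only the
// ClusterScopingPass-generated _XlaInternalScope is honored; with it OFF only
// the user-provided _XlaScope is. `auto_jit_on` is a hypothetical flag
// standing in for the global_jit_level_ check.
bool auto_jit_on = true;
const string& scope = GetNodeAttrString(
    node->attrs(), auto_jit_on ? kXlaInternalScopeAttr : kXlaScopeAttr);
absl::optional<string> result;
if (!scope.empty()) result = scope;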
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc index cbe60b05eef..f10b4d0b4cb 100644 --- a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc +++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc @@ -52,7 +52,7 @@ std::unordered_map GetClusters(const Graph& graph) { std::unordered_map ids; for (Node* node : graph.nodes()) { string cluster; - if (GetNodeAttr(node->attrs(), kXlaClusterAttr, &cluster).ok()) { + if (TryGetNodeAttr(node->attrs(), kXlaClusterAttr, &cluster)) { CHECK(!cluster.empty()); ids[node->name()] = cluster; } @@ -1718,5 +1718,91 @@ TEST(XlaCompilationTest, UnsupportedEnterExitPattern) { EXPECT_EQ(0, clusters.size()); } +namespace { +Node* MakeStageNode(GraphDefBuilder& builder, string name, + std::initializer_list dtypes, + absl::Span values) { + auto opts = builder.opts() + .WithName(std::move(name)) + .WithAttr("dtypes", std::move(dtypes)); + if (opts.HaveError()) { + return nullptr; + } + + NodeBuilder node_builder(name, "Stage", opts.op_registry()); + node_builder.Input(values); + return opts.FinalizeBuilder(&node_builder); +} +} // namespace + +TEST(XlaCompilationTest, StagePipelinePreservedByClusterScopingPass) { + auto build_staged_graph = [](std::unique_ptr* graph) -> Status { + // Construct a graph as below with two pipeline stages and test that nodes + // in different stages will not be merged if ClusterScopingPass is on. + // + // b + // | + // v + // a -> add0 -> relu0 -> stage + // + // b + // | + // v + // unstage -> add1 -> relu1 + GraphDefBuilder builder(GraphDefBuilder::kFailImmediately); + Node* a = ops::SourceOp("Const", builder.opts() + .WithName("a") + .WithAttr("dtype", DT_FLOAT) + .WithAttr("value", Tensor())); + Node* b = ops::SourceOp("Const", builder.opts() + .WithName("b") + .WithAttr("dtype", DT_FLOAT) + .WithAttr("value", Tensor())); + Node* unstage = ops::SourceOp( + "Unstage", + builder.opts().WithName("unstage").WithAttr("dtypes", {DT_FLOAT})); + + Node* add0 = ops::BinaryOp("Add", a, b, builder.opts().WithName("add0")); + Node* add1 = + ops::BinaryOp("Add", unstage, b, builder.opts().WithName("add1")); + Node* relu0 = ops::UnaryOp("Relu", add0, builder.opts().WithName("relu0")); + ops::UnaryOp("Relu", add1, builder.opts().WithName("relu1")); + MakeStageNode(builder, "stage", {DT_FLOAT}, {relu0}); + + return GraphDefBuilderToGraph(builder, graph->get()); + }; + + // All nodes go into the same cluster if ClusterScopingPass is off. + { + std::unique_ptr graph(new Graph(OpRegistry::Global())); + TF_ASSERT_OK(build_staged_graph(&graph)); + + TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation( + &graph, + MarkForCompilationPassTestHelper::Options().WithNoClusterScoping())); + + std::unordered_map clusters = GetClusters(*graph); + EXPECT_EQ(clusters["add0"], clusters["add1"]); + EXPECT_EQ(clusters["add0"], clusters["relu1"]); + EXPECT_EQ(clusters["relu0"], clusters["add1"]); + EXPECT_EQ(clusters["relu0"], clusters["relu1"]); + } + + // By default, ClusterScopingPass is on and different pipeline stages should + // not be merged. 
+ { + std::unique_ptr graph(new Graph(OpRegistry::Global())); + TF_ASSERT_OK(build_staged_graph(&graph)); + + TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); + + std::unordered_map clusters = GetClusters(*graph); + EXPECT_NE(clusters["add0"], clusters["add1"]); + EXPECT_NE(clusters["add0"], clusters["relu1"]); + EXPECT_NE(clusters["relu0"], clusters["add1"]); + EXPECT_NE(clusters["relu0"], clusters["relu1"]); + } +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.cc b/tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.cc index fa5abdfe508..44bd7b47d54 100644 --- a/tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.cc +++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.cc @@ -14,6 +14,8 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.h" + +#include "tensorflow/compiler/jit/cluster_scoping_pass.h" #include "tensorflow/core/common_runtime/device_factory.h" #include "tensorflow/core/lib/gtl/cleanup.h" #include "tensorflow/core/public/session_options.h" @@ -48,8 +50,14 @@ namespace tensorflow { opt_options.graph = graph; opt_options.session_options = &session_options; opt_options.flib_def = flib_def; - MarkForCompilationPass pass; - return pass.RunForTest( + + if (options.enable_cluster_scoping) { + ClusterScopingPass cluster_scoping_pass; + TF_RETURN_IF_ERROR(cluster_scoping_pass.Run(opt_options)); + } + + MarkForCompilationPass mark_for_compilation_pass; + return mark_for_compilation_pass.RunForTest( opt_options, /*disable_deadness_analysis=*/options.disable_deadness_analysis); } diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.h b/tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.h index b81fca43c80..f482a80f5b5 100644 --- a/tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.h +++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.h @@ -24,8 +24,12 @@ class MarkForCompilationPassTestHelper { struct Options { bool enable_global_jit; bool disable_deadness_analysis; + bool enable_cluster_scoping; - Options() : enable_global_jit(true), disable_deadness_analysis(true) {} + Options() + : enable_global_jit(true), + disable_deadness_analysis(true), + enable_cluster_scoping(true) {} Options WithNoGlobalJit() { Options copy = *this; @@ -38,6 +42,12 @@ class MarkForCompilationPassTestHelper { copy.disable_deadness_analysis = false; return copy; } + + Options WithNoClusterScoping() { + Options copy = *this; + copy.enable_cluster_scoping = false; + return copy; + } }; // Runs the MarkForCompilation pass on `graph` after assigning all nodes in diff --git a/tensorflow/compiler/jit/node_matchers.cc b/tensorflow/compiler/jit/node_matchers.cc index b878f05e1df..932e0769813 100644 --- a/tensorflow/compiler/jit/node_matchers.cc +++ b/tensorflow/compiler/jit/node_matchers.cc @@ -135,7 +135,7 @@ struct NodeMatcher : public ::testing::MatcherInterface { if (constant_value) { const TensorProto* proto = nullptr; - if (!GetNodeAttr(node->def(), "value", &proto).ok()) { + if (!TryGetNodeAttr(node->def(), "value", &proto)) { if (listener->IsInterested()) { *listener << "\ncould not find \"value\" attribute in node"; } diff --git a/tensorflow/compiler/jit/tests/auto_clustering_test.cc b/tensorflow/compiler/jit/tests/auto_clustering_test.cc index 2154e371e83..c4db4b082ad 100644 --- 
a/tensorflow/compiler/jit/tests/auto_clustering_test.cc +++ b/tensorflow/compiler/jit/tests/auto_clustering_test.cc @@ -45,8 +45,8 @@ class AutoClusteringTestImpl : public AutoClusteringTest { TEST_F(AutoClusteringTestImpl, KerasImagenetMain) { // Generated from // - // bazel run -c opt --config=cuda \ - // tensorflow_models/official/resnet/keras:keras_imagenet_main \ + // TARGET_PATH=tensorflow_models/official/vision/image_classification \ + // bazel run -c opt --config=cuda ${TARGET_PATH}:resnet_imagenet_main \ // -- --skip_eval --num_gpus=1 --dtype=fp16 --batch_size=192 \ // --train_steps=210 --enable_xla --enable_eager=true // @@ -57,8 +57,8 @@ TEST_F(AutoClusteringTestImpl, KerasImagenetMain) { TEST_F(AutoClusteringTestImpl, KerasImagenetMainGraphMode) { // Generated from // - // bazel run -c opt --config=cuda \ - // tensorflow_models/official/resnet/keras:keras_imagenet_main \ + // TARGET_PATH=tensorflow_models/official/vision/image_classification \ + // bazel run -c opt --config=cuda ${TARGET_PATH}:resnet_imagenet_main \ // -- --use_synthetic_data --num_gpus=1 --batch_size=117 --train_steps=600 \ // --skip_eval=True --logtostderr --enable_xla TF_ASSERT_OK( diff --git a/tensorflow/compiler/jit/tests/auto_clustering_test_helper.cc b/tensorflow/compiler/jit/tests/auto_clustering_test_helper.cc index faeb3883b48..726f7f0b068 100644 --- a/tensorflow/compiler/jit/tests/auto_clustering_test_helper.cc +++ b/tensorflow/compiler/jit/tests/auto_clustering_test_helper.cc @@ -186,7 +186,7 @@ Status AutoClusteringTest::RunAutoClusteringTestWithGzippedPbtxt( /*input_buffer_bytes=*/k_buffer_size, /*output_buffer_bytes=*/k_buffer_size, io::ZlibCompressionOptions::GZIP()); - string decompressed_pbtxt_string; + tstring decompressed_pbtxt_string; Status s = in.ReadNBytes(INT_MAX, &decompressed_pbtxt_string); if (!s.ok() && !errors::IsOutOfRange(s)) { // OutOfRange is fine since we set the number of read bytes to INT_MAX. diff --git a/tensorflow/compiler/jit/xla_activity.proto b/tensorflow/compiler/jit/xla_activity.proto index 1edde32cc46..50bfb297fa1 100644 --- a/tensorflow/compiler/jit/xla_activity.proto +++ b/tensorflow/compiler/jit/xla_activity.proto @@ -94,3 +94,27 @@ message XlaJitCompilationActivity { // Total microseconds spent in (re-)compiling this cluster so far. int64 cumulative_compile_time_us = 4; } + +// LINT.IfChange +// +// Used for logging situations seen in Tensorflow models being optimized that +// are known to not perform well with XLA. +// +// Next ID: 3 +message XlaOptimizationRemark { + // Next ID: 6 + enum Warning { + NONE = 0; + INACCURATE_OPERATION = 1; + SLOW_OPERATION = 2; + UNIMPLEMENTED_OPERATION = 3; + SLOW_IMAGE_RESIZE_DIMENSIONS = 4; + MEGAMORPHIC_FUNCTION = 5; + } + + Warning warning = 1; + + // Information such as which node was the problem. + string debug_information = 2; +} +// LINT.ThenChange(https://www.tensorflow.org/code/tensorflow/compiler/jit/xla_activity_listener.h) diff --git a/tensorflow/compiler/jit/xla_activity_listener.cc b/tensorflow/compiler/jit/xla_activity_listener.cc index 1f14cc90527..a1ea6a6bf8e 100644 --- a/tensorflow/compiler/jit/xla_activity_listener.cc +++ b/tensorflow/compiler/jit/xla_activity_listener.cc @@ -16,6 +16,7 @@ limitations under the License. 
#include "tensorflow/compiler/jit/xla_activity_listener.h" #include "absl/synchronization/mutex.h" +#include "tensorflow/compiler/jit/xla_activity.pb.h" #include "tensorflow/core/lib/core/errors.h" namespace tensorflow { @@ -71,6 +72,21 @@ Status BroadcastXlaActivity( }); } +Status BroadcastOptimizationRemark(XlaOptimizationRemark optimization_remark) { + VLOG(2) << "OptimizationRemark: " << optimization_remark.DebugString(); + return ForEachListener([&](XlaActivityListener* listener) { + return listener->Listen(optimization_remark); + }); +} + +Status BroadcastOptimizationRemark( + XlaOptimizationRemark::Warning optimization_warning, + string debug_information) { + XlaOptimizationRemark remark; + remark.set_warning(optimization_warning); + remark.set_debug_information(std::move(debug_information)); + return BroadcastOptimizationRemark(std::move(remark)); +} void RegisterXlaActivityListener( std::unique_ptr listener) { XlaActivityListenerList* listener_list = GetXlaActivityListenerList(); diff --git a/tensorflow/compiler/jit/xla_activity_listener.h b/tensorflow/compiler/jit/xla_activity_listener.h index 547181d6010..05328c896d3 100644 --- a/tensorflow/compiler/jit/xla_activity_listener.h +++ b/tensorflow/compiler/jit/xla_activity_listener.h @@ -27,6 +27,18 @@ Status BroadcastXlaActivity(XlaAutoClusteringActivity auto_clustering_activity); // Broadcast `jit_compilation_activity` to all the registered listeners. Status BroadcastXlaActivity(XlaJitCompilationActivity jit_compilation_activity); +// Broadcast `jit_compilation_activity` to all the registered listeners. +Status BroadcastOptimizationRemark(XlaOptimizationRemark optimization_remark); + +// LINT.IfChange +// Called after TensorFlow realizes possible lost performance. The parameters in +// this should match all of the values in the XlaOptimizationRemark proto. +Status BroadcastOptimizationRemark( + XlaOptimizationRemark::Warning optimization_warning, + string debug_information); + +// LINT.ThenChange(//tensorflow/compiler/jit/xla_activity.proto) + // Various components of the system can subclass XlaActivityListener to // notifications on auto-clustering and JIT compilation events. // @@ -41,6 +53,9 @@ class XlaActivityListener { virtual Status Listen( const XlaJitCompilationActivity& jit_compilation_activity) = 0; + // Called after TensorFlow realizes possible lost performance. + virtual Status Listen(const XlaOptimizationRemark& optimization_remark) = 0; + // Called at program exit in best-effort manner to give listeners a chance to // flush their state. // diff --git a/tensorflow/compiler/jit/xla_activity_listener_test.cc b/tensorflow/compiler/jit/xla_activity_listener_test.cc index 4d087e2caac..034adbf44fe 100644 --- a/tensorflow/compiler/jit/xla_activity_listener_test.cc +++ b/tensorflow/compiler/jit/xla_activity_listener_test.cc @@ -43,6 +43,10 @@ class TestListener : public XlaActivityListener { return Status::OK(); } + Status Listen(const XlaOptimizationRemark& optimization_remark) override { + return Status::OK(); + } + ~TestListener() override {} const XlaAutoClusteringActivity& auto_clustering_activity() const { diff --git a/tensorflow/compiler/jit/xla_activity_logging_listener.cc b/tensorflow/compiler/jit/xla_activity_logging_listener.cc index a36bd3bd707..87e39a5481f 100644 --- a/tensorflow/compiler/jit/xla_activity_logging_listener.cc +++ b/tensorflow/compiler/jit/xla_activity_logging_listener.cc @@ -14,6 +14,7 @@ limitations under the License. 
==============================================================================*/ #include "absl/memory/memory.h" +#include "tensorflow/compiler/jit/xla_activity.pb.h" #include "tensorflow/compiler/jit/xla_activity_listener.h" #include "tensorflow/core/platform/logger.h" @@ -59,6 +60,23 @@ class XlaActivityLoggingListener final : public XlaActivityListener { return Status::OK(); } + Status Listen(const XlaOptimizationRemark& optimization_remark) override { + if (!IsEnabled()) { + VLOG(3) << "Logging XlaJitCompilationActivity disabled"; + return Status::OK(); + } + + if (Logger* logger = Logger::GetSingletonAsync()) { + VLOG(2) << "Logging XlaJitCompilationActivity"; + VLOG(3) << optimization_remark.DebugString(); + logger->LogProto(optimization_remark); + } else { + VLOG(2) << "Not logging: logger not ready yet."; + } + + return Status::OK(); + } + private: bool IsEnabled() { static bool result = ComputeIsEnabled(); diff --git a/tensorflow/compiler/jit/xla_compilation_cache.cc b/tensorflow/compiler/jit/xla_compilation_cache.cc index 035a50e1852..1e440031570 100644 --- a/tensorflow/compiler/jit/xla_compilation_cache.cc +++ b/tensorflow/compiler/jit/xla_compilation_cache.cc @@ -17,8 +17,10 @@ limitations under the License. #include +#include "absl/base/call_once.h" #include "absl/strings/str_cat.h" #include "absl/strings/str_join.h" +#include "tensorflow/compiler/jit/xla_activity.pb.h" #include "tensorflow/compiler/jit/xla_activity_listener.h" #include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/tf2xla/type_util.h" @@ -27,6 +29,7 @@ limitations under the License. #include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/common_runtime/function.h" #include "tensorflow/core/common_runtime/graph_optimizer.h" +#include "tensorflow/core/common_runtime/metrics.h" #include "tensorflow/core/framework/attr_value_util.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/graph/graph_constructor.h" @@ -224,6 +227,20 @@ Status XlaCompilationCache::CompileSingleOp( out_compilation_result, out_executable); } +namespace { +// Print something that users can search for to definitively ascertain that XLA +// was used for their TF model. +// +// Prints only once to avoid spamming LOG(INFO). +void LogOnceXlaCompiledFirstCluster() { + static absl::once_flag log_once; + absl::call_once(log_once, [] { + LOG(INFO) << "Compiled cluster using XLA! 
This line is logged at most " + "once for the lifetime of the process."; + }); +} +} // namespace + Status XlaCompilationCache::CompileImpl( const XlaCompiler::Options& options, const NameAttrList& function, absl::Span args, @@ -301,6 +318,9 @@ Status XlaCompilationCache::CompileImpl( } if (is_megamorphic) { + BroadcastOptimizationRemark(XlaOptimizationRemark::MEGAMORPHIC_FUNCTION, + function.name()) + .IgnoreError(); VLOG(3) << "Not compiling cluster " << function.name() << " because it is megamorphic."; return false; @@ -346,11 +366,13 @@ Status XlaCompilationCache::CompileImpl( const uint64 compile_end_us = env->NowMicros(); const uint64 compile_time_us = compile_end_us - compile_start_us; + metrics::UpdateXlaCompilationTime(compile_time_us); { mutex_lock lock(cluster_compile_stats_mu_); auto it = cluster_compile_stats_.find(function.name()); it->second.compile_count++; it->second.cumulative_compile_time_us += compile_time_us; + LogOnceXlaCompiledFirstCluster(); VLOG(1) << "compiled " << function.name() << " " << it->second.compile_count << " times, compile time: " << compile_time_us diff --git a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc index 24d29f4c808..3dc8379ebaa 100644 --- a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc +++ b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc @@ -83,9 +83,11 @@ Status XlaCompileOnDemandOp::Run(OpKernelContext* ctx, executable->Run(launch_context.arguments(), run_options); TF_RETURN_IF_ERROR(run_result.status()); + const xla::HloInputOutputAliasConfig& input_output_alias = + executable->executable()->module().input_output_alias_config(); TF_RETURN_IF_ERROR(launch_context.PopulateOutputs( ctx, result, run_result.ConsumeValueOrDie(), - /*missing_ctx_input_prefix=*/0)); + /*missing_ctx_input_prefix=*/0, input_output_alias)); return Status::OK(); } diff --git a/tensorflow/compiler/jit/xla_cpu_device.cc b/tensorflow/compiler/jit/xla_cpu_device.cc index fbfda449ebd..85c09a027d3 100644 --- a/tensorflow/compiler/jit/xla_cpu_device.cc +++ b/tensorflow/compiler/jit/xla_cpu_device.cc @@ -98,10 +98,10 @@ REGISTER_LOCAL_DEVICE_FACTORY(DEVICE_XLA_CPU, XlaCpuDeviceFactory); // Kernel registrations -constexpr std::array kAllXlaCpuTypes = { - {DT_UINT8, DT_QUINT8, DT_INT8, DT_QINT8, DT_INT32, DT_QINT32, DT_INT64, - DT_HALF, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128, DT_BOOL, - DT_BFLOAT16}}; +constexpr std::array kAllXlaCpuTypes = { + {DT_UINT8, DT_QUINT8, DT_UINT16, DT_INT8, DT_QINT8, DT_INT16, DT_INT32, + DT_QINT32, DT_INT64, DT_HALF, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, + DT_COMPLEX128, DT_BOOL, DT_BFLOAT16}}; REGISTER_XLA_LAUNCH_KERNEL(DEVICE_XLA_CPU, XlaLocalLaunchOp, kAllXlaCpuTypes); REGISTER_XLA_COMPILE_KERNEL(DEVICE_XLA_CPU, XlaCompileOp, kAllXlaCpuTypes); diff --git a/tensorflow/compiler/jit/xla_device.cc b/tensorflow/compiler/jit/xla_device.cc index 1d8b4beb8bd..be2038a7a8a 100644 --- a/tensorflow/compiler/jit/xla_device.cc +++ b/tensorflow/compiler/jit/xla_device.cc @@ -203,6 +203,8 @@ XlaDevice::XlaDevice(const SessionOptions& session_options, device_ordinal_(options.device_ordinal), jit_device_name_(options.compilation_device_name), platform_(options.platform), + intra_op_parallelism_threads_( + session_options.config.intra_op_parallelism_threads()), use_multiple_streams_(options.use_multiple_streams), shape_representation_fn_(options.shape_representation_fn), allowed_devices_(options.allowed_devices) { @@ -233,10 +235,13 @@ xla::LocalClient* XlaDevice::client() const { // 
don't want to do it until we get a chance to hook the platform up // to a simulator. + xla::LocalClientOptions options; + options.set_platform(platform_) + .set_allowed_devices(allowed_devices_) + .set_intra_op_parallelism_threads(intra_op_parallelism_threads_); // TODO(b/78468222): This can fail, at least when the backend is GPU and // there is no GPU on the host. - return xla::ClientLibrary::GetOrCreateLocalClient(platform_, allowed_devices_) - .ValueOrDie(); + return xla::ClientLibrary::GetOrCreateLocalClient(options).ValueOrDie(); } Allocator* XlaDevice::GetAllocator(AllocatorAttributes attr) { diff --git a/tensorflow/compiler/jit/xla_device.h b/tensorflow/compiler/jit/xla_device.h index 51910c6fabc..877580e73f9 100644 --- a/tensorflow/compiler/jit/xla_device.h +++ b/tensorflow/compiler/jit/xla_device.h @@ -202,6 +202,8 @@ class XlaDevice : public LocalDevice { const DeviceType jit_device_name_; // The platform for this device. se::Platform* const platform_; // Not owned. + // Intra-op threads to spawn (from SessionOptions). + const int intra_op_parallelism_threads_; // Memory allocator associated with this device. Allocator* xla_allocator_ GUARDED_BY(mu_) = nullptr; // Not owned. diff --git a/tensorflow/compiler/jit/xla_device_context.cc b/tensorflow/compiler/jit/xla_device_context.cc index ea784e72137..5e4c6340f42 100644 --- a/tensorflow/compiler/jit/xla_device_context.cc +++ b/tensorflow/compiler/jit/xla_device_context.cc @@ -90,8 +90,9 @@ XlaDeviceContext::XlaDeviceContext( CHECK(host_to_device_stream_ != nullptr); CHECK(stream_ != nullptr); if (!shape_representation_fn_) { - shape_representation_fn_ = [](const TensorShape& shape, - DataType dtype) -> xla::StatusOr { + shape_representation_fn_ = + [](const TensorShape& shape, DataType dtype, + bool use_fast_memory) -> xla::StatusOr { xla::Shape xla_shape; TF_RETURN_IF_ERROR(TensorShapeToXLAShape(dtype, shape, &xla_shape)); return xla_shape; @@ -130,9 +131,10 @@ void XlaDeviceContext::CopyCPUTensorToDevice(const Tensor* cpu_tensor, CHECK(xla_tensor); Status status = [&]() -> Status { - TF_ASSIGN_OR_RETURN(xla::Shape shape, - shape_representation_fn_(device_tensor->shape(), - device_tensor->dtype())); + TF_ASSIGN_OR_RETURN( + xla::Shape shape, + shape_representation_fn_(device_tensor->shape(), device_tensor->dtype(), + /*use_fast_memory=*/false)); // The device tensor should always be fresh. 
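// [Editor's note] Illustrative only, not part of the patch: the shape
// representation callback now takes a third `use_fast_memory` argument, which
// the default above simply ignores. Assuming the XlaCompiler::
// ShapeRepresentationFn alias, an existing two-argument lambda would be
// updated roughly like this:
XlaCompiler::ShapeRepresentationFn fn =
    [](const TensorShape& shape, DataType dtype,
       bool use_fast_memory) -> xla::StatusOr<xla::Shape> {
  xla::Shape xla_shape;
  TF_RETURN_IF_ERROR(TensorShapeToXLAShape(dtype, shape, &xla_shape));
  return xla_shape;
};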
TF_RET_CHECK(!xla_tensor->has_shaped_buffer()); diff --git a/tensorflow/compiler/jit/xla_device_ops.h b/tensorflow/compiler/jit/xla_device_ops.h index 2c8203b1c5d..99e95314f64 100644 --- a/tensorflow/compiler/jit/xla_device_ops.h +++ b/tensorflow/compiler/jit/xla_device_ops.h @@ -212,11 +212,11 @@ class XlaAssignVariableOp : public OpKernel { REGISTER_KERNEL_BUILDER(Name(FunctionLibraryDefinition::kArgOp) \ .Device(DEVICE) \ .HostMemory("output") \ - .TypeConstraint("T"), \ + .TypeConstraint("T"), \ ArgOp); \ REGISTER_KERNEL_BUILDER(Name(FunctionLibraryDefinition::kRetOp) \ .Device(DEVICE) \ - .TypeConstraint("T") \ + .TypeConstraint("T") \ .HostMemory("input"), \ RetvalOp); diff --git a/tensorflow/compiler/jit/xla_gpu_device.cc b/tensorflow/compiler/jit/xla_gpu_device.cc index 8934b52d686..cead23d816e 100644 --- a/tensorflow/compiler/jit/xla_gpu_device.cc +++ b/tensorflow/compiler/jit/xla_gpu_device.cc @@ -147,10 +147,10 @@ REGISTER_LOCAL_DEVICE_FACTORY(DEVICE_XLA_GPU, XlaGpuDeviceFactory); // Kernel registrations -constexpr std::array kAllXlaGpuTypes = { - {DT_UINT8, DT_QUINT8, DT_INT8, DT_QINT8, DT_INT32, DT_QINT32, DT_INT64, - DT_HALF, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128, DT_BOOL, - DT_BFLOAT16}}; +constexpr std::array kAllXlaGpuTypes = { + {DT_UINT8, DT_QUINT8, DT_UINT16, DT_INT8, DT_QINT8, DT_INT16, DT_INT32, + DT_QINT32, DT_INT64, DT_HALF, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, + DT_COMPLEX128, DT_BOOL, DT_BFLOAT16}}; REGISTER_XLA_LAUNCH_KERNEL(DEVICE_XLA_GPU, XlaLocalLaunchOp, kAllXlaGpuTypes); REGISTER_XLA_COMPILE_KERNEL(DEVICE_XLA_GPU, XlaCompileOp, kAllXlaGpuTypes); diff --git a/tensorflow/compiler/jit/xla_kernel_creator.cc b/tensorflow/compiler/jit/xla_kernel_creator.cc index c138fd1ff39..e3706a09278 100644 --- a/tensorflow/compiler/jit/xla_kernel_creator.cc +++ b/tensorflow/compiler/jit/xla_kernel_creator.cc @@ -14,243 +14,20 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/jit/xla_kernel_creator.h" -#include "absl/memory/memory.h" -#include "absl/strings/str_cat.h" -#include "absl/strings/str_format.h" -#include "tensorflow/compiler/jit/compilability_check_util.h" -#include "tensorflow/compiler/jit/defs.h" -#include "tensorflow/compiler/jit/kernels/xla_ops.h" -#include "tensorflow/compiler/jit/mark_for_compilation_pass.h" -#include "tensorflow/compiler/tf2xla/const_analysis.h" -#include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/jit/xla_kernel_creator_util.h" #include "tensorflow/core/common_runtime/function.h" -#include "tensorflow/core/framework/node_def_builder.h" -#include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/util/ptr_util.h" namespace tensorflow { -namespace { - -// Utility which searches for values in a sorted list by scanning over it once. -// No matter how many times ScanForValue is called, the list is scanned at most -// once. However, if a call to ScanForValue skips over a value, that value is -// not revisited in future calls to ScanForValue, so callers must take -// care to order their calls. -// -// Useful for merging multiple sorted lists in O(n) time. -class SinglePassSearch { - public: - // Creates a SinglePassSearch object that can be used to search in `values`. - // Does not take ownership of `values`. `values` must outlive this. - // `values` must be sorted. 
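The SinglePassSearch helper documented above (removed here and re-added in xla_kernel_creator_util.cc later in this diff) answers repeated ascending membership queries against a sorted list while scanning it at most once. A standalone sketch of the same idea, with hypothetical argument indices, showing how two such searches mark constant and resource arguments as host memory:

#include <cstddef>
#include <iostream>
#include <vector>

// Standalone copy of the SinglePassSearch idea from the hunk above: the sorted
// list is scanned at most once across all ScanForValue calls, so callers must
// query values in ascending order.
class SinglePassSearch {
 public:
  explicit SinglePassSearch(const std::vector<int>* values)
      : current_index_(0), values_(values) {}

  bool ScanForValue(int value) {
    while (current_index_ < values_->size() &&
           (*values_)[current_index_] <= value) {
      if ((*values_)[current_index_] == value) {
        current_index_++;
        return true;
      }
      current_index_++;
    }
    return false;
  }

 private:
  size_t current_index_;
  const std::vector<int>* values_;
};

int main() {
  // Hypothetical constant/resource argument indices (sorted), standing in for
  // the vectors produced by GetBodyAndConstantsAndResources.
  const std::vector<int> constant_arg_indices = {0, 3};
  const std::vector<int> resource_arg_indices = {2, 5};
  SinglePassSearch constants_search(&constant_arg_indices);
  SinglePassSearch resources_search(&resource_arg_indices);

  // Mark which of six arguments would live in host memory, the same way the
  // kernel creator fills input_memory_types.
  for (int i = 0; i < 6; ++i) {
    bool host = constants_search.ScanForValue(i) ||
                resources_search.ScanForValue(i);
    std::cout << "arg " << i << ": "
              << (host ? "HOST_MEMORY" : "DEVICE_MEMORY") << "\n";
  }
  return 0;
}

Because the query indices only ever increase, each sorted list is traversed once no matter how many arguments are checked, which keeps the marking loop linear even for functions with hundreds of captured arguments.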
- explicit SinglePassSearch(const std::vector* values) - : current_index_(0), values_(values) {} - - // Scans forward in the vector looking for "value", updating the internal - // position in to the vector. - // Returns true iff the vector contains the given value at or after current - // position. - // Not thread-safe. - bool ScanForValue(int value) { - while (current_index_ < values_->size() && - (*values_)[current_index_] <= value) { - if ((*values_)[current_index_] == value) { - current_index_++; - return true; - } - current_index_++; - } - return false; - } - - private: - int current_index_; - const std::vector* values_; -}; -} // namespace bool XlaKernelCreator::CanCreateKernel(const FunctionLibraryRuntime& flr, const NodeDef& node_def) const { - const FunctionDef* function_def = - flr.GetFunctionLibraryDefinition()->Find(node_def.name()); - if (function_def == nullptr) { - // The node def is not calling a function. Individual ops can be - // run directly using on-demand mode, no need to create XlaLaunch - // kernel for them. - return false; - } - - // If kXlaCompileAttr is set on the node_def, use its value. - const auto& it = node_def.attr().find(kXlaCompileAttr); - if (it != node_def.attr().end()) { - return it->second.b(); - } - - // kXlaCompileAttr is not set on node_def, check if it is set on - // FunctionDef. - bool xla_compile = false; - Status status = flr.GetFunctionLibraryDefinition()->GetAttr( - node_def, kXlaCompileAttr, &xla_compile); - if (!status.ok() || !xla_compile) { - if (VLOG_IS_ON(3)) { - if (!status.ok()) { - VLOG(3) << "No " << kXlaCompileAttr << " attr defined for " - << node_def.op() << ". status=" << status.ToString(); - } else { - VLOG(3) << node_def.op() << " is explicitly marked not to be compiled"; - } - } - return false; - } - return true; -} - -// Given a FunctionLibraryRuntime and a NodeDef calling a function in the -// runtime, returns this function's body in `fbody` as well as the indices -// of its constant and resource arguments. -// `fbody` is owned by `flr`. -// `constant_arg_indices` and `resource_arg_indices` should be empty vector. -// They are sorted in ascending order on this function's return. -Status GetBodyAndConstantsAndResources(FunctionLibraryRuntime* flr, - const NodeDef& node_def, - const FunctionBody** fbody, - std::vector* constant_arg_indices, - std::vector* resource_arg_indices) { - FunctionLibraryRuntime::Handle handle; - // If node_def is not instantiable, e.g., the function does not exist, - // simply bail out. - TF_RETURN_IF_ERROR( - flr->Instantiate(node_def.op(), AttrSlice(&node_def.attr()), &handle)); - *fbody = flr->GetFunctionBody(handle); - CHECK(*fbody); // Can't be nullptr since we just instantiated it. - const DataTypeVector& arg_types = (*fbody)->arg_types; - std::vector const_args(arg_types.size()); - // If we can't analyze the const args. Bail out. - TF_RETURN_IF_ERROR( - BackwardsConstAnalysis(*((*fbody)->graph), &const_args, - /*compile_time_const_nodes=*/nullptr, flr)); - - for (int i = 0; i < const_args.size(); ++i) { - if (const_args[i]) { - constant_arg_indices->push_back(i); - } - } - - // There can be hundreds of resource variables. Reserve the space for them. - // We don't reserve for constants above as they are usually few. 
- resource_arg_indices->reserve(arg_types.size()); - for (int i = 0; i < arg_types.size(); ++i) { - if (arg_types[i] == DT_RESOURCE) { - resource_arg_indices->push_back(i); - } - } - - return Status::OK(); + return CanCreateXlaKernel(flr, node_def); } Status XlaKernelCreator::CreateKernel(FunctionLibraryRuntime* flr, const NodeDef& node_def, std::unique_ptr* kernel) const { - if (!CanCreateKernel(*flr, node_def)) { - return errors::Internal("Invalid node: ", node_def.ShortDebugString()); - } - - VLOG(3) << "Attempting to create XlaLaunchOp for " << node_def.DebugString(); - - // Make sure that kernels have been registered on the JIT device. - XlaOpRegistry::RegisterCompilationKernels(); - std::vector - uncompilable_node_info; - if (!IsCompilable(flr, node_def, &uncompilable_node_info)) { - string message = absl::StrCat( - "Function invoked by the following node is not compilable: ", - node_def.ShortDebugString(), ".\n"); - absl::StrAppend(&message, "Uncompilable nodes:\n"); - for (const auto& node_info : uncompilable_node_info) { - string node_message = - absl::StrCat("\t", node_info.name, ": ", - node_info.uncompilable_reason, "\n", "\tStacktrace:\n"); - for (const auto& stack_frame : node_info.stack_trace) { - absl::StrAppendFormat(&node_message, "\t\tNode: %s, function: %s\n", - stack_frame.name, stack_frame.function_name); - } - absl::StrAppend(&message, node_message); - } - VLOG(1) << message; - // node_def is calling a function that XLA can't compile. - return errors::InvalidArgument(message); - } - - // Get function body, constant args, and resource args. - const FunctionBody* fbody = nullptr; - std::vector constant_arg_indices; - std::vector resource_arg_indices; - TF_RETURN_IF_ERROR(GetBodyAndConstantsAndResources( - flr, node_def, &fbody, &constant_arg_indices, &resource_arg_indices)); - - // Set input and output memory types. - MemoryTypeVector input_memory_types(fbody->arg_types.size(), DEVICE_MEMORY); - // These indices are used only for optimization purposes. They allow us - // to loop over constant_arg_indices and resource_arg_indices only once - // while iterating over all the function arguments checking if it is a - // resource or a constant. - // The reason we optimized this code is because functions can have a lot of - // captured arguments. For example, the backward pass of ResNet50 takes in all - // 214 variables and a similar number of activations. - SinglePassSearch constants_search(&constant_arg_indices); - SinglePassSearch resources_search(&resource_arg_indices); - for (int i = 0; i < fbody->arg_types.size(); ++i) { - if (resources_search.ScanForValue(i) || constants_search.ScanForValue(i)) { - // Compile-time constants and resource handles are expected to be in - // host memory. - input_memory_types[i] = HOST_MEMORY; - } - } - // One might wonder, about the case where a compile-time constant argument - // (which must be in host memory) is also used as an input into an op, - // e.g. Add, that expects its inputs in device memory. Here is how it - // works now. - // First, what do we mean by "op expects an input in XYZ memory"? - // There are two types of "ops" here: the tf2xla kernel and the HLO - // computation it builds. The tf2xla kernel needs to retrieve the actual - // numeric value of the compile-time constant tensors, so it really expects - // them to be on in host memory. However, for other inputs, it refers to them - // using xla::ComputationDataHandle, which is just a symbolic handle that - // xla::ComputationBuilder assigns. 
How does this handle gets assigned for - // constant arguments? Even constant arguments get an _Arg node in the graph - // instatiated for Function compilation. The tf2xla kernel for constant _Arg - // nodes takes the constant value, converts it to XlaLiteral, and feeds it - // to xla::ComputationBuilder.ConstantLiteral, which returns the handle. This - // constant XlaLiteral is included in the HLO graph, and subsequently, in - // the actual executable, which is copied to the device before being - // executed. Thus, when this executable runs, the constant is available in - // device memory. - - // XlaLaunch kernel keeps all outputs (including constants, which it copies), - // in device memory except for resources. - MemoryTypeVector output_memory_types(fbody->ret_types.size(), DEVICE_MEMORY); - for (int i = 0; i < fbody->ret_types.size(); ++i) { - if (fbody->ret_types[i] == DT_RESOURCE) { - output_memory_types[i] = HOST_MEMORY; - } - } - - // Create the kernel. - NameAttrList function; - function.set_name(node_def.op()); - *(function.mutable_attr()) = node_def.attr(); - - Device* dev = flr->device(); - Status s; - OpKernelConstruction construction( - DeviceType(dev->device_type()), dev, - dev->GetAllocator(AllocatorAttributes()), &node_def, - &fbody->fdef.signature(), flr, fbody->arg_types, input_memory_types, - fbody->ret_types, output_memory_types, flr->graph_def_version(), &s); - - *kernel = absl::make_unique( - &construction, constant_arg_indices, resource_arg_indices, function); - return s; + return CreateXlaKernel(flr, node_def, kernel); } namespace { diff --git a/tensorflow/compiler/jit/xla_kernel_creator.h b/tensorflow/compiler/jit/xla_kernel_creator.h index 739cf02d877..8815ee49ce5 100644 --- a/tensorflow/compiler/jit/xla_kernel_creator.h +++ b/tensorflow/compiler/jit/xla_kernel_creator.h @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_JIT_CREATE_XLA_LAUNCH_OP_H_ -#define TENSORFLOW_COMPILER_JIT_CREATE_XLA_LAUNCH_OP_H_ +#ifndef TENSORFLOW_COMPILER_JIT_XLA_KERNEL_CREATOR_H_ +#define TENSORFLOW_COMPILER_JIT_XLA_KERNEL_CREATOR_H_ #include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/node_def.pb.h" @@ -39,4 +39,4 @@ class XlaKernelCreator : public CustomKernelCreator { } // namespace tensorflow -#endif // TENSORFLOW_COMPILER_JIT_CREATE_XLA_LAUNCH_OP_H_ +#endif // TENSORFLOW_COMPILER_JIT_XLA_KERNEL_CREATOR_H_ diff --git a/tensorflow/compiler/jit/xla_kernel_creator_util.cc b/tensorflow/compiler/jit/xla_kernel_creator_util.cc new file mode 100644 index 00000000000..96bde65003f --- /dev/null +++ b/tensorflow/compiler/jit/xla_kernel_creator_util.cc @@ -0,0 +1,259 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/compiler/jit/xla_kernel_creator_util.h" + +#include "absl/memory/memory.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" +#include "tensorflow/compiler/jit/compilability_check_util.h" +#include "tensorflow/compiler/jit/defs.h" +#include "tensorflow/compiler/jit/kernels/xla_ops.h" +#include "tensorflow/compiler/jit/mark_for_compilation_pass.h" +#include "tensorflow/compiler/tf2xla/const_analysis.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/util/ptr_util.h" + +namespace tensorflow { +namespace { + +// Utility which searches for values in a sorted list by scanning over it once. +// No matter how many times ScanForValue is called, the list is scanned at most +// once. However, if a call to ScanForValue skips over a value, that value is +// not revisited in future calls to ScanForValue, so callers must take +// care to order their calls. +// +// Useful for merging multiple sorted lists in O(n) time. +class SinglePassSearch { + public: + // Creates a SinglePassSearch object that can be used to search in `values`. + // Does not take ownership of `values`. `values` must outlive this. + // `values` must be sorted. + explicit SinglePassSearch(const std::vector* values) + : current_index_(0), values_(values) {} + + // Scans forward in the vector looking for "value", updating the internal + // position in to the vector. + // Returns true iff the vector contains the given value at or after current + // position. + // Not thread-safe. + bool ScanForValue(int value) { + while (current_index_ < values_->size() && + (*values_)[current_index_] <= value) { + if ((*values_)[current_index_] == value) { + current_index_++; + return true; + } + current_index_++; + } + return false; + } + + private: + int current_index_; + const std::vector* values_; +}; +} // namespace + +bool CanCreateXlaKernel(const FunctionLibraryRuntime& flr, + const NodeDef& node_def) { + const FunctionDef* function_def = + flr.GetFunctionLibraryDefinition()->Find(node_def.name()); + if (function_def == nullptr) { + // The node def is not calling a function. Individual ops can be + // run directly using on-demand mode, no need to create XlaLaunch + // kernel for them. + return false; + } + + // If kXlaCompileAttr is set on the node_def, use its value. + const auto& it = node_def.attr().find(kXlaCompileAttr); + if (it != node_def.attr().end()) { + return it->second.b(); + } + + // kXlaCompileAttr is not set on node_def, check if it is set on + // FunctionDef. + bool xla_compile = false; + Status status = flr.GetFunctionLibraryDefinition()->GetAttr( + node_def, kXlaCompileAttr, &xla_compile); + if (!status.ok() || !xla_compile) { + if (VLOG_IS_ON(3)) { + if (!status.ok()) { + VLOG(3) << "No " << kXlaCompileAttr << " attr defined for " + << node_def.op() << ". status=" << status.ToString(); + } else { + VLOG(3) << node_def.op() << " is explicitly marked not to be compiled"; + } + } + return false; + } + return true; +} + +// Given a FunctionLibraryRuntime and a NodeDef calling a function in the +// runtime, returns this function's body in `fbody` as well as the indices +// of its constant and resource arguments. +// `fbody` is owned by `flr`. +// `constant_arg_indices` and `resource_arg_indices` should be empty vector. 
+// They are sorted in ascending order on this function's return. +Status GetBodyAndConstantsAndResources(FunctionLibraryRuntime* flr, + const NodeDef& node_def, + const FunctionBody** fbody, + std::vector* constant_arg_indices, + std::vector* resource_arg_indices) { + FunctionLibraryRuntime::Handle handle; + // If node_def is not instantiable, e.g., the function does not exist, + // simply bail out. + TF_RETURN_IF_ERROR( + flr->Instantiate(node_def.op(), AttrSlice(&node_def.attr()), &handle)); + *fbody = flr->GetFunctionBody(handle); + CHECK(*fbody); // Can't be nullptr since we just instantiated it. + const DataTypeVector& arg_types = (*fbody)->arg_types; + std::vector const_args(arg_types.size()); + // If we can't analyze the const args. Bail out. + TF_RETURN_IF_ERROR( + BackwardsConstAnalysis(*((*fbody)->graph), &const_args, + /*compile_time_const_nodes=*/nullptr, flr)); + + for (int i = 0; i < const_args.size(); ++i) { + if (const_args[i]) { + constant_arg_indices->push_back(i); + } + } + + // There can be hundreds of resource variables. Reserve the space for them. + // We don't reserve for constants above as they are usually few. + resource_arg_indices->reserve(arg_types.size()); + for (int i = 0; i < arg_types.size(); ++i) { + if (arg_types[i] == DT_RESOURCE) { + resource_arg_indices->push_back(i); + } + } + + return Status::OK(); +} + +Status CreateXlaKernel(FunctionLibraryRuntime* flr, const NodeDef& node_def, + std::unique_ptr* kernel) { + if (!CanCreateXlaKernel(*flr, node_def)) { + return errors::Internal("Invalid node: ", node_def.ShortDebugString()); + } + + VLOG(3) << "Attempting to create XlaLaunchOp for " << node_def.DebugString(); + + // Make sure that kernels have been registered on the JIT device. + XlaOpRegistry::RegisterCompilationKernels(); + RecursiveCompilabilityChecker::UncompilableNodesMap uncompilable_nodes_map; + if (!IsCompilable(flr, node_def, &uncompilable_nodes_map)) { + std::vector + uncompilable_node_info; + for (const auto& it : uncompilable_nodes_map) { + for (const auto& info : it.second.second) { + uncompilable_node_info.emplace_back(info); + } + } + string message = absl::StrCat( + "Function invoked by the following node is not compilable: ", + node_def.ShortDebugString(), ".\n"); + absl::StrAppend(&message, "Uncompilable nodes:\n"); + for (const auto& node_info : uncompilable_node_info) { + string node_message = + absl::StrCat("\t", node_info.name, ": ", + node_info.uncompilable_reason, "\n", "\tStacktrace:\n"); + for (const auto& stack_frame : node_info.stack_trace) { + absl::StrAppendFormat(&node_message, "\t\tNode: %s, function: %s\n", + stack_frame.name, stack_frame.function_name); + } + absl::StrAppend(&message, node_message); + } + VLOG(1) << message; + // node_def is calling a function that XLA can't compile. + return errors::InvalidArgument(message); + } + + // Get function body, constant args, and resource args. + const FunctionBody* fbody = nullptr; + std::vector constant_arg_indices; + std::vector resource_arg_indices; + TF_RETURN_IF_ERROR(GetBodyAndConstantsAndResources( + flr, node_def, &fbody, &constant_arg_indices, &resource_arg_indices)); + + // Set input and output memory types. + MemoryTypeVector input_memory_types(fbody->arg_types.size(), DEVICE_MEMORY); + // These indices are used only for optimization purposes. They allow us + // to loop over constant_arg_indices and resource_arg_indices only once + // while iterating over all the function arguments checking if it is a + // resource or a constant. 
+ // The reason we optimized this code is because functions can have a lot of + // captured arguments. For example, the backward pass of ResNet50 takes in all + // 214 variables and a similar number of activations. + SinglePassSearch constants_search(&constant_arg_indices); + SinglePassSearch resources_search(&resource_arg_indices); + for (int i = 0; i < fbody->arg_types.size(); ++i) { + if (resources_search.ScanForValue(i) || constants_search.ScanForValue(i)) { + // Compile-time constants and resource handles are expected to be in + // host memory. + input_memory_types[i] = HOST_MEMORY; + } + } + // One might wonder, about the case where a compile-time constant argument + // (which must be in host memory) is also used as an input into an op, + // e.g. Add, that expects its inputs in device memory. Here is how it + // works now. + // First, what do we mean by "op expects an input in XYZ memory"? + // There are two types of "ops" here: the tf2xla kernel and the HLO + // computation it builds. The tf2xla kernel needs to retrieve the actual + // numeric value of the compile-time constant tensors, so it really expects + // them to be on in host memory. However, for other inputs, it refers to them + // using xla::ComputationDataHandle, which is just a symbolic handle that + // xla::ComputationBuilder assigns. How does this handle gets assigned for + // constant arguments? Even constant arguments get an _Arg node in the graph + // instatiated for Function compilation. The tf2xla kernel for constant _Arg + // nodes takes the constant value, converts it to XlaLiteral, and feeds it + // to xla::ComputationBuilder.ConstantLiteral, which returns the handle. This + // constant XlaLiteral is included in the HLO graph, and subsequently, in + // the actual executable, which is copied to the device before being + // executed. Thus, when this executable runs, the constant is available in + // device memory. + + // XlaLaunch kernel keeps all outputs (including constants, which it copies), + // in device memory except for resources. + MemoryTypeVector output_memory_types(fbody->ret_types.size(), DEVICE_MEMORY); + for (int i = 0; i < fbody->ret_types.size(); ++i) { + if (fbody->ret_types[i] == DT_RESOURCE) { + output_memory_types[i] = HOST_MEMORY; + } + } + + // Create the kernel. + NameAttrList function; + function.set_name(node_def.op()); + *(function.mutable_attr()) = node_def.attr(); + + Device* dev = flr->device(); + Status s; + OpKernelConstruction construction( + DeviceType(dev->device_type()), dev, + dev->GetAllocator(AllocatorAttributes()), &node_def, + &fbody->fdef.signature(), flr, fbody->arg_types, input_memory_types, + fbody->ret_types, output_memory_types, flr->graph_def_version(), &s); + + *kernel = absl::make_unique( + &construction, constant_arg_indices, resource_arg_indices, function); + return s; +} +} // namespace tensorflow diff --git a/tensorflow/compiler/jit/xla_kernel_creator_util.h b/tensorflow/compiler/jit/xla_kernel_creator_util.h new file mode 100644 index 00000000000..71398c334fc --- /dev/null +++ b/tensorflow/compiler/jit/xla_kernel_creator_util.h @@ -0,0 +1,39 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_JIT_XLA_KERNEL_CREATOR_UTIL_H_ +#define TENSORFLOW_COMPILER_JIT_XLA_KERNEL_CREATOR_UTIL_H_ + +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { + +class FunctionLibraryRuntime; +class OpKernel; + + // Given a NodeDef 'node_def' and the function library runtime 'flr', returns + // true if 'node_def' is a call to a compilable function defined in 'flr', + // with the kXlaCompileAttr set. +bool CanCreateXlaKernel(const FunctionLibraryRuntime& flr, + const NodeDef& node_def); + +// Given a supported NodeDef, returns a XlaLaunchOp that computes the node. +Status CreateXlaKernel(FunctionLibraryRuntime* flr, const NodeDef& node_def, + std::unique_ptr* kernel); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_XLA_KERNEL_CREATOR_UTIL_H_ diff --git a/tensorflow/compiler/jit/xla_launch_util.cc b/tensorflow/compiler/jit/xla_launch_util.cc index e9c4eb6e8ee..176c39aeb4c 100644 --- a/tensorflow/compiler/jit/xla_launch_util.cc +++ b/tensorflow/compiler/jit/xla_launch_util.cc @@ -42,6 +42,13 @@ namespace tensorflow { namespace { using xla::ScopedShapedBuffer; using xla::ShapedBuffer; + +const char kPossibleNonVariableResourceHintMessage[] = + "If the error is similar to `Trying to access resource using the wrong " + "type`, this is likely because XLA only accepts Resource Variables as " + "inputs by snapshotting their values. Other TensorFlow resource types like " + "TensorList/TensorArray/Stack are not supported. 
Try removing non-variable " + "resource inputs to XLA."; } // anonymous namespace VariableInfo::VariableInfo(int index, Var* var) : index_(index), var_(var) {} @@ -88,7 +95,12 @@ static Status GetVariableInfosFromCtxInputs( [&](int variable_idx) { return &HandleFromInput(ctx, variable_idx); }); std::vector> variables; - TF_RETURN_IF_ERROR(LookupResources(ctx, resource_handles, &variables)); + + Status s = LookupResources(ctx, resource_handles, &variables); + if (!s.ok()) { + errors::AppendToMessage(&s, kPossibleNonVariableResourceHintMessage); + return s; + } result->clear(); result->reserve(variable_indices.size()); @@ -235,9 +247,32 @@ void XlaComputationLaunchContext::PopulateInputs( } } +namespace { + +bool MustAliasOutput(const xla::HloInputOutputAliasConfig& input_output_alias, + int output_num) { + xla::ShapeIndex output_index; + if (input_output_alias.shape().IsTuple()) { + output_index = {output_num}; + } else { + DCHECK_EQ(output_num, 0) + << "output_num must be 0 for non-tuple shapes but is " << output_num; + output_index = {}; + } + if (input_output_alias.shape().tuple_shapes_size() == 0) { + return false; + } + return input_output_alias.OutputHasAlias(output_index) && + input_output_alias.GetAliasedParameter(output_index).value().kind == + xla::HloInputOutputAliasConfig::kUserAlias; +} + +} // namespace + Status XlaComputationLaunchContext::PopulateOutputs( OpKernelContext* ctx, const XlaCompiler::CompilationResult* kernel, - ScopedShapedBuffer output, int missing_ctx_input_prefix) { + ScopedShapedBuffer output, int missing_ctx_input_prefix, + const xla::HloInputOutputAliasConfig& input_output_alias) { se::Stream* stream = ctx->op_device_context() ? ctx->op_device_context()->stream() : nullptr; @@ -331,8 +366,16 @@ Status XlaComputationLaunchContext::PopulateOutputs( << "Invalid input for outputs " << i << ": " << input_index; ctx->set_output(i, ctx->input(input_index)); } else { + if (MustAliasOutput(input_output_alias, output_num)) { + DCHECK(output.buffer({output_num}).is_null()) + << "Expected output buffer to be aliased, but it is not nil."; + } se::DeviceMemoryBase buffer = output.buffer({output_num}); if (allocate_xla_tensors_) { + if (MustAliasOutput(input_output_alias, output_num)) { + return errors::Unimplemented( + "Aliasing is not yet supported for allocate_xla_tensors_."); + } Tensor* output_tensor; TF_RETURN_IF_ERROR(ctx->allocate_output(i, shape, &output_tensor)); XlaTensor* xla_tensor = XlaTensor::FromTensor(output_tensor); @@ -347,8 +390,18 @@ Status XlaComputationLaunchContext::PopulateOutputs( CHECK_EQ(output_tensor->TotalBytes(), 0); } } else { + bool is_aliased = false; + if (MustAliasOutput(input_output_alias, output_num)) { + int xla_param = input_output_alias.GetAliasedParameter({output_num}) + .value() + .parameter_number; + DCHECK(arg_ptrs_[xla_param] != nullptr); + buffer = arg_ptrs_[xla_param]->buffer({}); + is_aliased = true; + } Tensor output_tensor = XlaTensorBuffer::MakeTensor( - ctx->expected_output_dtype(i), shape, buffer, allocator); + ctx->expected_output_dtype(i), shape, + /*unref_buffer=*/!is_aliased, buffer, allocator); output.set_buffer(se::OwningDeviceMemory(), {output_num}); ctx->set_output(i, output_tensor); } @@ -412,7 +465,7 @@ Status XlaComputationLaunchContext::PopulateOutputs( se::DeviceMemoryBase buffer = output.buffer({output_num}); output.set_buffer(se::OwningDeviceMemory(), {output_num}); Tensor output_tensor = XlaTensorBuffer::MakeTensor( - write.type, write.shape, buffer, allocator); + write.type, write.shape, 
/*unref_buffer=*/true, buffer, allocator); *variable_infos[i].var()->tensor() = output_tensor; } ++output_num; diff --git a/tensorflow/compiler/jit/xla_launch_util.h b/tensorflow/compiler/jit/xla_launch_util.h index 429ff0a065c..3df36e25daa 100644 --- a/tensorflow/compiler/jit/xla_launch_util.h +++ b/tensorflow/compiler/jit/xla_launch_util.h @@ -149,10 +149,10 @@ class XlaComputationLaunchContext { // // Assumes that the first `missing_ctx_input_prefix` inputs to the kernel are // missing and adjusts input indices accordingly. - Status PopulateOutputs(OpKernelContext* ctx, - const XlaCompiler::CompilationResult* kernel, - xla::ScopedShapedBuffer output, - int missing_ctx_input_prefix); + Status PopulateOutputs( + OpKernelContext* ctx, const XlaCompiler::CompilationResult* kernel, + xla::ScopedShapedBuffer output, int missing_ctx_input_prefix, + const xla::HloInputOutputAliasConfig& input_output_alias); // Return the argument list. Only valid after PopulateInputs() has been // called. @@ -193,12 +193,15 @@ class XlaTensorBuffer : public TensorBuffer { } static Tensor MakeTensor(DataType dtype, const TensorShape& shape, - se::DeviceMemoryBase buffer, Allocator* allocator) { + bool unref_buffer, se::DeviceMemoryBase buffer, + Allocator* allocator) { size_t expected_size = shape.num_elements() * DataTypeSize(dtype); auto* tensor_buffer = new XlaTensorBuffer(buffer.opaque(), expected_size, buffer.size(), allocator); Tensor t(dtype, shape, tensor_buffer); - tensor_buffer->Unref(); + if (unref_buffer) { + tensor_buffer->Unref(); + } return t; } diff --git a/tensorflow/compiler/mlir/BUILD b/tensorflow/compiler/mlir/BUILD index 247bb83e7f7..1e556822f4b 100644 --- a/tensorflow/compiler/mlir/BUILD +++ b/tensorflow/compiler/mlir/BUILD @@ -19,10 +19,23 @@ filegroup( srcs = glob(["**/*.td"]), ) +cc_library( + name = "op_name_mapper", + srcs = ["op_name_mapper.cc"], + hdrs = ["op_name_mapper.h"], + deps = [ + "@com_google_absl//absl/container:flat_hash_map", + "@llvm//:support", + "@local_config_mlir//:IR", + ], +) + cc_library( name = "tf_mlir_opt_main", srcs = ["tf_mlir_opt_main.cc"], + copts = ["-std=c++14"], deps = [ + ":init_mlir", "//tensorflow/compiler/mlir/lite:tensorflow_lite", "//tensorflow/compiler/mlir/lite:tensorflow_lite_dialect_registration", "//tensorflow/compiler/mlir/lite:tensorflow_lite_legalize_tf", @@ -31,12 +44,14 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:tensorflow_dialect_registration", "//tensorflow/compiler/mlir/tensorflow:tf_dialect_passes", - "//tensorflow/compiler/mlir/xla", + "//tensorflow/compiler/mlir/xla:hlo", + "//tensorflow/compiler/mlir/xla:lhlo", "//tensorflow/compiler/mlir/xla:xla_dialect_registration", "//tensorflow/compiler/mlir/xla:xla_legalize_control_flow", "//tensorflow/compiler/mlir/xla:xla_legalize_tf", "//tensorflow/compiler/mlir/xla:xla_legalize_to_standard", "//tensorflow/core:lib", + "//tensorflow/core/platform:logging", "@llvm//:support", "@local_config_mlir//:AffineDialectRegistration", "@local_config_mlir//:MlirOptLib", @@ -49,16 +64,29 @@ cc_library( ], ) +cc_library( + name = "init_mlir", + srcs = ["init_mlir.cc"], + hdrs = ["init_mlir.h"], + deps = [ + "//tensorflow/core:lib", + "@llvm//:support", + ], +) + tf_cc_binary( name = "tf-opt", deps = [ ":tf_mlir_opt_main", + "//tensorflow/compiler/mlir/tensorflow:tf_graph_optimization_pass", ], ) tf_cc_binary( name = "tf-mlir-translate", + srcs = ["tf_mlir_translate_main.cc"], deps = [ + ":init_mlir", 
"//tensorflow/compiler/mlir/tensorflow:convert_graphdef", "//tensorflow/compiler/mlir/tensorflow:mlir_roundtrip_flags", "//tensorflow/compiler/mlir/tensorflow:translate_cl_options", @@ -66,12 +94,14 @@ tf_cc_binary( "//tensorflow/compiler/mlir/tensorflow:translate_registration", "//tensorflow/compiler/mlir/tensorflow:translate_tf_dialect_op", "//tensorflow/compiler/mlir/xla:xla_mlir_translate", + "//tensorflow/core:lib", "//tensorflow/core:protos_all_proto_cc", "//tensorflow/stream_executor/lib", "@llvm//:support", "@local_config_mlir//:IR", + "@local_config_mlir//:Support", + "@local_config_mlir//:TranslateClParser", "@local_config_mlir//:Translation", - "@local_config_mlir//:tools/mlir-translate/mlir-translate", ], ) diff --git a/tensorflow/compiler/mlir/init_mlir.cc b/tensorflow/compiler/mlir/init_mlir.cc new file mode 100644 index 00000000000..54f8a57d8a6 --- /dev/null +++ b/tensorflow/compiler/mlir/init_mlir.cc @@ -0,0 +1,45 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/init_mlir.h" + +#include "tensorflow/core/platform/init_main.h" + +namespace tensorflow { + +InitMlir::InitMlir(int *argc, char ***argv) : init_llvm_(*argc, *argv) { + constexpr char kSeparator[] = "--"; + + // Find index of separator between two sets of flags. + int pass_remainder = 1; + bool split = false; + for (int i = 0; i < *argc; ++i) { + if (llvm::StringRef((*argv)[i]) == kSeparator) { + pass_remainder = i; + *argc -= (i + 1); + split = true; + break; + } + } + + tensorflow::port::InitMain((*argv)[0], &pass_remainder, argv); + if (split) { + *argc += pass_remainder; + (*argv)[1] = (*argv)[0]; + ++*argv; + } +} + +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/init_mlir.h b/tensorflow/compiler/mlir/init_mlir.h new file mode 100644 index 00000000000..91020c1758b --- /dev/null +++ b/tensorflow/compiler/mlir/init_mlir.h @@ -0,0 +1,40 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_INIT_MLIR_H_ +#define TENSORFLOW_COMPILER_MLIR_INIT_MLIR_H_ + +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/InitLLVM.h" + +namespace tensorflow { + +// Initializer to perform both InitLLVM and TF"s InitMain initialization. 
+// InitMain also performs flag parsing and '--' is used to separate flags passed +// to it: Flags before the first '--' are parsed by InitMain and argc and argv +// progressed to the flags post. If there is no separator, then no flags are +// parsed by InitMain and argc/argv left unadjusted. +// TODO(jpienaar): The way help flag is handled could be improved. +class InitMlir { + public: + InitMlir(int *argc, char ***argv); + + private: + llvm::InitLLVM init_llvm_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_INIT_MLIR_H_ diff --git a/tensorflow/compiler/mlir/lite/BUILD b/tensorflow/compiler/mlir/lite/BUILD index 7846716e9dd..663740bf692 100644 --- a/tensorflow/compiler/mlir/lite/BUILD +++ b/tensorflow/compiler/mlir/lite/BUILD @@ -26,8 +26,8 @@ filegroup( name = "tensorflow_lite_ops_td_files", srcs = [ "ir/tfl_ops.td", + "//tensorflow/compiler/mlir/lite/quantization:quantization_td_files", "@local_config_mlir//:OpBaseTdFiles", - "@local_config_mlir//:QuantizationOpsTdFiles", ], ) @@ -146,6 +146,7 @@ cc_library( hdrs = [ "utils/validators.h", ], + copts = ["-std=c++14"], deps = [ "@local_config_mlir//:Dialect", "@local_config_mlir//:IR", @@ -166,8 +167,9 @@ cc_library( "ir/tfl_traits.h", "transforms/passes.h", "utils/attribute_utils.h", - "utils/quantization_utils.h", + "//tensorflow/compiler/mlir/lite/quantization:quantization_traits.h", ], + copts = ["-std=c++14"], deps = [ ":tensorflow_lite_ops_inc_gen", ":validators", @@ -181,51 +183,36 @@ cc_library( "@local_config_mlir//:QuantOps", "@local_config_mlir//:StandardOps", "@local_config_mlir//:Support", - "@local_config_mlir//:TypeUtilities", ], alwayslink = 1, ) -cc_library( - name = "tensorflow_lite_quantization_utils", - srcs = [ - "utils/generated_op_quant_spec_getters.inc", - "utils/quantization_driver.cc", - "utils/quantization_utils.cc", - ], - hdrs = [ - "utils/quantization_utils.h", - ], - deps = [ - ":tensorflow_lite", - "//tensorflow/core:lib_proto_parsing", - "@com_google_absl//absl/memory", - "@llvm//:support", - "@local_config_mlir//:IR", - "@local_config_mlir//:QuantOps", - "@local_config_mlir//:StandardOps", - "@local_config_mlir//:Support", - ], -) - cc_library( name = "tensorflow_lite_legalize_tf", srcs = [ + "transforms/extract_ophint.cc", "transforms/generated_legalize_tf.inc", "transforms/generated_lower_static_tensor_list.inc", "transforms/generated_prepare_tf.inc", + "transforms/legalize_ophint_func_op.cc", "transforms/legalize_tf.cc", "transforms/lower_static_tensor_list.cc", + "transforms/prepare_composite_functions_tf.cc", "transforms/prepare_tf.cc", + "transforms/trim_functions_tf.cc", ], hdrs = [ "transforms/passes.h", ], + copts = ["-std=c++14"], deps = [ + ":common", ":tensorflow_lite", - ":tensorflow_lite_quantization_utils", ":validators", + "//tensorflow/compiler/mlir/lite/quantization:quantization_lib", "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/core:framework", + "//tensorflow/core:lib", "@com_google_absl//absl/memory", "@llvm//:support", "@local_config_mlir//:Analysis", @@ -234,7 +221,6 @@ cc_library( "@local_config_mlir//:QuantOps", "@local_config_mlir//:StandardOps", "@local_config_mlir//:Support", - "@local_config_mlir//:TypeUtilities", ], alwayslink = 1, ) @@ -248,13 +234,16 @@ cc_library( hdrs = [ "transforms/passes.h", ], + copts = ["-std=c++14"], deps = [ ":tensorflow_lite", ":validators", + "//tensorflow/compiler/mlir/tensorflow", "@llvm//:support", "@local_config_mlir//:Analysis", "@local_config_mlir//:IR", "@local_config_mlir//:Pass", + 
"@local_config_mlir//:StandardOps", "@local_config_mlir//:Support", ], alwayslink = 1, @@ -267,14 +256,16 @@ cc_library( "transforms/post_quantize.cc", "transforms/prepare_quantize.cc", "transforms/quantize.cc", + "utils/generated_op_quant_spec_getters.inc", ], hdrs = [ "transforms/passes.h", ], + copts = ["-std=c++14"], deps = [ ":tensorflow_lite", - ":tensorflow_lite_quantization_utils", ":validators", + "//tensorflow/compiler/mlir/lite/quantization:quantization_lib", "@com_google_absl//absl/memory", "@llvm//:support", "@local_config_mlir//:Analysis", @@ -287,32 +278,26 @@ cc_library( alwayslink = 1, ) -tf_native_cc_binary( - name = "op_quant_spec_getters_gen", +filegroup( + name = "generated_op_quant_spec_getters", srcs = [ - "tools/op_quant_spec_getters_gen.cc", - ], - deps = [ - "@llvm//:support", - "@llvm//:tablegen", - "@local_config_mlir//:TableGen", + "utils/generated_op_quant_spec_getters.inc", ], ) genrule( name = "op_quant_spec_getters_inc", srcs = [ - "@local_config_mlir//:include/mlir/Dialect/QuantOps/QuantPredicates.td", - "@local_config_mlir//:include/mlir/IR/OpBase.td", - ":ir/tfl_ops.td", + "ir/tfl_ops.td", + "//tensorflow/compiler/mlir/lite/quantization:quantization_td_files", ], outs = [ "utils/generated_op_quant_spec_getters.inc", ], - cmd = ("$(location :op_quant_spec_getters_gen) " + + cmd = ("$(location //tensorflow/compiler/mlir/lite/quantization:op_quant_spec_getters_gen) " + "-I external/local_config_mlir/include " + "$(location //tensorflow/compiler/mlir/lite:ir/tfl_ops.td) " + " -o $@"), - tools = [":op_quant_spec_getters_gen"], + tools = ["//tensorflow/compiler/mlir/lite/quantization:op_quant_spec_getters_gen"], ) # Library with tensorflow Lite dialect static initialization. @@ -321,6 +306,7 @@ cc_library( srcs = [ "ir/dialect_registration.cc", ], + copts = ["-std=c++14"], deps = [ ":tensorflow_lite", "@local_config_mlir//:IR", @@ -329,9 +315,9 @@ cc_library( ) tf_native_cc_binary( - name = "operator-writer-gen", + name = "operator-converter-gen", srcs = [ - "operator_writer_gen.cc", + "operator_converter_gen.cc", ], deps = [ "@llvm//:support", @@ -341,30 +327,30 @@ tf_native_cc_binary( ) genrule( - name = "operator_writer_inc", + name = "operator_converter_inc", srcs = [ - "@local_config_mlir//:include/mlir/Dialect/QuantOps/QuantPredicates.td", - "@local_config_mlir//:include/mlir/IR/OpBase.td", - ":ir/tfl_ops.td", + "ir/tfl_ops.td", + "//tensorflow/compiler/mlir/lite/quantization:quantization_td_files", ], outs = [ - "operator_writers.inc", + "operator_converters.inc", ], - cmd = ("$(location :operator-writer-gen) " + + cmd = ("$(location :operator-converter-gen) " + "-I external/local_config_mlir/include " + "$(location //tensorflow/compiler/mlir/lite:ir/tfl_ops.td) " + " -o $@"), - tools = [":operator-writer-gen"], + tools = [":operator-converter-gen"], ) cc_library( name = "flatbuffer_tflite_operator_lib", srcs = [ "flatbuffer_operator.cc", - "operator_writers.inc", + "operator_converters.inc", ], hdrs = [ "flatbuffer_operator.h", ], + copts = ["-std=c++14"], deps = [ ":tensorflow_lite", "//tensorflow/compiler/mlir/tensorflow", @@ -394,6 +380,7 @@ cc_library( hdrs = [ "emit_error_reporter.h", ], + copts = ["-std=c++14"], deps = [ "//tensorflow/lite/core/api", "@local_config_mlir//:IR", @@ -405,18 +392,23 @@ cc_library( srcs = [ "flatbuffer_import.cc", "flatbuffer_translate.cc", + "utils/convert_type.cc", ], hdrs = [ "flatbuffer_import.h", "flatbuffer_translate.h", + "utils/convert_type.h", ], + copts = ["-std=c++14"], deps = [ 
":flatbuffer_tflite_operator_lib", ":tensorflow_lite", ":tensorflow_lite_dialect_registration", + "//tensorflow/compiler/mlir:op_name_mapper", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:convert_tensor", "//tensorflow/compiler/mlir/tensorflow:export_tf_dialect_op", + "//tensorflow/compiler/mlir/tensorflow:mangling_util", "//tensorflow/compiler/mlir/tensorflow:tensorflow_dialect_registration", "//tensorflow/compiler/xla:statusor", "//tensorflow/core:framework", @@ -426,6 +418,7 @@ cc_library( "//tensorflow/lite:schema_fbs_version", "//tensorflow/lite/delegates/flex:whitelisted_flex_ops_lib", "//tensorflow/lite/schema:schema_fbs", + "@com_google_absl//absl/base", "@com_google_absl//absl/base:core_headers", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", @@ -459,12 +452,24 @@ cc_library( hdrs = [ "tf_tfl_translate_cl.h", ], + copts = ["-std=c++14"], deps = [ "@llvm//:support", ], alwayslink = 1, ) +cc_library( + name = "common", + hdrs = [ + "common/tfl_pass_config.h", + ], + copts = ["-std=c++14"], + deps = [ + "@llvm//:support", + ], +) + filegroup( name = "tf_tfl_translate_main", srcs = [ @@ -476,10 +481,13 @@ tf_cc_binary( name = "tf_tfl_translate", srcs = [":tf_tfl_translate_main"], deps = [ + ":common", ":flatbuffer_translate_lib", ":tensorflow_lite", + ":tf_tfl_passes", ":tf_tfl_translate_cl_options", ":tf_to_tfl_flatbuffer", + "//tensorflow/compiler/mlir:init_mlir", "//tensorflow/compiler/mlir/tensorflow:translate_cl_options", "//tensorflow/core:lib", "//tensorflow/lite:framework", @@ -497,6 +505,7 @@ tf_cc_binary( deps = [ ":flatbuffer_translate_lib", "//tensorflow/core:lib", + "//tensorflow/core/platform:logging", "//tensorflow/core/platform/default/build_config:base", "//tensorflow/lite:framework", "//tensorflow/lite/delegates/flex:delegate", @@ -510,12 +519,42 @@ tf_cc_binary( ], ) +cc_library( + name = "tf_tfl_passes", + srcs = ["tf_tfl_passes.cc"], + hdrs = [ + "tf_tfl_passes.h", + ], + copts = ["-std=c++14"], + deps = [ + ":common", + ":tensorflow_lite_legalize_tf", + ":tensorflow_lite_optimize", + ":tensorflow_lite_quantize", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:error_util", + "//tensorflow/compiler/mlir/tensorflow:tf_dialect_lib", + "//tensorflow/compiler/mlir/tensorflow:tf_dialect_passes", + "//tensorflow/compiler/mlir/tensorflow:translate_lib", + "@llvm//:support", + "@local_config_mlir//:Analysis", + "@local_config_mlir//:IR", + "@local_config_mlir//:Parser", + "@local_config_mlir//:Pass", + "@local_config_mlir//:QuantOps", + "@local_config_mlir//:QuantOpsDialectRegistration", + "@local_config_mlir//:Support", + "@local_config_mlir//:Transforms", + ], +) + cc_library( name = "tf_to_tfl_flatbuffer", srcs = ["tf_to_tfl_flatbuffer.cc"], hdrs = [ "tf_to_tfl_flatbuffer.h", ], + copts = ["-std=c++14"], deps = [ ":flatbuffer_translate_lib", ":tensorflow_lite", diff --git a/tensorflow/compiler/mlir/lite/common/tfl_pass_config.h b/tensorflow/compiler/mlir/lite/common/tfl_pass_config.h new file mode 100644 index 00000000000..3b3ba4dc686 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/common/tfl_pass_config.h @@ -0,0 +1,54 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_COMMON_TFL_PASS_CONFIG_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_COMMON_TFL_PASS_CONFIG_H_ + +#include +#include + +#include "llvm/ADT/ArrayRef.h" + +namespace mlir { +namespace TFL { + +// A config that controls which passes get run as part TFLite converter. +struct PassConfig { + PassConfig() + : emit_builtin_tflite_ops(true), + run_quantize(false), + emit_quant_adaptor_ops(false), + lower_tensor_list_ops(false), + trim_functions_whitelist({}) {} + + // If `emit_builtin_tflite_ops` is true, TF Lite legalization passes will be + // added, which produces TF Lite ops. + bool emit_builtin_tflite_ops; + // If run_quantize is true, quantization passes will be added. + bool run_quantize; + // If `emit_quant_adaptor_ops` is true, Quantize and + // Dequantize ops are added as part of running quantization passes. + bool emit_quant_adaptor_ops; + // If `lower_tensor_list_ops` is true, tensorlist ops will be lowered to basic + // TF ops before legalization to TF Lite dialect. + bool lower_tensor_list_ops; + // The whitelist of functions that would be preserved after trimming. + llvm::ArrayRef trim_functions_whitelist; +}; + +} // namespace TFL +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_COMMON_TFL_PASS_CONFIG_H_ diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_import.cc b/tensorflow/compiler/mlir/lite/flatbuffer_import.cc index 5256013bbce..74cecd6fbb6 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_import.cc +++ b/tensorflow/compiler/mlir/lite/flatbuffer_import.cc @@ -15,13 +15,29 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/flatbuffer_import.h" +#include +#include #include +#include #include +#include +#include "absl/base/casts.h" #include "absl/strings/string_view.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Endian.h" +#include "llvm/Support/FormatVariadic.h" #include "llvm/Support/MemoryBuffer.h" +#include "mlir/Dialect/QuantOps/QuantTypes.h" // TF:local_config_mlir +#include "mlir/Dialect/StandardOps/Ops.h" // TF:local_config_mlir +#include "mlir/IR/Attributes.h" // TF:local_config_mlir #include "mlir/IR/Builders.h" // TF:local_config_mlir #include "mlir/IR/Diagnostics.h" // TF:local_config_mlir #include "mlir/IR/Function.h" // TF:local_config_mlir @@ -31,78 +47,590 @@ limitations under the License. 
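Returning to the PassConfig struct added above in tfl_pass_config.h: callers are expected to start from the defaults and flip only the fields they need. A small sketch, assuming the new header is on the include path and that trim_functions_whitelist is an llvm::ArrayRef of std::string (the template argument is elided above); the function name and whitelist contents are hypothetical:

#include <string>
#include <vector>

#include "llvm/ADT/ArrayRef.h"
#include "tensorflow/compiler/mlir/lite/common/tfl_pass_config.h"

// Sketch of how a converter entry point might fill in the new PassConfig.
mlir::TFL::PassConfig MakeExamplePassConfig(
    const std::vector<std::string>& whitelist) {
  mlir::TFL::PassConfig pass_config;
  pass_config.emit_builtin_tflite_ops = true;  // produce TF Lite ops
  pass_config.lower_tensor_list_ops = true;    // lower tensorlist ops first
  pass_config.run_quantize = false;            // skip quantization passes
  // ArrayRef does not own its storage, so `whitelist` must outlive the
  // returned config.
  pass_config.trim_functions_whitelist = llvm::makeArrayRef(whitelist);
  return pass_config;
}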
#include "mlir/IR/Operation.h" // TF:local_config_mlir #include "mlir/IR/OperationSupport.h" // TF:local_config_mlir #include "mlir/IR/Types.h" // TF:local_config_mlir -#include "mlir/StandardOps/Ops.h" // TF:local_config_mlir +#include "mlir/IR/Value.h" // TF:local_config_mlir #include "mlir/Support/FileUtilities.h" // TF:local_config_mlir #include "mlir/Translation.h" // TF:local_config_mlir +#include "tensorflow/compiler/mlir/lite/flatbuffer_operator.h" #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/lite/utils/convert_type.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/mangling_util.h" #include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/core/framework/tensor.pb.h" +#include "tensorflow/core/framework/tensor_shape.pb.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/lite/model.h" #include "tensorflow/lite/schema/schema_generated.h" +using llvm::ArrayRef; using mlir::Builder; +using mlir::DenseElementsAttr; using mlir::FuncOp; using mlir::Location; using mlir::MLIRContext; using mlir::OpBuilder; +using mlir::Operation; +using mlir::OperationState; using mlir::OwningModuleRef; +using mlir::Value; +using mlir::quant::QuantizedType; using tflite::TensorT; using xla::StatusOr; namespace errors = tensorflow::errors; +namespace tfl = mlir::TFL; namespace { bool IsScalar(const TensorT& tensor) { - // TODO(krzysd): We can't distinguish scalars and unranked tensors + // TODO(b/138222071) We can't distinguish scalars and unranked tensors // Work out a way to handle this and stub out the code until then return tensor.shape.empty() && false; } -StatusOr GetTensorElementType(const TensorT& tensor, - Builder builder) { - switch (tensor.type) { - case tflite::TensorType_FLOAT32: - return builder.getF32Type(); - case tflite::TensorType_FLOAT16: - return builder.getF16Type(); - case tflite::TensorType_INT32: - return builder.getIntegerType(32); - case tflite::TensorType_UINT8: - return builder.getIntegerType(8); - case tflite::TensorType_INT64: - return builder.getIntegerType(64); - case tflite::TensorType_STRING: - return errors::InvalidArgument("String tensors are not supported"); - case tflite::TensorType_BOOL: - return builder.getI1Type(); - case tflite::TensorType_INT16: - return builder.getIntegerType(16); - case tflite::TensorType_COMPLEX64: - return mlir::ComplexType::get(builder.getF32Type()); - case tflite::TensorType_INT8: - return builder.getIntegerType(8); - } - return errors::OutOfRange("Unknown tensor type"); +bool IsQuantized(const TensorT& tensor) { + return (tensor.quantization != nullptr) && + !tensor.quantization->zero_point.empty(); } -StatusOr GetTensorType(const TensorT& tensor, Builder builder) { - TF_ASSIGN_OR_RETURN(auto elem_type, GetTensorElementType(tensor, builder)); - if (IsScalar(tensor)) { +// Create the MLIR NamedLoc location corresponding to a given tensor +Location TensorLoc(const TensorT& tensor, Builder builder, Location base) { + if (tensor.name.empty()) { + return base; + } + return mlir::NameLoc::get(builder.getIdentifier(tensor.name), base); +} + +// Returns the correct type for a quantized tensor +// We have a special case for constants since they have a higher minimum value. 
+StatusOr GetQuantizedType(const TensorT& tensor, Builder builder, + bool is_constant = false) { + tflite::QuantizationParametersT& quant_params = *tensor.quantization; + if (quant_params.details.AsCustomQuantization()) { + return errors::Unimplemented("Cannot handle experimental quantization"); + } + + bool is_signed = true; + mlir::IntegerType storage_type; + if (tensor.type == tflite::TensorType_UINT8) { + is_signed = false; + storage_type = builder.getIntegerType(8); + } else { + auto raw_elem_type = ConvertElementType(tensor.type, builder); + if (!raw_elem_type.isa()) { + return errors::InvalidArgument( + "Quantized tensors must be stored as integers"); + } + storage_type = raw_elem_type.cast(); + } + + // TFlite uses narrow-range [u]int8 for constant buffers of quantized weights. + // Since we don't know which ones are weights, we represent this optimization + // as a change in the storage bounds for the type for all constants of this + // type. + bool is_weight_buffer = is_constant && (storage_type.getWidth() == 8); + + int64_t storage_min = QuantizedType::getDefaultMininumForInteger( + is_signed, storage_type.getWidth()) + + is_weight_buffer; + int64_t storage_max = QuantizedType::getDefaultMaxinumForInteger( + is_signed, storage_type.getWidth()); + uint32_t flags = + is_signed ? mlir::quant::QuantizationFlags::FlagValue::Signed : 0; + + if (0 != quant_params.quantized_dimension) { + llvm::SmallVector scales(quant_params.scale.begin(), + quant_params.scale.end()); + return mlir::quant::UniformQuantizedPerAxisType::get( + flags, storage_type, builder.getF32Type(), scales, + quant_params.zero_point, quant_params.quantized_dimension, storage_min, + storage_max); + } + return mlir::quant::UniformQuantizedType::get( + flags, storage_type, builder.getF32Type(), quant_params.scale.at(0), + quant_params.zero_point.at(0), storage_min, storage_max); +} + +// TODO(b/138222071) Remove shapeless_are_scalars once we can reliably +// make that distinction and don't have to rely on context +// (input to main and constants must have static shape) +StatusOr GetTensorType(const TensorT& tensor, Builder builder, + bool shapeless_are_scalars = false, + bool is_constant = false) { + mlir::Type elem_type = ConvertElementType(tensor.type, builder); + // TODO(b/139554398) Store min/max (even for non-quantized tensors) somewhere + // if it's set + if (IsQuantized(tensor)) { + TF_ASSIGN_OR_RETURN(elem_type, + GetQuantizedType(tensor, builder, is_constant)); + } + + if (IsScalar(tensor) || (shapeless_are_scalars && tensor.shape.empty())) { return builder.getTensorType({}, elem_type); } if (!tensor.shape.empty()) { - llvm::SmallVector shape; - for (int32_t i : tensor.shape) { - shape.push_back(int64_t{i}); - } + llvm::SmallVector shape(tensor.shape.begin(), + tensor.shape.end()); return builder.getTensorType(shape, elem_type); } return builder.getTensorType(elem_type); } +StatusOr OpNameForOpCode(const tflite::OperatorCodeT opcode) { + // TODO(krzysd) Support custom ops + if (opcode.builtin_code == tflite::BuiltinOperator_CUSTOM) { + return errors::Unimplemented("unsupported custom operation: ", + opcode.custom_code); + } + if (opcode.builtin_code == tflite::BuiltinOperator_IF) { + return std::string("tf.If"); + } + if (opcode.builtin_code == tflite::BuiltinOperator_WHILE) { + return std::string("tf.While"); + } + + const char* op_name = tflite::EnumNameBuiltinOperator(opcode.builtin_code); + std::string lowered_name = llvm::StringRef(op_name).lower(); + return llvm::Twine("tfl.", lowered_name).str(); +} + +// 
The buffers in TFLite flatbuffers have their contents stored as a vector of +// bytes that represent little-endian values. +// The read_size parameter is present to allow reading both float16 and float32s +// without a case split. +template +std::vector ReadAsLittleEndian(ArrayRef bytes) { + std::vector ret; + size_t read_size = sizeof(T); + int bytes_len = bytes.size(); + assert(bytes_len % read_size == 0); + + size_t elem_count = bytes_len / read_size; + ret.reserve(elem_count); + + const char* data_ptr = reinterpret_cast(bytes.data()); + for (int i = 0; i < elem_count; i++) { + ret.push_back( + llvm::support::endian::readNext(data_ptr)); + } + return ret; +} + +tensorflow::TensorProto ConvertTfliteConstTensor( + const tflite::TensorT& tensor, const std::vector& buffer) { + tensorflow::TensorProto ret; + ret.set_dtype(TflTypeToTfType(tensor.type)); + + tensorflow::TensorShapeProto* shape = ret.mutable_tensor_shape(); + shape->set_unknown_rank(false); + for (auto dim : tensor.shape) { + shape->add_dim()->set_size(int64_t{dim}); + } + std::string content; + content.assign(reinterpret_cast(buffer.data()), buffer.size()); + ret.set_tensor_content(content); + return ret; +} + +StatusOr ConvertFloatBuffer( + mlir::RankedTensorType shaped_type, mlir::FloatType elem_type, + const std::vector& buffer) { + size_t bytes_len = buffer.size(); + + // The bytes of floats are stored little-endian. + switch (elem_type.getWidth()) { + case 16: { + assert(bytes_len % 2 == 0); + size_t elem_count = bytes_len / 2; + std::vector values; + values.reserve(elem_count); + + const char* data = reinterpret_cast(buffer.data()); + auto& semantics = elem_type.getFloatSemantics(); + + for (int i = 0; i < elem_count; i++) { + uint16_t bit_repr = + llvm::support::endian::readNext(data); + llvm::APInt int_repr(16, bit_repr); + values.emplace_back(semantics, int_repr); + } + + return DenseElementsAttr::get(shaped_type, values); + } + case 32: { + assert(bytes_len % 4 == 0); + size_t elem_count = bytes_len / 4; + std::vector values; + values.reserve(elem_count); + + const char* data = reinterpret_cast(buffer.data()); + + for (int i = 0; i < elem_count; i++) { + uint32_t bit_repr = + llvm::support::endian::readNext(data); + values.push_back(absl::bit_cast(bit_repr)); + } + return DenseElementsAttr::get(shaped_type, ArrayRef(values)); + } + } + return errors::InvalidArgument("unsupported bit width", elem_type.getWidth()); +} + +StatusOr ConvertIntBuffer( + mlir::RankedTensorType shaped_type, mlir::Type elem_type, + const std::vector& buffer) { + unsigned bit_width; + mlir::RankedTensorType buffer_type; + if (auto itype = elem_type.dyn_cast()) { + bit_width = itype.getWidth(); + } else if (auto qtype = elem_type.dyn_cast()) { + bit_width = qtype.getStorageTypeIntegralWidth(); + shaped_type = mlir::RankedTensorType::get(shaped_type.getShape(), + qtype.getStorageType()); + } else { + return errors::InvalidArgument("unsupported integer constant type"); + } + + switch (bit_width) { + case 1: { + // vector doesn't convert to an ArrayRef + llvm::SmallVector values; + values.reserve(buffer.size()); + for (auto b : buffer) { + values.emplace_back(b != 0); + } + return DenseElementsAttr::get(shaped_type, ArrayRef(values)); + } + case 8: { + return DenseElementsAttr::get(shaped_type, ArrayRef(buffer)); + } + case 16: { + auto values = ReadAsLittleEndian(buffer); + return DenseElementsAttr::get(shaped_type, ArrayRef(values)); + } + case 32: { + auto values = ReadAsLittleEndian(buffer); + return DenseElementsAttr::get(shaped_type, 
ArrayRef(values)); + } + case 64: { + auto values = ReadAsLittleEndian(buffer); + return DenseElementsAttr::get(shaped_type, ArrayRef(values)); + } + default: + return errors::Unimplemented("Cannot handle bit width ", bit_width); + } +} + +StatusOr BuildConstOp(const tflite::TensorT& tensor, + const std::vector& buffer, + OpBuilder builder, Location loc) { + TF_ASSIGN_OR_RETURN(auto type, GetTensorType(tensor, builder, + /*shapeless_are_scalars=*/true, + /*is_constant=*/true)); + auto shaped_type = type.dyn_cast(); + if (!shaped_type) { + return errors::Internal("Constant doesn't have a shape"); + } + + auto elem_type = shaped_type.getElementType(); + + mlir::ElementsAttr value; + if (auto float_type = elem_type.dyn_cast()) { + TF_ASSIGN_OR_RETURN(value, + ConvertFloatBuffer(shaped_type, float_type, buffer)); + } else if (elem_type.isa() || + elem_type.isa()) { + TF_ASSIGN_OR_RETURN(value, + ConvertIntBuffer(shaped_type, elem_type, buffer)); + } else if (elem_type.isa()) { + auto& dialect = elem_type.getDialect(); + tensorflow::TensorProto repr = ConvertTfliteConstTensor(tensor, buffer); + std::string mangled = tensorflow::mangling_util::MangleTensor(repr); + + value = builder.getOpaqueElementsAttr(&dialect, shaped_type, mangled); + } else { + return errors::Unimplemented("Constant of unsupported type"); + } + + if (IsQuantized(tensor)) { + auto op = builder.create( + loc, builder.getTypeAttr(shaped_type), value); + return op.getOperation(); + } + auto op = builder.create(loc, value); + return op.getOperation(); +} + +llvm::SmallVector ConvertSubgraphIdxsToFunctionAttrs( + tflite::BuiltinOptionsUnion options, + const std::vector& func_names, Builder builder) { + if (auto* opts = options.AsIfOptions()) { + uint32_t then_idx = opts->then_subgraph_index; + auto then_attr = builder.getSymbolRefAttr(func_names.at(then_idx)); + uint32_t else_idx = opts->else_subgraph_index; + auto else_attr = builder.getSymbolRefAttr(func_names.at(else_idx)); + + return {builder.getNamedAttr("then_branch", then_attr), + builder.getNamedAttr("else_branch", else_attr), + // TODO(b/139667752): Analyze statelessness correctly + builder.getNamedAttr("is_stateless", builder.getBoolAttr(false))}; + } + if (auto* opts = options.AsWhileOptions()) { + uint32_t cond_idx = opts->cond_subgraph_index; + auto cond_attr = builder.getSymbolRefAttr(func_names.at(cond_idx)); + uint32_t body_idx = opts->body_subgraph_index; + auto body_attr = builder.getSymbolRefAttr(func_names.at(body_idx)); + + return {builder.getNamedAttr("cond", cond_attr), + builder.getNamedAttr("body", body_attr), + // TODO(b/139667752): Analyze statelessness correctly + builder.getNamedAttr("is_stateless", builder.getBoolAttr(false))}; + } + return {}; +} + +// TODO(krzysd) Handle function calls +StatusOr ConvertOp( + const tflite::OperatorT& op, const std::vector vals_map, + Value* optional_arg_marker, const std::vector& op_names, + const std::vector& func_names, + const std::vector>& tensors, Location loc, + OpBuilder builder) { + llvm::SmallVector operands; + llvm::SmallVector outputTypes; + + if (op.outputs.empty()) { + auto err = errors::InvalidArgument("operator with no outputs"); + return emitError(loc, err.ToString()), err; + } + + const std::string& op_name = op_names.at(op.opcode_index); + OperationState op_state(loc, op_name); + + for (auto input_num : op.inputs) { + if (input_num == -1) { + assert(optional_arg_marker != nullptr); + op_state.addOperands({optional_arg_marker}); + } else { + op_state.addOperands({vals_map.at(input_num)}); + } + } + 
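For reference, the operand loop above relies on TFLite's convention that an input index of `-1` marks an omitted optional operand, which the importer materializes as a single shared `none`-typed marker value. A minimal standalone sketch of that index-to-value mapping follows; `MapOperandIndices` and `none_marker` are hypothetical names, and plain containers stand in for the converter's MLIR value plumbing.

```
// Sketch only: resolves flatbuffer input indices against already-created
// values, substituting a caller-provided marker for omitted (-1) inputs.
#include <cassert>
#include <cstdint>
#include <vector>

template <typename ValueT>
std::vector<ValueT> MapOperandIndices(const std::vector<int32_t>& input_indices,
                                      const std::vector<ValueT>& values,
                                      ValueT none_marker) {
  std::vector<ValueT> operands;
  operands.reserve(input_indices.size());
  for (int32_t idx : input_indices) {
    if (idx == -1) {
      // TFLite encodes "this optional input is absent" as index -1.
      operands.push_back(none_marker);
    } else {
      assert(idx >= 0 && static_cast<size_t>(idx) < values.size());
      operands.push_back(values[idx]);
    }
  }
  return operands;
}
```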
+ for (auto output_num : op.outputs) { + auto& tensor = *tensors.at(output_num); + auto type_or_err = GetTensorType(tensor, builder); + if (!type_or_err.ok()) { + return emitError(loc, type_or_err.status().ToString()), + type_or_err.status(); + } + auto type = type_or_err.ConsumeValueOrDie(); + + // Special case for reshape, which stores its return shape in an option + // that we need to extract from + // Note: UniqueOp is handled by the typing information on its output tensor + if (auto* opts = op.builtin_options.AsReshapeOptions()) { + llvm::SmallVector shape(opts->new_shape.begin(), + opts->new_shape.end()); + type = builder.getTensorType(ArrayRef(shape), + type.getElementType()); + } + + // Special case for quantize: return type must also be in qtype attribute + if (op_name == "tfl.quantize") { + op_state.addAttribute("qtype", builder.getTypeAttr(type)); + } + + op_state.addTypes({type}); + } + + llvm::SmallVector attrs; + mlir::BuiltinOptionsToAttributes(op.builtin_options, builder, attrs); + op_state.addAttributes(attrs); + + // Handle the conversion from subgraph index to functions for If and While + auto function_ref_attrs = ConvertSubgraphIdxsToFunctionAttrs( + op.builtin_options, func_names, builder); + op_state.addAttributes(function_ref_attrs); + + return builder.createOperation(op_state); +} + +// Build a FuncOp from a tflite SubGraph +// The op_names are a mapping from indexes into the TFLite operators array to +// the operator name MLIR expects (tfl.foo_op). The buffers are directly taken +// from the deserialized flatbuffer as we do not have the type information to +// interpret them until this point. The base_loc parameter is the location of +// the flatbuffer as a whole (usually a file). The add_pseudo_input_ops flag +// controls whether we create the dummy ops for input that the TFLite dialect +// has in the main function (and only the main function). +StatusOr ConvertSubgraph( + const tflite::SubGraphT& subgraph, llvm::StringRef name, + const std::vector& op_names, + const std::vector& func_names, + const std::vector>& buffers, + Location base_loc, Builder builder, bool add_pseudo_input_ops = false) { + llvm::SmallVector ret_types; + llvm::SmallVector input_types; + + auto func_loc = mlir::NameLoc::get(builder.getIdentifier(name), base_loc); + + // Construct function type + for (auto input : subgraph.inputs) { + auto& tensor = *subgraph.tensors.at(input); + // TODO(b/138222071) Graph inputs must have static shape per the exporter, + // but we cannot differentiate scalars from unranked tensors. + // Here we reverse the default assumption that shape = [] means unranked. 
+ // when processing main() + auto type_or_err = + GetTensorType(tensor, builder, + /*shapeless_are_scalars=*/add_pseudo_input_ops, + /*is_constant=*/false); + if (!type_or_err.ok()) { + emitError(func_loc, "error reading argument types") + << type_or_err.status().ToString(); + return type_or_err.status(); + } + auto type = type_or_err.ConsumeValueOrDie(); + input_types.push_back(type); + } + + llvm::SmallVector is_op_output(subgraph.tensors.size(), false); + for (auto& op : subgraph.operators) { + for (auto output : op->outputs) { + is_op_output[output] = true; + } + } + + for (auto output : subgraph.outputs) { + bool is_constant = !is_op_output[output]; + auto type_or_err = GetTensorType(*subgraph.tensors.at(output), builder, + /*shapeless_are_scalars=*/is_constant, + /*is_constant=*/is_constant); + if (!type_or_err.ok()) { + emitError(func_loc, "error reading return types") + << type_or_err.status().ToString(); + return type_or_err.status(); + } + auto type = type_or_err.ConsumeValueOrDie(); + ret_types.push_back(type); + } + auto func_type = builder.getFunctionType(input_types, ret_types); + + // Construct function object + auto func = FuncOp::create(func_loc, name, func_type, /* attrs= */ {}); + func.addEntryBlock(); + auto& body = func.getBody(); + OpBuilder op_builder{body}; + + std::vector vals_map(subgraph.tensors.size(), nullptr); + Value* maybe_optional_arg_marker = nullptr; + + // Get or construct MLIR values for each input + for (int i = 0, e = subgraph.inputs.size(); i < e; i++) { + auto input_tensor = subgraph.inputs[i]; + const auto& tensor = *subgraph.tensors.at(input_tensor); + auto loc = TensorLoc(tensor, builder, base_loc); + if (nullptr != vals_map[input_tensor]) { + auto err = errors::FailedPrecondition("duplicate input arguments"); + return emitError(loc, err.ToString()), err; + } + if (add_pseudo_input_ops) { + auto* input = func.getArgument(i); + auto op = op_builder.create(loc, input); + vals_map[input_tensor] = op.output(); + } else { + vals_map[input_tensor] = func.getArgument(i); + } + } + + // Construct MLIR operators from TFLite operators + for (auto& op : subgraph.operators) { + for (auto input_num : op->inputs) { + // The operators in a graph are topologically sorted + // and so if no previous operation has produced a tensor + // it must be a constant. + if (input_num == -1) { + if (maybe_optional_arg_marker == nullptr) { + maybe_optional_arg_marker = + op_builder + .create(base_loc, builder.getNoneType(), + builder.getUnitAttr()) + .getResult(); + } + } else if (nullptr == vals_map.at(input_num)) { + auto& const_tensor = *subgraph.tensors[input_num]; + auto const_loc = TensorLoc(const_tensor, builder, base_loc); + auto op_or_err = + BuildConstOp(const_tensor, buffers[const_tensor.buffer]->data, + op_builder, const_loc); + if (!op_or_err.ok()) { + return emitError(const_loc, op_or_err.status().ToString()), + op_or_err.status(); + } + vals_map[input_num] = op_or_err.ValueOrDie()->getResult(0); + } + } + + // The NameLoc corresponding to the name of the first output tensor + auto op_loc = + op->outputs.empty() + ? 
base_loc + : TensorLoc(*subgraph.tensors[op->outputs[0]], builder, base_loc); + // If there's an optional argument, maybe_optional_arg_marker has been set + // to a valid Value* + TF_ASSIGN_OR_RETURN( + auto* mlir_op, + ConvertOp(*op, vals_map, maybe_optional_arg_marker, op_names, + func_names, subgraph.tensors, op_loc, op_builder)); + for (auto pair : llvm::enumerate(mlir_op->getResults())) { + vals_map[op->outputs[pair.index()]] = pair.value(); + } + } + + // Construct return values + llvm::SmallVector return_operands; + for (auto index : subgraph.outputs) { + if (nullptr == vals_map.at(index)) { + auto& const_tensor = *subgraph.tensors[index]; + auto const_loc = TensorLoc(const_tensor, builder, base_loc); + auto op_or_err = + BuildConstOp(const_tensor, buffers[const_tensor.buffer]->data, + op_builder, const_loc); + if (!op_or_err.ok()) { + return emitError(const_loc, op_or_err.status().ToString()), + op_or_err.status(); + } + vals_map[index] = op_or_err.ValueOrDie()->getResult(0); + } + return_operands.push_back(vals_map[index]); + } + + op_builder.create(base_loc, return_operands); + + return func; +} + +// TFLite subgraphs do not necessarily have names, though MLIR functions must +// have them, so we generate a name for subgraphs that are missing one here. +// Note: in TFLite, the first subgraph is the entry point, and in MLIR that +// represents TFLite, this entry point must be called "main" +// TODO(b/131175224,b/132239787) Support multiple entry points +std::string SubgraphName(unsigned index, const tflite::SubGraphT& subgraph) { + if (subgraph.name.empty()) { + if (index == 0) { + return "main"; + } else { + return llvm::formatv("fn_{0}", index).str(); + } + } else { + return subgraph.name; + } +} } // namespace OwningModuleRef tflite::FlatBufferToMlir(absl::string_view buffer, @@ -117,39 +645,51 @@ OwningModuleRef tflite::FlatBufferToMlir(absl::string_view buffer, std::unique_ptr model(model_ptr->GetModel()->UnPack()); auto builder = Builder(context); - auto module = mlir::ModuleOp::create(base_loc); - // TODO(krzysd): Actually account for the FlatBuffer schema version + std::vector operator_names; + operator_names.reserve(model->operator_codes.size()); + + for (auto& opcode : model->operator_codes) { + auto operator_name_or_error = OpNameForOpCode(*opcode); + if (!operator_name_or_error.ok()) { + return emitError(base_loc, operator_name_or_error.status().ToString()), + nullptr; + } + operator_names.push_back(operator_name_or_error.ConsumeValueOrDie()); + } + + std::vector func_names; + for (auto& subgraph : model->subgraphs) { + func_names.push_back(subgraph->name); + } + + auto module = mlir::ModuleOp::create(base_loc); + // We currently don't use this to make decisions, but we could + // use it in exports or if there are breaking changes module.setAttr("tfl.schema_version", builder.getI32IntegerAttr(model->version)); - - for (auto& subgraph : model->subgraphs) { - llvm::SmallVector ret_types; - llvm::SmallVector input_types; - - for (auto input : subgraph->inputs) { - auto type_or_err = GetTensorType(*subgraph->tensors[input], builder); - if (!type_or_err.ok()) { - return emitError(base_loc, type_or_err.status().ToString()), nullptr; - } - input_types.push_back(type_or_err.ConsumeValueOrDie()); - } - - auto func_type = builder.getFunctionType(input_types, ret_types); - auto func_loc = mlir::NameLoc::get(builder.getIdentifier(subgraph->name), - base_loc, context); - auto func = - FuncOp::create(func_loc, subgraph->name, func_type, /* attrs= */ {}); - func.addEntryBlock(); - - 
// TODO(krzysd): convert TFLite ops to MLIR ops - // Note: EnumNamesBuiltinOperator has the names of the builtin ops in - // uppercase. We will want them in lowercase with a tfl. prefix for MLIR - OpBuilder op_builder{func.getBody()}; - op_builder.create(base_loc); - module.push_back(func); + if (!model->description.empty()) { + module.setAttr("tfl.description", + builder.getStringAttr(model->description)); } + for (auto e : llvm::enumerate(model->subgraphs)) { + auto& subgraph = e.value(); + std::string name = SubgraphName(e.index(), *subgraph); + auto func_or_error = ConvertSubgraph( + *subgraph, name, operator_names, func_names, model->buffers, base_loc, + // Only the entry point needs pseudo_input_ops + // TODO(b/131175224,b/132239787) Support multiple entry points + builder, /* add_pseudo_input_ops = */ e.index() == 0); + if (!func_or_error.ok()) { + return emitError(base_loc, "could not translate function ") + << subgraph->name, + nullptr; + } + module.push_back(func_or_error.ConsumeValueOrDie()); + } + // TFLite subgraphs do not necessarily have names, + return OwningModuleRef(module); } diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_operator.cc b/tensorflow/compiler/mlir/lite/flatbuffer_operator.cc index 6d85f6f19e2..a18e54ac5bb 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_operator.cc +++ b/tensorflow/compiler/mlir/lite/flatbuffer_operator.cc @@ -17,7 +17,10 @@ limitations under the License. #include +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringSwitch.h" +#include "mlir/IR/Attributes.h" // TF:local_config_mlir +#include "mlir/IR/Builders.h" // TF:local_config_mlir #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/lite/schema/schema_generated.h" @@ -98,6 +101,11 @@ static int ConvertI32AttrForOptionWriter( return i.getSExtValue(); } +static int ConvertPositiveI32AttrForOptionWriter( + llvm::APInt i, flatbuffers::FlatBufferBuilder* builder) { + return ConvertI32AttrForOptionWriter(i, builder); +} + static flatbuffers::Offset> ConvertI64ArrayAttrForOptionWriter(mlir::ArrayAttr attrArray, flatbuffers::FlatBufferBuilder* builder) { @@ -144,5 +152,59 @@ static tflite::LSTMKernelType ConvertTFL_LSTMKernelTypeAttrForOptionWriter( .Case("BASIC", tflite::LSTMKernelType_BASIC); } +static mlir::Attribute BuildBoolAttr(bool value, mlir::Builder builder) { + return builder.getBoolAttr(value); +} + +static mlir::Attribute BuildF32Attr(float value, mlir::Builder builder) { + return builder.getF32FloatAttr(value); +} + +static mlir::Attribute BuildI32Attr(int32_t value, mlir::Builder builder) { + return builder.getI32IntegerAttr(value); +} + +static mlir::Attribute BuildI64ArrayAttr(std::vector value, + mlir::Builder builder) { + std::vector typecast(value.begin(), value.end()); + return builder.getI64ArrayAttr(typecast); +} + +static mlir::Attribute BuildPositiveI32Attr(int32_t value, + mlir::Builder builder) { + return builder.getI32IntegerAttr(value); +} + +static mlir::Attribute BuildTFL_AFAttr(tflite::ActivationFunctionType value, + mlir::Builder builder) { + const char* option_name = tflite::EnumNameActivationFunctionType(value); + return builder.getStringAttr(option_name); +} + +static mlir::Attribute BuildTFL_FullyConnectedOptionsWeightFormatAttr( + tflite::FullyConnectedOptionsWeightsFormat value, mlir::Builder builder) { + const char* option_name = + tflite::EnumNameFullyConnectedOptionsWeightsFormat(value); + return builder.getStringAttr(option_name); +} + +static 
mlir::Attribute BuildTFL_LSTMKernelTypeAttr(tflite::LSTMKernelType value, + mlir::Builder builder) { + const char* option_name = tflite::EnumNameLSTMKernelType(value); + return builder.getStringAttr(option_name); +} + +static mlir::Attribute BuildTFL_MirrorPaddingAttr(tflite::MirrorPadMode value, + mlir::Builder builder) { + const char* option_name = tflite::EnumNameMirrorPadMode(value); + return builder.getStringAttr(option_name); +} + +static mlir::Attribute BuildTFL_PaddingAttr(tflite::Padding value, + mlir::Builder builder) { + const char* option_name = tflite::EnumNamePadding(value); + return builder.getStringAttr(option_name); +} + // Pull in FlatBuffer writers for TFLite generated using TableGen -#include "tensorflow/compiler/mlir/lite/operator_writers.inc" +#include "tensorflow/compiler/mlir/lite/operator_converters.inc" diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_operator.h b/tensorflow/compiler/mlir/lite/flatbuffer_operator.h index e35780b11ec..35293c1b812 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_operator.h +++ b/tensorflow/compiler/mlir/lite/flatbuffer_operator.h @@ -25,6 +25,9 @@ limitations under the License. #include "absl/container/flat_hash_map.h" #include "flatbuffers/flatbuffers.h" // TF:flatbuffers #include "llvm/ADT/Optional.h" +#include "llvm/ADT/SmallVector.h" +#include "mlir/IR/Attributes.h" // TF:local_config_mlir +#include "mlir/IR/Builders.h" // TF:local_config_mlir #include "mlir/IR/Operation.h" // TF:local_config_mlir #include "tensorflow/lite/schema/schema_generated.h" @@ -42,6 +45,14 @@ llvm::Optional> CreateFlatBufferOperator( const std::vector &operands, const std::vector &results, flatbuffers::FlatBufferBuilder *fbb); +// Populate the array of mlir::NamedAttributes corresponding to the given +// tflite::FlatbufferOptionsUnion. +// We use an out parameter per LLVM convention +void BuiltinOptionsToAttributes( + tflite::BuiltinOptionsUnion op_union, mlir::Builder builder, + // NOLINTNEXTLINE + llvm::SmallVectorImpl &attributes); + } // namespace mlir #endif // TENSORFLOW_COMPILER_MLIR_LITE_FLATBUFFER_OPERATOR_H_ diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_translate.cc b/tensorflow/compiler/mlir/lite/flatbuffer_translate.cc index fca80f836aa..aa57ff7f751 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_translate.cc +++ b/tensorflow/compiler/mlir/lite/flatbuffer_translate.cc @@ -40,6 +40,7 @@ limitations under the License. #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/ToolOutputFile.h" #include "mlir/Dialect/QuantOps/QuantTypes.h" // TF:local_config_mlir +#include "mlir/Dialect/StandardOps/Ops.h" // TF:local_config_mlir #include "mlir/IR/Builders.h" // TF:local_config_mlir #include "mlir/IR/Function.h" // TF:local_config_mlir #include "mlir/IR/Location.h" // TF:local_config_mlir @@ -48,14 +49,14 @@ limitations under the License. 
#include "mlir/IR/Operation.h" // TF:local_config_mlir #include "mlir/IR/Types.h" // TF:local_config_mlir #include "mlir/IR/Value.h" // TF:local_config_mlir -#include "mlir/StandardOps/Ops.h" // TF:local_config_mlir #include "mlir/Support/FileUtilities.h" // TF:local_config_mlir #include "mlir/Translation.h" // TF:local_config_mlir #include "tensorflow/compiler/mlir/lite/flatbuffer_operator.h" #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/op_name_mapper.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/translate/export_tf_dialect_op.h" -#include "tensorflow/compiler/mlir/tensorflow/utils//convert_tensor.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/core/framework/attr_value.pb.h" #include "tensorflow/core/framework/node_def.pb.h" @@ -89,6 +90,8 @@ using mlir::TranslateFromMLIRRegistration; using mlir::Type; using mlir::UnknownLoc; using mlir::Value; +using tensorflow::OpLocNameMapper; +using tensorflow::OpNameMapper; using tensorflow::Status; using tflite::flex::IsWhitelistedFlexOp; using xla::StatusOr; @@ -105,9 +108,10 @@ using llvm::cl::opt; // These command line flags enable control of the translation implementation. bool emit_builtin_tflite_ops; -bool emit_select_tf_ops; bool emit_custom_ops; +bool emit_select_tf_ops; bool lower_tensor_list_ops; +bool strip_debug_info; // NOLINTNEXTLINE static opt emit_builtin_tflite_ops_flag( @@ -117,7 +121,7 @@ static opt emit_builtin_tflite_ops_flag( llvm::cl::location(emit_builtin_tflite_ops), llvm::cl::init(true)); // NOLINTNEXTLINE -static opt emit_select_tf_Ops_flag( +static opt emit_select_tf_ops_flag( "emit-select-tf-ops", llvm::cl::desc( "Emit Select TF operations (Flex ops) in the generated TFLite model"), @@ -135,6 +139,11 @@ static opt lower_tensor_list_ops_flag( llvm::cl::desc("Lower the TensorList ops within the TFLite dialect"), llvm::cl::location(lower_tensor_list_ops), llvm::cl::init(false)); +// NOLINTNEXTLINE +static opt strip_debug_info_flag( + "strip-debug-info", llvm::cl::desc("Strip debug info during export"), + llvm::cl::location(strip_debug_info), llvm::cl::init(false)); + ABSL_CONST_INIT const absl::string_view kFlexOpNamePrefix = "Flex"; // Use initial buffer size in flatbuffer builder to be same as the initial size @@ -188,6 +197,10 @@ static StatusOr GetTFLiteType(Type type, auto qtype = type.cast(); return GetTFLiteType(qtype.getStorageType(), qtype.isSigned()); } + case mlir::quant::QuantizationTypes::UniformQuantizedPerAxis: { + auto qtype = type.cast(); + return GetTFLiteType(qtype.getStorageType(), qtype.isSigned()); + } default: // TFLite export fills FLOAT32 for unknown data types. Returning an error // for now for safety and this could be revisited when required. @@ -200,11 +213,13 @@ static bool IsInput(Operation* op) { op->getName().getStringRef() == "tf.Placeholder.input"; } -static bool IsConstOrInput(Operation* op) { - return (isa(op) || isa(op) || - isa(op) || isa(op) || IsInput(op)); +static bool IsConst(Operation* op) { + return isa(op) || isa(op) || + isa(op) || isa(op); } +static bool IsConstOrInput(Operation* op) { return IsConst(op) || IsInput(op); } + template static bool HasValidTFLiteType(Value* value, T& error_handler) { // None type is allowed to represent unspecified operands. 
@@ -222,7 +237,7 @@ static bool HasValidTFLiteType(Value* value, T& error_handler) { return false; } if (auto* inst = value->getDefiningOp()) { - if (IsConstOrInput(inst) && !type.hasStaticShape()) { + if (IsInput(inst) && !type.hasStaticShape()) { return error_handler.emitError("should have static shape, got ") << type.getShape(), false; @@ -306,8 +321,8 @@ static std::unique_ptr<::tensorflow::NodeDef> getTensorFlowNodeDef( // We pass empty string for the original node_def name since Flex runtime // does not care about this being set correctly on node_def. There is no // "easy" (see b/120948529) way yet to get this from MLIR inst. - auto status_or_node_def = - tensorflow::ConvertTFDialectOpToNodeDef(inst, /*name=*/""); + auto status_or_node_def = tensorflow::ConvertTFDialectOpToNodeDef( + inst, /*name=*/"", /*ignore_unregistered_attrs=*/true); if (!status_or_node_def.ok()) { inst->emitOpError( Twine("failed to obtain TensorFlow nodedef with status: " + @@ -328,13 +343,17 @@ class Translator { static Optional Translate(ModuleOp module, bool emit_builtin_tflite_ops, bool emit_select_tf_ops, - bool emit_custom_ops); + bool emit_custom_ops, + OpNameMapper* op_name_mapper); private: enum class OpType : char { kTfliteBuiltin, kSelectTf, kCustomOp }; explicit Translator(ModuleOp module, bool emit_builtin_tflite_ops, - bool emit_select_tf_ops, bool emit_custom_ops) - : module_(module), builder_(kInitialBufferSize) { + bool emit_select_tf_ops, bool emit_custom_ops, + OpNameMapper* op_name_mapper) + : module_(module), + name_mapper_(*op_name_mapper), + builder_(kInitialBufferSize) { // The first buffer must be empty according to the schema definition. empty_buffer_ = tflite::CreateBuffer(builder_); buffers_.push_back(empty_buffer_); @@ -353,10 +372,6 @@ class Translator { Optional TranslateInternal(); - // Returns name that should be used by tensors for values generated by this - // operation. - std::string GetName(Operation* inst); - // Returns TFLite buffer populated with constant value if the operation is // TFLite constant operation. Otherwise, returns an empty buffer. Emits error // and returns llvm::None on failure. @@ -368,9 +383,14 @@ class Translator { const std::string& name, unsigned buffer_idx); - CustomOptionsOffset CreateIfOpCustomOptions(mlir::TF::IfOp op); - - CustomOptionsOffset CreateWhileOpCustomOptions(mlir::TF::WhileOp op); + // TODO(b/137395003): Legalize control flow ops to TFLite dialect, and remove + // these 2 functions here. + BufferOffset BuildIfOperator( + mlir::TF::IfOp op, const std::vector& operands, + const std::vector& results); + BufferOffset BuildWhileOperator( + mlir::TF::WhileOp op, const std::vector& operands, + const std::vector& results); Optional CreateFlexOpCustomOptions( const ::tensorflow::NodeDef& node_def, const mlir::Location& loc); @@ -399,14 +419,17 @@ class Translator { // mapping. void InitializeNamesFromAttribute(FuncOp fn); + // Determines if the specified operation op's operand at operand_index + // is marked as a stateful operand. + bool IsStatefulOperand(mlir::Operation* op, int operand_index); + // Returns a unique name for `op`. std::string UniqueName(mlir::Operation* op); - // Returns a unique name starting with a given prefix. 
- std::string UniqueName(llvm::StringRef prefix); - ModuleOp module_; + tensorflow::OpNameMapper& name_mapper_; + flatbuffers::FlatBufferBuilder builder_; BufferOffset empty_buffer_; @@ -421,55 +444,14 @@ class Translator { absl::flat_hash_map subgraph_index_map_; absl::flat_hash_set enabled_op_types_; - // Maps from op to name. - absl::flat_hash_map op_to_name_; - absl::flat_hash_map name_to_count_; - // Points to TensorFlow and TFLite dialects, respectively. nullptr if the // dialect is not registered. const Dialect* tf_dialect_; const Dialect* tfl_dialect_; - - // Suffix used to generate unique tensor names from operation names. - int name_counter_ = 0; }; -std::string Translator::GetName(Operation* inst) { - if (auto name_loc = inst->getLoc().dyn_cast()) - return name_loc.getName().str(); - - if (auto call_loc = inst->getLoc().dyn_cast()) { - // Return name if CallSiteLoc's callee has a NameLoc (as should be the case - // if imported with DebugInfo), else use the fallback naming scheme below. - if (auto name_loc = call_loc.getCallee().dyn_cast()) - return name_loc.getName().str(); - } - - // If the location is none of the expected types, then simply use name - // generated using the op type. - return inst->getName().getStringRef().str(); -} - -std::string Translator::UniqueName(llvm::StringRef prefix) { - // Keep incrementing the counter until we find a unique name. - std::string name = prefix; - int64_t& prefix_count = name_to_count_[name]; - int64_t val = prefix_count; - while (val != 0) { - name = (prefix + llvm::Twine(prefix_count)).str(); - ++prefix_count; - val = name_to_count_[name]; - } - name_to_count_[name] = 1; - return name; -} - std::string Translator::UniqueName(mlir::Operation* op) { - auto& name = op_to_name_[op]; - if (!name.empty()) return name; - // Update the value in the map with unique name. - name = UniqueName(GetName(op)); - return name; + return name_mapper_.GetUniqueName(op); } Optional> Translator::BuildBuffer( @@ -510,8 +492,18 @@ Optional> Translator::BuildTensor( // However, we output all known shapes for better round-tripping std::vector shape; if (auto* inst = value->getDefiningOp()) { - if (type.hasStaticShape()) { - auto shape_ref = type.getShape(); + if (type.hasStaticShape() || IsConst(inst)) { + // Const op can have a result of dynamic shaped type (e.g. due to constant + // folding), but we can still derive the shape of a constant tensor + // for its attribute type. + llvm::ArrayRef shape_ref; + if (type.hasStaticShape()) { + shape_ref = type.getShape(); + } else { + mlir::Attribute tensor_attr = inst->getAttr("value"); + shape_ref = tensor_attr.getType().cast().getShape(); + } + auto is_out_of_range = [](int64_t dim) { return dim > std::numeric_limits::max(); }; @@ -535,40 +527,65 @@ Optional> Translator::BuildTensor( builder_, /*min=*/0, /*max=*/0, builder_.CreateVector({static_cast(qtype.getScale())}), builder_.CreateVector({qtype.getZeroPoint()})); + } else if (auto qtype = + element_type + .dyn_cast()) { + std::vector scales(qtype.getScales().begin(), + qtype.getScales().end()); + q_params = tflite::CreateQuantizationParameters( + builder_, /*min=*/0, /*max=*/0, builder_.CreateVector(scales), + builder_.CreateVector(qtype.getZeroPoints()), + tflite::QuantizationDetails_NONE, /*details=*/0, + qtype.getQuantizedDimension()); } else { q_params = tflite::CreateQuantizationParameters(builder_); } - + // Check if the value's uses includes an op and usage at an operand index + // marked as a stateful. 
If so, set the tensor's is_variable as true + // This is v1 ref variable semantics in the TFLite runtime. + bool is_variable = false; + for (auto& use : value->getUses()) { + is_variable = IsStatefulOperand(use.getOwner(), use.getOperandNumber()); + if (is_variable) { + break; + } + } return tflite::CreateTensor( - builder_, builder_.CreateVector(shape), tflite_element_type, buffer_idx, - builder_.CreateString(name), q_params, /*is_variable=*/false); + builder_, builder_.CreateVector(shape), tflite_element_type, + (is_variable ? 0 : buffer_idx), builder_.CreateString(name), q_params, + /*is_variable=*/is_variable); } -CustomOptionsOffset Translator::CreateIfOpCustomOptions(mlir::TF::IfOp op) { - int then_subgraph_index = subgraph_index_map_.at(op.getThen().str()); - int else_subgraph_index = subgraph_index_map_.at(op.getElse().str()); - - auto flex_builder = absl::make_unique(); - flex_builder->Map([&]() { - flex_builder->Int("then_subgraph_index", then_subgraph_index); - flex_builder->Int("else_subgraph_index", else_subgraph_index); - }); - flex_builder->Finish(); - return builder_.CreateVector(flex_builder->GetBuffer()); +BufferOffset Translator::BuildIfOperator( + mlir::TF::IfOp op, const std::vector& operands, + const std::vector& results) { + auto opcode_index = GetOpcodeIndex("if", tflite::BuiltinOperator_IF); + int then_subgraph_index = subgraph_index_map_.at(op.then_branch().str()); + int else_subgraph_index = subgraph_index_map_.at(op.else_branch().str()); + auto builtin_options = tflite::CreateIfOptions(builder_, then_subgraph_index, + else_subgraph_index) + .Union(); + auto inputs = builder_.CreateVector(operands); + auto outputs = builder_.CreateVector(results); + return tflite::CreateOperator(builder_, opcode_index, inputs, outputs, + tflite::BuiltinOptions_IfOptions, + builtin_options); } -CustomOptionsOffset Translator::CreateWhileOpCustomOptions( - mlir::TF::WhileOp op) { - int cond_subgraph_index = subgraph_index_map_.at(op.getCond().str()); - int body_subgraph_index = subgraph_index_map_.at(op.getBody().str()); - - auto flex_builder = absl::make_unique(); - flex_builder->Map([&]() { - flex_builder->Int("cond_subgraph_index", cond_subgraph_index); - flex_builder->Int("body_subgraph_index", body_subgraph_index); - }); - flex_builder->Finish(); - return builder_.CreateVector(flex_builder->GetBuffer()); +BufferOffset Translator::BuildWhileOperator( + mlir::TF::WhileOp op, const std::vector& operands, + const std::vector& results) { + auto opcode_index = GetOpcodeIndex("while", tflite::BuiltinOperator_WHILE); + int cond_subgraph_index = subgraph_index_map_.at(op.cond().str()); + int body_subgraph_index = subgraph_index_map_.at(op.body().str()); + auto builtin_options = tflite::CreateWhileOptions( + builder_, cond_subgraph_index, body_subgraph_index) + .Union(); + auto inputs = builder_.CreateVector(operands); + auto outputs = builder_.CreateVector(results); + return tflite::CreateOperator(builder_, opcode_index, inputs, outputs, + tflite::BuiltinOptions_WhileOptions, + builtin_options); } Optional Translator::CreateFlexOpCustomOptions( @@ -712,63 +729,60 @@ Optional> Translator::BuildOperator( if (dialect == tf_dialect_) { std::string op_name; + if (auto ifOp = dyn_cast(inst)) { + return BuildIfOperator(ifOp, operands, results); + } else if (auto whileOp = dyn_cast(inst)) { + return BuildWhileOperator(whileOp, operands, results); + } + CustomOptionsOffset custom_options; - if (auto ifOp = dyn_cast(inst)) { - op_name = "Experimental_If"; - custom_options = 
CreateIfOpCustomOptions(ifOp); - } else if (auto whileOp = dyn_cast(inst)) { - op_name = "Experimental_While"; - custom_options = CreateWhileOpCustomOptions(whileOp); - } else { - // Ops in TF dialect can either be custom ops or flex ops. - // The reason we go directly from TensorFlow dialect MLIR to tensorflow - // node instead of going to TF table gen'd ops via generated code is that - // we do not want to restrict custom and flex op conversion support to - // only those TF ops that are currently registered in MLIR. The current - // model is of an open op system. - // - // The following algorithm is followed: - // if flex is enabled and the op is whitelisted as flex - // we emit op as flex. - // if custom is enabled - // we emit the op as custom. - auto node_def = getTensorFlowNodeDef(inst); - if (!node_def) { + // Ops in TF dialect can either be custom ops or flex ops. + // The reason we go directly from TensorFlow dialect MLIR to tensorflow + // node instead of going to TF table gen'd ops via generated code is that + // we do not want to restrict custom and flex op conversion support to + // only those TF ops that are currently registered in MLIR. The current + // model is of an open op system. + // + // The following algorithm is followed: + // if flex is enabled and the op is whitelisted as flex + // we emit op as flex. + // if custom is enabled + // we emit the op as custom. + auto node_def = getTensorFlowNodeDef(inst); + if (!node_def) { + return llvm::None; + } + + // Flex op case + // Eventually, the whitelist will go away and we will rely on some TF op + // trait (e.g. No side effect) to determine if it is a supported "Flex" + // op or not. + if (enabled_op_types_.contains(OpType::kSelectTf) && + IsWhitelistedFlexOp(node_def->op())) { + // Construct ops as flex op encoding TensorFlow node definition + // as custom options. + // Flex ops are named with the kFlexOpNamePrefix prefix to the actual + // TF op name. + op_name = std::string(kFlexOpNamePrefix) + node_def->op(); + if (auto options = CreateFlexOpCustomOptions(*node_def, inst->getLoc())) { + custom_options = *options; + } else { return llvm::None; } - - // Flex op case - // Eventually, the whitelist will go away and we will rely on some TF op - // trait (e.g. No side effect) to determine if it is a supported "Flex" - // op or not. - if (enabled_op_types_.contains(OpType::kSelectTf) && - IsWhitelistedFlexOp(node_def->op())) { - // Construct ops as flex op encoding TensorFlow node definition - // as custom options. - // Flex ops are named with the kFlexOpNamePrefix prefix to the actual - // TF op name. - op_name = std::string(kFlexOpNamePrefix) + node_def->op(); - if (auto options = - CreateFlexOpCustomOptions(*node_def, inst->getLoc())) { - custom_options = *options; - } else { - return llvm::None; - } - } else if (enabled_op_types_.contains(OpType::kCustomOp)) { - // Generic case of custom ops - write using flex buffers since that - // is the only custom options supported by TFLite today. - op_name = node_def->op(); - if (auto options = - CreateCustomOpCustomOptions(*node_def, inst->getLoc())) { - custom_options = *options; - } else { - return llvm::None; - } + } else if (enabled_op_types_.contains(OpType::kCustomOp)) { + // Generic case of custom ops - write using flex buffers since that + // is the only custom options supported by TFLite today. 
+ op_name = node_def->op(); + if (auto options = + CreateCustomOpCustomOptions(*node_def, inst->getLoc())) { + custom_options = *options; } else { - return inst->emitOpError("is neither a custom op nor a flex op"), - llvm::None; + return llvm::None; } + } else { + return inst->emitOpError("is neither a custom op nor a flex op"), + llvm::None; } uint32_t opcode_index = @@ -804,8 +818,8 @@ void Translator::InitializeNamesFromAttribute(FuncOp fn) { return; } for (auto it : llvm::enumerate(fn.getArguments())) { - op_to_name_[*it.value()->user_begin()] = input_names[it.index()]; - ++name_to_count_[input_names[it.index()].str()]; + name_mapper_.InitOpName(*it.value()->user_begin(), + input_names[it.index()]); } } @@ -825,8 +839,7 @@ void Translator::InitializeNamesFromAttribute(FuncOp fn) { // insert an op so that we can have a buffer named such. This cannot // currently happen due to pseudo_input nodes. if (auto op = it.value()->getDefiningOp()) { - op_to_name_[op] = output_names[it.index()]; - name_to_count_[output_names[it.index()].str()] = 1; + name_mapper_.InitOpName(op, output_names[it.index()]); } else { fn.emitWarning() << "output is not due to an op and '" << output_names[it.index()] @@ -836,6 +849,27 @@ void Translator::InitializeNamesFromAttribute(FuncOp fn) { } } +bool Translator::IsStatefulOperand(mlir::Operation* op, int operand_index) { + std::vector operand_indices; + // TODO(b/138254427): When the bug is addressed, we'll be able to inspect + // for the presence of a specific OpTrait using mlir::Operation, without + // having to cast it to specific ops like below. + // Until then, when a new RNN/LSTM op is added to TFLite and has stateful + // tensors as operands, they will need to be added here as well. + if (auto tfl = llvm::dyn_cast(op)) { + operand_indices = tfl.GetStatefulOperands(); + } else if (auto tfl = + llvm::dyn_cast(op)) { + operand_indices = tfl.GetStatefulOperands(); + } else if (auto tfl = + llvm::dyn_cast(op)) { + operand_indices = tfl.GetStatefulOperands(); + } else if (auto tfl = llvm::dyn_cast(op)) { + operand_indices = tfl.GetStatefulOperands(); + } + return absl::c_find(operand_indices, operand_index) != operand_indices.end(); +} + Optional> Translator::BuildSubGraph(FuncOp fn) { InitializeNamesFromAttribute(fn); std::vector> tensors; @@ -855,6 +889,10 @@ Optional> Translator::BuildSubGraph(FuncOp fn) { if (!tensor_or) return false; tensors.push_back(*tensor_or); + // TODO(ashwinm): Check if for stateful tensors, if it is also needed to + // make the Buffer empty apart from setting the buffer_idx=0 in the Tensor. + // This does not seem to affect runtime behavior for RNN/LSTM, but would be + // good for reducing memory footprint. 
if (auto* inst = value->getDefiningOp()) { auto buffer_or = BuildBuffer(inst); if (!buffer_or) return false; @@ -942,10 +980,11 @@ Optional> Translator::BuildSubGraph(FuncOp fn) { Optional Translator::Translate(ModuleOp module, bool emit_builtin_tflite_ops, bool emit_select_tf_ops, - bool emit_custom_ops) { + bool emit_custom_ops, + OpNameMapper* op_name_mapper) { if (!IsValidTFLiteMlirModule(module)) return llvm::None; Translator translator(module, emit_builtin_tflite_ops, emit_select_tf_ops, - emit_custom_ops); + emit_custom_ops, op_name_mapper); return translator.TranslateInternal(); } @@ -979,8 +1018,14 @@ Optional Translator::TranslateInternal() { subgraphs.push_back(*subgraph_or); } + std::string model_description; + if (auto attr = module_.getAttrOfType("tfl.description")) { + model_description = attr.getValue().str(); + } else { + model_description = "MLIR Converted."; + } // Build the model and finish the model building process. - auto description = builder_.CreateString("MLIR Converted."); + auto description = builder_.CreateString(model_description.data()); auto model = tflite::CreateModel( builder_, TFLITE_SCHEMA_VERSION, builder_.CreateVector(opcodes_), builder_.CreateVector(subgraphs), description, @@ -1005,21 +1050,38 @@ Optional Translator::TranslateInternal() { // bool tflite::MlirToFlatBufferTranslateFunction( ModuleOp module, std::string* serialized_flatbuffer, - bool emit_builtin_tflite_ops, bool emit_select_tf_ops, - bool emit_custom_ops) { - auto maybe_translated = Translator::Translate( - module, emit_builtin_tflite_ops, emit_select_tf_ops, emit_custom_ops); + bool emit_builtin_tflite_ops, bool emit_select_tf_ops, bool emit_custom_ops, + OpNameMapper* op_name_mapper) { + auto maybe_translated = + Translator::Translate(module, emit_builtin_tflite_ops, emit_select_tf_ops, + emit_custom_ops, op_name_mapper); if (!maybe_translated) return true; *serialized_flatbuffer = std::move(*maybe_translated); return false; } +bool tflite::MlirToFlatBufferTranslateFunction( + ModuleOp module, std::string* serialized_flatbuffer, + bool emit_builtin_tflite_ops, bool emit_select_tf_ops, + bool emit_custom_ops) { + OpLocNameMapper op_name_mapper; + return MlirToFlatBufferTranslateFunction( + module, serialized_flatbuffer, emit_builtin_tflite_ops, + emit_select_tf_ops, emit_custom_ops, &op_name_mapper); +} + static mlir::LogicalResult MlirToFlatBufferFileTranslateFunction( ModuleOp module, llvm::StringRef filename) { std::string serialized_flatbuffer; + std::unique_ptr op_name_mapper; + if (strip_debug_info) { + op_name_mapper = std::make_unique(); + } else { + op_name_mapper = std::make_unique(); + } if (tflite::MlirToFlatBufferTranslateFunction( module, &serialized_flatbuffer, emit_builtin_tflite_ops, - emit_select_tf_ops, emit_custom_ops)) + emit_select_tf_ops, emit_custom_ops, op_name_mapper.get())) return mlir::failure(); auto file = openOutputFile(filename); diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_translate.h b/tensorflow/compiler/mlir/lite/flatbuffer_translate.h index 820b2697e43..477a477dde6 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_translate.h +++ b/tensorflow/compiler/mlir/lite/flatbuffer_translate.h @@ -19,6 +19,7 @@ limitations under the License. #include #include "mlir/IR/Module.h" // TF:local_config_mlir +#include "tensorflow/compiler/mlir/op_name_mapper.h" // These flags are used to control the emission or not of different kinds of ops // during the flatbuffer translation. 
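The overload added to flatbuffer_translate.h in the next hunk (and defined above in flatbuffer_translate.cc) lets callers control how tensor names are derived by passing their own `OpNameMapper`. A hedged usage sketch follows; `SerializeWithLocationNames` and its flag values are hypothetical, and only the translate function and `OpLocNameMapper` come from this change.

```
#include <string>

#include "mlir/IR/Module.h"  // TF:local_config_mlir
#include "tensorflow/compiler/mlir/lite/flatbuffer_translate.h"
#include "tensorflow/compiler/mlir/op_name_mapper.h"

// Serializes `module` to a TFLite flatbuffer, deriving tensor names from op
// locations. Mirrors the convention above: returns true on failure.
bool SerializeWithLocationNames(mlir::ModuleOp module, std::string* out) {
  tensorflow::OpLocNameMapper mapper;  // Names come from MLIR debug locations.
  return tflite::MlirToFlatBufferTranslateFunction(
      module, out, /*emit_builtin_tflite_ops=*/true,
      /*emit_select_tf_ops=*/false, /*emit_custom_ops=*/false, &mapper);
}
```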
@@ -27,16 +28,25 @@ extern bool emit_select_tf_ops; extern bool emit_custom_ops; // The flag to control whether to lower tensorlist ops into TF ops. extern bool lower_tensor_list_ops; +// The flag to control whether debug info gets stripped on export. +extern bool strip_debug_info; namespace tflite { // Translates the given MLIR `module` into a FlatBuffer and stores the -// serialized flatbuffer into the string. +// serialized flatbuffer into the string. This uses OpLocNameMapper to convert +// location of the op to name in flatbuffer. bool MlirToFlatBufferTranslateFunction(mlir::ModuleOp module, - std::string *serialized_flatbuffer, + std::string* serialized_flatbuffer, bool emit_builtin_tflite_ops, bool emit_select_tf_ops, bool emit_custom_ops); + +// Same as the above but with a custom op name mapper. +bool MlirToFlatBufferTranslateFunction( + mlir::ModuleOp module, std::string* serialized_flatbuffer, + bool emit_builtin_tflite_ops, bool emit_select_tf_ops, bool emit_custom_ops, + tensorflow::OpNameMapper* op_name_mapper); } // namespace tflite #endif // TENSORFLOW_COMPILER_MLIR_LITE_FLATBUFFER_TRANSLATE_H_ diff --git a/tensorflow/compiler/mlir/lite/g3doc/tfl_ops.md b/tensorflow/compiler/mlir/lite/g3doc/tfl_ops.md deleted file mode 100755 index 74e4fc47868..00000000000 --- a/tensorflow/compiler/mlir/lite/g3doc/tfl_ops.md +++ /dev/null @@ -1,1606 +0,0 @@ - -# Operation definition -## tfl.abs (TFL::AbsOp) -Absolute value operator - -### Description: - -Given a tensor `x`, this operation returns a tensor containing the absolute -value of each element in `x`. For example, if x is an input element and y is -an output element, this operation computes \\(y = |x|\\). - -### Operands: -1. `x`: tensor of any type values - -### Attributes: - -### Results: -1. `y`: tensor of any type values - -## tfl.add_n (TFL::AddNOp) -add_n operator - -### Description: - -Adds all input tensors element-wise. - -### Operands: -1. `inputs`: tensor of 32-bit float or 32-bit integer values - -### Attributes: - -### Results: -1. `sum`: tensor of 32-bit float or 32-bit integer values - -## tfl.add (TFL::AddOp) -Addition operator - -### Description: - -Element-wise addition operation. - -### Operands: -1. `lhs`: tensor of any type values -1. `rhs`: tensor of any type values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `fused_activation_function` | `StringAttr` | fused activation enum attribute | - -### Results: -1. `output`: tensor of any type values - -## tfl.average_pool_2d (TFL::AveragePool2DOp) -Average_pool_2d operator - -### Description: - -Performs average-pooling operation on input. - -### Operands: -1. `input`: tensor of any type values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `filter_height` | `IntegerAttr` | 32-bit integer attribute attribute | -| `filter_width` | `IntegerAttr` | 32-bit integer attribute attribute | -| `padding` | `StringAttr` | padding enum attribute | -| `stride_h` | `IntegerAttr` | 32-bit integer attribute attribute | -| `stride_w` | `IntegerAttr` | 32-bit integer attribute attribute | -| `fused_activation_function` | `StringAttr` | fused activation enum attribute | - -### Results: -1. `output`: tensor of any type values - -## tfl.batch_to_space_nd (TFL::BatchToSpaceNdOp) -BatchToSpaceNd operator - -### Description: - -This operation reshapes the "batch" dimension 0 into space dimensions. - -### Operands: -1. 
`input`: tensor of 32-bit float or 8-bit integer or 32-bit integer or 64-bit integer values -1. `block_shape`: tensor of 32-bit integer values -1. `indices`: tensor of 32-bit integer values - -### Attributes: - -### Results: -1. `output`: tensor of 32-bit float or 16-bit integer or 32-bit integer or 64-bit integer values - -## tfl.ceil (TFL::CeilOp) -Ceil operator - -### Description: - -Returns element-wise ceil value of the input. - -### Operands: -1. `x`: tensor of floating-point values - -### Attributes: - -### Results: -1. `y`: tensor of floating-point values - -## tfl.concatenation (TFL::ConcatenationOp) -Concatenation operator - -### Description: - -Concatenates tensors along one dimension - -### Operands: -1. `values`: tensor of 32-bit float or 64-bit integer or 32-bit integer or 16-bit integer or 8-bit integer or quantized type with 8 bits storage type values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `axis` | `IntegerAttr` | 32-bit integer attribute attribute | -| `fused_activation_function` | `StringAttr` | fused activation enum attribute | - -### Results: -1. `output`: tensor of 32-bit float or 64-bit integer or 32-bit integer or 16-bit integer or 8-bit integer or quantized type with 8 bits storage type values - -## tfl.pseudo_const (TFL::ConstOp) -Constant pseudo op. - -### Description: - -Represents a constant value in TensorFlow Lite dialect. This is not an -actual operation and it will be lowered to buffer instead. - -The op is allowed to have all the same type of attributes as tf.Const does -(e.g., opaque TF attributes are allowed). - -### Operands: - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `value` | `ElementsAttr` | constant vector/tensor attribute attribute | - -### Results: -1. `output`: tensor of any type values - -## tfl.conv_2d (TFL::Conv2DOp) -Convolution operator - -### Description: - -Performs convolution operation on inputs. - -Inputs: - `inputs[0]`: required: the input activation tensor - `inputs[1]`: required: the filter weight tensor - `inputs[2]`: optional: the bias tensor - -### Operands: -1. `input`: tensor of any type values -1. `filter`: tensor of any type values -1. `bias`: tensor of any type values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `dilation_h_factor` | `IntegerAttr` | 32-bit integer attribute attribute | -| `dilation_w_factor` | `IntegerAttr` | 32-bit integer attribute attribute | -| `fused_activation_function` | `StringAttr` | fused activation enum attribute | -| `padding` | `StringAttr` | padding enum attribute | -| `stride_h` | `IntegerAttr` | 32-bit integer attribute attribute | -| `stride_w` | `IntegerAttr` | 32-bit integer attribute attribute | - -### Results: -1. `output`: tensor of any type values - -## tfl.cos (TFL::CosOp) -Cosine operator - -### Description: - -Computes element-wise Cosine of input - -### Operands: -1. `x`: tensor of floating-point values - -### Attributes: - -### Results: -1. `y`: tensor of floating-point values - -## tfl.depthwise_conv_2d (TFL::DepthwiseConv2DOp) -Depthwise-separable convolution operator - -### Description: - -Performs convolution operation on inputs. - -Inputs: - `inputs[0]`: required: the input activation tensor - `inputs[1]`: required: the filter weight tensor - `inputs[2]`: optional: the bias tensor - -### Operands: -1. `input`: tensor of any type values -1. `filter`: tensor of any type values -1. 
`bias`: tensor of any type values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `dilation_h_factor` | `IntegerAttr` | 32-bit integer attribute attribute | -| `dilation_w_factor` | `IntegerAttr` | 32-bit integer attribute attribute | -| `fused_activation_function` | `StringAttr` | fused activation enum attribute | -| `padding` | `StringAttr` | padding enum attribute | -| `stride_h` | `IntegerAttr` | 32-bit integer attribute attribute | -| `stride_w` | `IntegerAttr` | 32-bit integer attribute attribute | -| `depth_multiplier` | `IntegerAttr` | 32-bit integer attribute attribute | - -### Results: -1. `output`: tensor of any type values - -## tfl.dequantize (TFL::DequantizeOp) -Dequantize operator - -### Description: - -Converts quantized array of integers to floating-points according to the -quantization parameters. - -### Operands: -1. `input`: tensor of any type values - -### Attributes: - -### Results: -1. `output`: tensor of any type values - -## tfl.div (TFL::DivOp) -Division operator - -### Description: - -Element-wise division operation. - -### Operands: -1. `lhs`: tensor of any type values -1. `rhs`: tensor of any type values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `fused_activation_function` | `StringAttr` | fused activation enum attribute | - -### Results: -1. `output`: tensor of any type values - -## tfl.elu (TFL::EluOp) -Exponential Linear Unit operator - -### Description: - -Computes the exponential linear - f(x) -> exp(x) - 1 for x < 0, x for x >= 0. -element-wise. - -### Operands: -1. `x`: tensor of floating-point values - -### Attributes: - -### Results: -1. `y`: tensor of any type values - -## tfl.equal (TFL::EqualOp) -Equal operator - -### Description: - -Returns the truth element of x == y element-wise - -### Operands: -1. `x`: tensor of 1-bit integer or 32-bit float or 32-bit integer or 64-bit integer or 8-bit integer values -1. `y`: tensor of 1-bit integer or 32-bit float or 32-bit integer or 64-bit integer or 8-bit integer values - -### Attributes: - -### Results: -1. `output`: tensor of 1-bit integer values - -## tfl.exp (TFL::ExpOp) -Natural exponentiation operator - -### Description: - -Performs element-wise natural exponentiation operation on input. - -### Operands: -1. `x`: tensor of any type values - -### Attributes: - -### Results: -1. `y`: tensor of any type values - -## tfl.expand_dims (TFL::ExpandDimsOp) -Inserts a dimension of 1 into a tensor's shape. - -### Description: - -Given a tensor `input`, this operation inserts a dimension of 1 at the -dimension index `axis` of `input`'s shape. The dimension index `axis` starts at -zero; if you specify a negative number for `axis` it is counted backward from -the end. - -This operation is useful if you want to add a batch dimension to a single -element. For example, if you have a single image of shape `[height, width, -channels]`, you can make it a batch of 1 image with `expand_dims(image, 0)`, -which will make the shape `[1, height, width, channels]`. 
- -Other examples: - -``` -# 't' is a tensor of shape [2] -shape(expand_dims(t, 0)) ==> [1, 2] -shape(expand_dims(t, 1)) ==> [2, 1] -shape(expand_dims(t, -1)) ==> [2, 1] - -# 't2' is a tensor of shape [2, 3, 5] -shape(expand_dims(t2, 0)) ==> [1, 2, 3, 5] -shape(expand_dims(t2, 2)) ==> [2, 3, 1, 5] -shape(expand_dims(t2, 3)) ==> [2, 3, 5, 1] -``` - -This operation requires that: - -`-1-input.dims() <= dim <= input.dims()` - -This operation is related to `squeeze()`, which removes dimensions of -size 1. - -### Operands: -1. `input`: tensor of any type values -1. `dim`: tensor of any integer type - -### Attributes: - -### Results: -1. `output`: tensor of any type values - -## tfl.fake_quant (TFL::FakeQuantOp) -FakeQuant operator - -### Description: - -Fake-quantize the 'inputs' tensor of type float via float scalars min and -max to 'outputs' tensor of same shape as inputs. - -### Operands: -1. `input`: tensor of any type values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `minmax` | `ArrayAttr` | min-max range pair attribute | -| `num_bits` | `IntegerAttr` | 32-bit integer attribute attribute | -| `narrow_range` | `BoolAttr` | bool attribute attribute | - -### Results: -1. `output`: tensor of any type values - -## tfl.fill (TFL::FillOp) -Fill the tensor with given value. - -### Description: - -Fill the tensor with given value. - -### Operands: -1. `dims`: tensor of 32/64-bit integer values -1. `value`: tensor of any type values - -### Attributes: - -### Results: -1. `res`: tensor of any type values - -## tfl.floor_div (TFL::FloorDivOp) -Floor div operator - -### Description: - -Element-wise floor div operation. - -### Operands: -1. `lhs`: tensor of any type values -1. `rhs`: tensor of any type values - -### Attributes: - -### Results: -1. `output`: tensor of any type values - -## tfl.floor_mod (TFL::FloorModOp) -Division reminder - -### Description: - -Element-wise division reminder operation. - -### Operands: -1. `lhs`: tensor of any type values -1. `rhs`: tensor of any type values - -### Attributes: - -### Results: -1. `output`: tensor of any type values - -## tfl.floor (TFL::FloorOp) -Floor operator - -### Description: - -Returns element-wise floor value of the input. - -### Operands: -1. `x`: tensor of floating-point values - -### Attributes: - -### Results: -1. `y`: tensor of floating-point values - -## tfl.fully_connected (TFL::FullyConnectedOp) -Fully connected op - -### Description: - - -### Operands: -1. `input`: tensor of 32-bit float values -1. `filter`: tensor of 32-bit float values -1. `bias`: tensor of 32-bit float values or none type - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `fused_activation_function` | `StringAttr` | fused activation enum attribute | -| `weights_format` | `StringAttr` | fully connected options weights format attribute | -| `keep_num_dims` | `BoolAttr` | bool attribute attribute | - -### Results: -1. `output`: tensor of 32-bit float values - -## tfl.gather (TFL::GatherOp) -Gather operator - -### Description: - -Gather slices from `params` axis `axis` according to `indices`. - -### Operands: -1. `params`: tensor of 32-bit float or 8-bit integer or 32-bit integer or 64-bit integer or TFLite string type values -1. 
`indices`: tensor of 32-bit integer or 64-bit integer values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `axis` | `IntegerAttr` | 32-bit integer attribute attribute | - -### Results: -1. `output`: tensor of 32-bit float or 16-bit integer or 32-bit integer or 64-bit integer or TFLite string type values - -## tfl.greater_equal (TFL::GreaterEqualOp) -Greater_equal operator - -### Description: - -Element-wise greater_equal operation. - -### Operands: -1. `lhs`: tensor of any type values -1. `rhs`: tensor of any type values - -### Attributes: - -### Results: -1. `output`: tensor of 1-bit integer values - -## tfl.greater (TFL::GreaterOp) -Greater operator - -### Description: - -Element-wise greater operation. - -### Operands: -1. `lhs`: tensor of any type values -1. `rhs`: tensor of any type values - -### Attributes: - -### Results: -1. `output`: tensor of any type values - -## tfl.pseudo_input (TFL::InputOp) -Input pseudo operator - -### Description: - -Takes one of the function arguments as input and returns it as result. This -is a NOP and is used to attach attributes such as tensor name to function -arguments. - -### Operands: -1. `input`: tensor of any type values - -### Attributes: - -### Results: -1. `output`: tensor of any type values - -## tfl.leaky_relu (TFL::LeakyReluOp) -Leaky Relu operator - -### Description: - -Element-wise Leaky ReLU operator - x -> x >= 0 ? x : (alpha * x) - -### Operands: -1. `input`: tensor of any type values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `alpha` | `FloatAttr` | 32-bit float attribute attribute | - -### Results: -1. `output`: tensor of any type values - -## tfl.less_equal (TFL::LessEqualOp) -Less_equal operator - -### Description: - -Element-wise less_equal operation. - -### Operands: -1. `lhs`: tensor of 32-bit float or 32-bit integer or 64-bit integer or 8-bit integer values -1. `rhs`: tensor of 32-bit float or 32-bit integer or 64-bit integer or 8-bit integer values - -### Attributes: - -### Results: -1. `output`: tensor of 1-bit integer values - -## tfl.less (TFL::LessOp) -Less operator - -### Description: - -Element-wise less operation. - -### Operands: -1. `lhs`: tensor of any type values -1. `rhs`: tensor of any type values - -### Attributes: - -### Results: -1. `output`: tensor of 1-bit integer values - -## tfl.log (TFL::LogOp) -Natural logarithm operator - -### Description: - -Performs element-wise natural logarithm operation on input. - -### Operands: -1. `x`: tensor of any type values - -### Attributes: - -### Results: -1. `y`: tensor of any type values - -## tfl.log_softmax (TFL::LogSoftmaxOp) -Log softmax operator - -### Description: - -Computes element-wise log softmax activations with the following formula - - input - log(reduce_sum(exp(input), dim)) - -### Operands: -1. `input`: tensor of any type values - -### Attributes: - -### Results: -1. `output`: tensor of any type values - -## tfl.logical_and (TFL::LogicalAndOp) -Logical AND operator - -### Description: - -Element-wise logical AND operation. - -### Operands: -1. `lhs`: tensor of 1-bit integer values -1. `rhs`: tensor of 1-bit integer values - -### Attributes: - -### Results: -1. `output`: tensor of 1-bit integer values - -## tfl.logical_not (TFL::LogicalNotOp) -Logical NOT operator - -### Description: - -Element-wise logical NOT operation. - -### Operands: -1. `lhs`: tensor of 1-bit integer values - -### Attributes: - -### Results: -1. 
`output`: tensor of 1-bit integer values - -## tfl.logical_or (TFL::LogicalOrOp) -Logical OR operator - -### Description: - -Element-wise logical OR operation. - -### Operands: -1. `lhs`: tensor of 1-bit integer values -1. `rhs`: tensor of 1-bit integer values - -### Attributes: - -### Results: -1. `output`: tensor of 1-bit integer values - -## tfl.logistic (TFL::LogisticOp) -Logistic operator - -### Description: - -Computes element-wise Sigmoid of input - -### Operands: -1. `x`: tensor of floating-point values - -### Attributes: - -### Results: -1. `y`: tensor of floating-point values - -## tfl.max_pool_2d (TFL::MaxPool2DOp) -Max Pool 2D op - -### Description: - -Performs max pool 2D on input. - -Inputs: - `inputs[0]`: required: the input tensor - -### Operands: -1. `input`: tensor of any type values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `padding` | `StringAttr` | padding enum attribute | -| `stride_w` | `IntegerAttr` | 32-bit integer attribute attribute | -| `stride_h` | `IntegerAttr` | 32-bit integer attribute attribute | -| `filter_width` | `IntegerAttr` | 32-bit integer attribute attribute | -| `filter_height` | `IntegerAttr` | 32-bit integer attribute attribute | -| `fused_activation_function` | `StringAttr` | fused activation enum attribute | - -### Results: -1. `output`: tensor of any type values - -## tfl.maximum (TFL::MaximumOp) -Max operator - -### Description: - -Element-wise max operation. - -### Operands: -1. `lhs`: tensor of floating-point or 32/64-bit integer values -1. `rhs`: tensor of floating-point or 32/64-bit integer values - -### Attributes: - -### Results: -1. `max`: tensor of floating-point or 32/64-bit integer values - -## tfl.mean (TFL::MeanOp) -Mean operator - -### Description: - -Computes the mean of elements across dimensions of a tensor. -Reduces input_tensor along the dimensions given in axis. -Unless keepdims is true, the rank of the tensor is reduced by 1 for -each entry in axis. If keepdims is true, the reduced dimensions are retained -with length 1. - -### Operands: -1. `input`: tensor of 32-bit float or 8-bit integer or 32-bit integer or 64-bit integer values -1. `axis`: tensor of 32-bit integer or 64-bit integer values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `keep_dims` | `BoolAttr` | bool attribute attribute | - -### Results: -1. `output`: tensor of 32-bit float or 32-bit integer or 64-bit integer or 8-bit integer values - -## tfl.minimum (TFL::MinimumOp) -Min operator - -### Description: - -Element-wise min operation. - -### Operands: -1. `lhs`: tensor of floating-point or 32/64-bit integer values -1. `rhs`: tensor of floating-point or 32/64-bit integer values - -### Attributes: - -### Results: -1. `min`: tensor of floating-point or 32/64-bit integer values - -## tfl.mul (TFL::MulOp) -Multiplication operator - -### Description: - -Element-wise multiplication operation. - -### Operands: -1. `lhs`: tensor of any type values -1. `rhs`: tensor of any type values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `fused_activation_function` | `StringAttr` | fused activation enum attribute | - -### Results: -1. `output`: tensor of any type values - -## tfl.neg (TFL::NegOp) -Negation operator - -### Description: - -Computes element-wise negation of input - -### Operands: -1. `x`: tensor of any type values - -### Attributes: - -### Results: -1. 
`y`: tensor of any type values - -## tfl.not_equal (TFL::NotEqualOp) -Not_equal operator - -### Description: - -Element-wise not_equal operation. - -### Operands: -1. `lhs`: tensor of any type values -1. `rhs`: tensor of any type values - -### Attributes: - -### Results: -1. `output`: tensor of 1-bit integer values - -## tfl.pack (TFL::PackOp) -Packs a list of tensors along a dimension into one tensor - -### Description: - -Packs a list of `values_count` rank-`R` tensors into one rank-`(R+1)` -tensor. - -Packs the `values_count` tensors in `values` into a tensor with rank one -higher than each tensor in `values`, by packing them along the `axis` -dimension. - -Given a list of tensors of shape `(A, B, C)`; - -if `axis == 0` then the `output` tensor will have the shape `(N, A, B, C)`. -if `axis == 1` then the `output` tensor will have the shape `(A, N, B, C)`. -Etc. - -For example: - -``` -# 'x' is [1, 4] -# 'y' is [2, 5] -# 'z' is [3, 6] -pack([x, y, z]) => [[1, 4], [2, 5], [3, 6]] # Pack along first dim. -pack([x, y, z], axis=1) => [[1, 2, 3], [4, 5, 6]] -``` - -This is the opposite of `unpack`. - -### Operands: -1. `values`: tensor of 32-bit float or 8-bit integer or 16-bit integer or 32-bit integer or 64-bit integer values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `values_count` | `IntegerAttr` | 32-bit integer attribute attribute | -| `axis` | `IntegerAttr` | 32-bit integer attribute attribute | - -### Results: -1. `output`: tensor of 32-bit float or 8-bit integer or 16-bit integer or 32-bit integer or 64-bit integer values - -## tfl.pad (TFL::PadOp) -Padding operator - -### Description: - -This operation pads a `input` with zeros according to the `paddings` you -specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is -the rank of `input`. For each dimension D of `input`, `paddings[D, 0]` -indicates how many zeros to add before the contents of `input` in that -dimension, and `paddings[D, 1]` indicates how many zeros to add after the -contents of `input` in that dimension. - -The padded size of each dimension D of the output is: - - `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)` - -For example: - -``` -# 't' is [[1, 1], [2, 2]] -# 'paddings' is [[1, 1], [2, 2]] -# rank of 't' is 2 -pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0] - [0, 0, 1, 1, 0, 0] - [0, 0, 2, 2, 0, 0] - [0, 0, 0, 0, 0, 0]] - -### Operands: -1. `input`: tensor of 32-bit float or 8-bit integer or 32-bit integer or 64-bit integer values -1. `padding`: tensor of 32/64-bit integer values - -### Attributes: - -### Results: -1. `output`: tensor of 32-bit float or 8-bit integer or 32-bit integer or 64-bit integer values - -## tfl.padv2 (TFL::PadV2Op) -Padding operator v2 - -### Description: - -This operation pads a `input` according to the `paddings` and -`constant_values` you specify. `paddings` is an integer tensor with shape -`[Dn, 2]`, where n is the rank of `input`. For each dimension D of `input`, -`paddings[D, 0]` indicates how many zeros to add before the contents of -`input` in that dimension, and `paddings[D, 1]` indicates how many zeros to -add after the contents of `input` in that dimension. `constant_values` is a -scalar tensor of the same type as `input` that indicates the value to use -for padding `input`. 
- -The padded size of each dimension D of the output is: - - `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)` - -For example: - -``` -# 't' is [[1, 1], [2, 2]] -# 'paddings' is [[1, 1], [2, 2]] -# rank of 't' is 2 -pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0] - [0, 0, 1, 1, 0, 0] - [0, 0, 2, 2, 0, 0] - [0, 0, 0, 0, 0, 0]] - -### Operands: -1. `input`: tensor of 32-bit float or 8-bit integer or 32-bit integer or 64-bit integer values -1. `padding`: tensor of 32/64-bit integer values -1. `constant_values`: tensor of 32-bit float or 8-bit integer or 32-bit integer or 64-bit integer values - -### Attributes: - -### Results: -1. `output`: tensor of 32-bit float or 8-bit integer or 32-bit integer or 64-bit integer values - -## tfl.pow (TFL::PowOp) -Power operator - -### Description: - -Element-wise power operation. - -### Operands: -1. `lhs`: tensor of any type values -1. `rhs`: tensor of any type values - -### Attributes: - -### Results: -1. `output`: tensor of any type values - -## tfl.pseudo_qconst (TFL::QConstOp) -Quantized constant pseudo op - -### Description: - -Represents a quantized constant value in TensorFlow Lite dialect. This is -not an actual operation and it will be lowered to buffer instead. The -quantization parameters are stored as a type attribute in this constant. - -### Operands: - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `qtype` | `TypeAttr` | Tensor type attribute attribute | -| `value` | `ElementsAttr` | constant vector/tensor attribute attribute | - -### Results: -1. `output`: tensor of any type values - -## tfl.quantize (TFL::QuantizeOp) -Quantize operator - -### Description: - -Converts floating point tensors to quantized integer tensors according to -the quantization parameters defined in the type attribute. - -### Operands: -1. `input`: tensor of any type values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `qtype` | `TypeAttr` | Tensor type attribute attribute | - -### Results: -1. `output`: tensor of any type values - -## tfl.range (TFL::RangeOp) -Range operator - -### Description: - -Returns a 1D tensor defined by a sequence from `start` to `limit` with -a given `delta`. - -### Operands: -1. `start`: tensor of any type values -1. `limit`: tensor of any type values -1. `delta`: tensor of any type values - -### Attributes: - -### Results: -1. `result`: tensor of any type values - -## tfl.rank (TFL::RankOp) -Rank operator. - -### Description: - -Returns the rank of a tensor. - -### Operands: -1. `input`: tensor of any type values - -### Attributes: - -### Results: -1. `output`: tensor of any integer type - -## tfl.reduce_max (TFL::ReduceMaxOp) -Max-reduction operator - -### Description: - -Computes the max reduction along the specified axes - -### Operands: -1. `input`: tensor of any type values -1. `axes`: tensor of 32/64-bit integer values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `keep_dims` | `BoolAttr` | bool attribute attribute | - -### Results: -1. «unnamed»: tensor of any type values - -## tfl.reduce_min (TFL::ReduceMinOp) -Min-reduction operator - -### Description: - -Computes the min reduction along the specified axes - -### Operands: -1. `input`: tensor of any type values -1. 
`axes`: tensor of 32/64-bit integer values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `keep_dims` | `BoolAttr` | bool attribute attribute | - -### Results: -1. «unnamed»: tensor of any type values - -## tfl.relu6 (TFL::Relu6Op) -Relu6 operator - -### Description: - -Element-wise Relu6 operator - x -> max(0, min(6, x)) - -### Operands: -1. `x`: tensor of any type values - -### Attributes: - -### Results: -1. `y`: tensor of any type values - -## tfl.relu (TFL::ReluOp) -Relu operator - -### Description: - -Element-wise Relu operator - x -> max(0, x) - -### Operands: -1. `x`: tensor of any type values - -### Attributes: - -### Results: -1. `y`: tensor of any type values - -## tfl.reshape (TFL::ReshapeOp) -Reshape operator - -### Description: - -Produces a tensor with the same values but different static shape defined -by the output type. - -### Operands: -1. `input`: tensor of any type values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `new_shape` | `Attribute` | derived attribute attribute | - -### Results: -1. `output`: tensor of any type values - -## tfl.resize_bilinear (TFL::ResizeBilinearOp) -ResizeBilinear Op - -### Description: - -Resize `images` to `size` using bilinear interpolation. - -### Operands: -1. `input`: tensor of 32-bit float or 32-bit integer values -1. `size`: tensor of 32-bit integer values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `align_corners` | `BoolAttr` | bool attribute attribute | - -### Results: -1. `output`: tensor of 32-bit float values - -## tfl.reverse_v2 (TFL::ReverseV2Op) -ReverseV2 Operator - -### Description: - -Reverses specific dimensions of a tensor. - -Given a tensor, and a int32/int64 tensor axis representing the set -of dimensions of tensor to reverse. -This operation reverses each dimension i for -which there exists j s.t. axis[j] == i. - -Args: - tensor: A Tensor. Must be one of the following types: - int16, int32, int64, float32 Up to 8-D. - - axis: A Tensor. Must be one of the following types: int32, int64. - with only 1 element which is the axis index. - TODO: Add support for multiple elements. - -### Operands: -1. `input`: tensor of 32-bit float or 16-bit integer or 32-bit integer or 64-bit integer values -1. `axis`: tensor of 32-bit integer or 64-bit integer values - -### Attributes: - -### Results: -1. `output`: tensor of 32-bit float or 16-bit integer or 32-bit integer or 64-bit integer or 8-bit integer values - -## tfl.rsqrt (TFL::RsqrtOp) -Reciprocal of square root operator - -### Description: - -Computes element-wise reverse square root of input - -### Operands: -1. `x`: tensor of any type values - -### Attributes: - -### Results: -1. `y`: tensor of any type values - -## tfl.select (TFL::SelectOp) -Select operator - -### Description: - -Select values of 'x' if the corresponding value of 'condition' is true or -the value of 'y' if false. There are valid condition input sizes: - -1. Either the same shape (in which case the select is elementwise), or -2. condition must be Rank 1 and match over the first dimension. - -### Operands: -1. `condition`: tensor of 1-bit integer values -1. `x`: tensor of 32-bit float or 1-bit integer or 8-bit integer or 16-bit integer or 32-bit integer or 64-bit integer values -1. 
`y`: tensor of 32-bit float or 1-bit integer or 8-bit integer or 16-bit integer or 32-bit integer or 64-bit integer values - -### Attributes: - -### Results: -1. `output`: tensor of any type values - -## tfl.shape (TFL::ShapeOp) -Shape operator - -### Description: - -Returns the shape of a tensor. - -### Operands: -1. `input`: tensor of any type values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `out_type` | `Attribute` | derived attribute attribute | - -### Results: -1. `output`: tensor of any type values - -## tfl.sin (TFL::SinOp) -Sine operator - -### Description: - -Computes element-wise Sine of input - -### Operands: -1. `x`: tensor of floating-point values - -### Attributes: - -### Results: -1. `y`: tensor of floating-point values - -## tfl.softmax (TFL::SoftmaxOp) -Softmax operator - -### Description: - -Computes element-wise softmax activiations with the following formula - - exp(input) / tf.reduce_sum(exp(input * beta), dim) - -### Operands: -1. `input`: tensor of any type values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `beta` | `FloatAttr` | 32-bit float attribute attribute | - -### Results: -1. `output`: tensor of any type values - -## tfl.space_to_batch_nd (TFL::SpaceToBatchNdOp) -SpaceToBatchNd operator - -### Description: - -This operation reshapes space dimensions into the "batch" dimension 0 - -### Operands: -1. `input`: tensor of 32-bit float or 8-bit integer or 32-bit integer or 64-bit integer values -1. `block_shape`: tensor of 32-bit integer values -1. `paddings`: tensor of 32-bit integer values - -### Attributes: - -### Results: -1. `output`: tensor of 32-bit float or 16-bit integer or 32-bit integer or 64-bit integer values - -## tfl.split (TFL::SplitOp) -Splits a tensor into `num_split` tensors along one dimension. - -### Description: - -Splits the `value` tensor along `split_dim` into a number of sub-tensors -with same shape as the original one, except for `split_dim`. Same as -tf.Split. - -### Operands: -1. `split_dim`: tensor of 32-bit integer values -1. `value`: tensor of 32-bit float or 16-bit integer or 32-bit integer or 64-bit integer values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `num_splits` | `IntegerAttr` | 32-bit integer attribute attribute | - -### Results: -1. `outputs`: tensor of 32-bit float or 16-bit integer or 32-bit integer or 64-bit integer values - -## tfl.split_v (TFL::SplitVOp) -Splits a tensor into `num_split` tensors along one dimension. - -### Description: - -Splits the `value` tensor along `split_dim` into a number of sub-tensors -with same shape as the original one, except for `split_dim`. The grouping -of the resultant sub-tensors is decided by `size-splits`. Same as tf.SplitV. - -### Operands: -1. `value`: tensor of 32-bit float or 16-bit integer or 32-bit integer or 64-bit integer values -1. `size_splits`: tensor of 32-bit integer values -1. `split_dim`: tensor of 32-bit integer values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `num_splits` | `IntegerAttr` | 32-bit integer attribute attribute | - -### Results: -1. `outputs`: tensor of 32-bit float or 16-bit integer or 32-bit integer or 64-bit integer values - -## tfl.sqrt (TFL::SqrtOp) -Square root operator - -### Description: - -Computes element-wise Square root of input - -### Operands: -1. 
`x`: tensor of any type values - -### Attributes: - -### Results: -1. `y`: tensor of any type values - -## tfl.square (TFL::SquareOp) -Square operator - -### Description: - -Computes element-wise Square of input - -### Operands: -1. `x`: tensor of any type values - -### Attributes: - -### Results: -1. `y`: tensor of any type values - -## tfl.squared_difference (TFL::SquaredDifferenceOp) -Squared difference operator - -### Description: - -Element-wise squared difference operation. - -### Operands: -1. `lhs`: tensor of any type values -1. `rhs`: tensor of any type values - -### Attributes: - -### Results: -1. `output`: tensor of any type values - -## tfl.squeeze (TFL::SqueezeOp) -Removes dimensions of size 1 from the shape of a tensor. - -### Description: - -Given a tensor `input`, this operation returns a tensor of the same type with -all dimensions of size 1 removed. If you don't want to remove all size 1 -dimensions, you can remove specific size 1 dimensions by specifying -`axis`. - -For example: - -``` -# 't' is a tensor of shape [1, 2, 1, 3, 1, 1] -shape(squeeze(t)) ==> [2, 3] -``` - -Or, to remove specific size 1 dimensions: - -``` -# 't' is a tensor of shape [1, 2, 1, 3, 1, 1] -shape(squeeze(t, [2, 4])) ==> [1, 2, 3, 1] -``` - -### Operands: -1. `input`: tensor of any type values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `squeeze_dims` | `ArrayAttr` | 64-bit integer array attribute attribute | - -### Results: -1. `output`: tensor of any type values - -## tfl.strided_slice (TFL::StridedSliceOp) -StridedSlice Op - -### Description: - -Return a strided slice from `input`. - -### Operands: -1. `input`: tensor of 32-bit float or 32-bit integer or 64-bit integer or 8-bit integer values -1. `begin`: tensor of 32-bit integer values -1. `end`: tensor of 32-bit integer values -1. `strides`: tensor of 32-bit integer values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `begin_mask` | `IntegerAttr` | 32-bit integer attribute attribute | -| `end_mask` | `IntegerAttr` | 32-bit integer attribute attribute | -| `ellipsis_mask` | `IntegerAttr` | 32-bit integer attribute attribute | -| `new_axis_mask` | `IntegerAttr` | 32-bit integer attribute attribute | -| `shrink_axis_mask` | `IntegerAttr` | 32-bit integer attribute attribute | - -### Results: -1. `output`: tensor of 32-bit float or 32-bit integer or 64-bit integer or 8-bit integer values - -## tfl.sub (TFL::SubOp) -Subtraction operator - -### Description: - -Element-wise subtraction operation. - -### Operands: -1. `lhs`: tensor of any type values -1. `rhs`: tensor of any type values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `fused_activation_function` | `StringAttr` | fused activation enum attribute | - -### Results: -1. `output`: tensor of any type values - -## tfl.sum (TFL::SumOp) -Sum operator - -### Description: - -Computes the sum reduction along the specified axes - -### Operands: -1. `input`: tensor of any type values -1. `axes`: tensor of 32/64-bit integer values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `keep_dims` | `BoolAttr` | bool attribute attribute | - -### Results: -1. «unnamed»: tensor of any type values - -## tfl.tanh (TFL::TanhOp) -Hyperbolic tangent operator - -### Description: - -Computes element-wise Hyperbolic tangent of input - -### Operands: -1. 
`x`: tensor of any type values - -### Attributes: - -### Results: -1. `y`: tensor of any type values - -## tfl.tile (TFL::TileOp) -Tile operator. - -### Description: - - Constructs a tensor by tiling a given tensor. - -This operation creates a new tensor by replicating input -multiples times. The output tensor's i'th dimension has -input.dims(i) * multiples[i] elements, and the values of input -are replicated multiples[i] times along the 'i'th dimension. -For example, tiling [a b c d] by [2] produces [a b c d a b c d]. - -### Operands: -1. `input`: tensor of any type values -1. `multiples`: tensor of 32/64-bit integer values - -### Attributes: - -### Results: -1. `output`: tensor of any type values - -## tfl.topk_v2 (TFL::TopKV2Op) -TopK operator - -### Description: - -Returns the top `k` largest element along each last dimensional slice of -`input` and the indices of values within the last dimension of the input -tensor. - -### Operands: -1. `input`: tensor of 32-bit float or 8-bit integer or 32-bit integer or 64-bit integer values -1. `k`: tensor of 32-bit integer values - -### Attributes: - -### Results: -1. `values`: tensor of any type values -1. `indices`: tensor of 32-bit integer values - -## tfl.transpose (TFL::TransposeOp) -Transpose operator - -### Description: - -Returns the Transpose of x - -### Operands: -1. `x`: tensor of any type values -1. `perm`: tensor of any type values - -### Attributes: - -### Results: -1. `y`: tensor of any type values - -## tfl.unidirectional_sequence_lstm (TFL::UnidirectionalSequenceLSTMOp) -Unidirectional sequence lstm operator - -### Description: - -A recurrent neural network specified by an LSTM cell. This Op supports -unrolling the input along the time or batch dimensions, and -implements the following operation for -each element in the sequence s = 1...sequence_length: - outputs[s] = state = activation(LSTMOp(inputs[s])) - -where LSTMOp is LSTM TF Lite Op and the “activation” is the function passed -as the “fused_activation_function” argument (if not “NONE”). - -### Operands: -1. `input`: tensor of 32-bit float or 8-bit integer values -1. `input_to_input_weights`: tensor of 32-bit float or 8-bit integer values or none type -1. `input_to_forget_weights`: tensor of 32-bit float or 8-bit integer values -1. `input_to_cell_weights`: tensor of 32-bit float or 8-bit integer values -1. `input_to_output_weights`: tensor of 32-bit float or 8-bit integer values -1. `recurrent_to_input_weights`: tensor of 32-bit float or 8-bit integer values or none type -1. `recurrent_to_forget_weights`: tensor of 32-bit float or 8-bit integer values -1. `recurrent_to_cell_weights`: tensor of 32-bit float or 8-bit integer values -1. `recurrent_to_output_weights`: tensor of 32-bit float or 8-bit integer values -1. `cell_to_input_weights`: tensor of 32-bit float or 8-bit integer values or none type -1. `cell_to_forget_weights`: tensor of 32-bit float or 8-bit integer values or none type -1. `cell_to_output_weights`: tensor of 32-bit float or 8-bit integer values or none type -1. `input_gate_bias`: tensor of 32-bit float values or none type -1. `forget_gate_bias`: tensor of 32-bit float values -1. `cell_bias`: tensor of 32-bit float values -1. `output_gate_bias`: tensor of 32-bit float values -1. `projection_weights`: tensor of 32-bit float or 8-bit integer values or none type -1. `projection_bias`: tensor of 32-bit float values or none type -1. `input_activation_state`: stateful tensor -1. `input_cell_state`: stateful tensor -1. 
`input_layer_norm_coefficients`: tensor of 32-bit float or 8-bit integer values or none type -1. `forget_layer_norm_coefficients`: tensor of 32-bit float or 8-bit integer values or none type -1. `cell_layer_norm_coefficients`: tensor of 32-bit float or 8-bit integer values or none type -1. `output_layer_norm_coefficients`: tensor of 32-bit float or 8-bit integer values or none type - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `fused_activation_function` | `StringAttr` | fused activation enum attribute | -| `cell_clip` | `FloatAttr` | 32-bit float attribute attribute | -| `proj_clip` | `FloatAttr` | 32-bit float attribute attribute | -| `time_major` | `BoolAttr` | bool attribute attribute | - -### Results: -1. `output`: tensor of any type values - -## tfl.unpack (TFL::UnpackOp) -Unpacks a tensor along a dimension into multiple tensors - -### Description: - -Unpacks a given dimension of a rank-`R` tensor into `num` rank-`(R-1)` tensors. - -Unpacks `num` tensors from `value` by chipping it along the `axis` dimension. -For example, given a tensor of shape `(A, B, C, D)`; - -If `axis == 0` then the i'th tensor in `output` is the slice `value[i, :, :, :]` - and each tensor in `output` will have shape `(B, C, D)`. (Note that the - dimension unpacked along is gone, unlike `split`). - -If `axis == 1` then the i'th tensor in `output` is the slice `value[:, i, :, :]` - and each tensor in `output` will have shape `(A, C, D)`. -Etc. - -This is the opposite of `pack`. - -### Operands: -1. `input`: tensor of 32-bit float or 8-bit integer or 32-bit integer values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `num` | `IntegerAttr` | 32-bit integer attribute attribute | -| `axis` | `IntegerAttr` | 32-bit integer attribute attribute | - -### Results: -1. `outputs`: tensor of 32-bit float or 8-bit integer or 32-bit integer values - -## tfl.zeros_like (TFL::ZerosLikeOp) -ZerosLike operator - -### Description: - -Returns a tensor of zeros with the same shape and type as the input tensor. - -### Operands: -1. `input`: tensor of any type values - -### Attributes: - -### Results: -1. `output`: tensor of any type values - diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc b/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc index 6c91470da07..c3dd7f5a398 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc @@ -15,13 +15,21 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include + +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/FormatVariadic.h" +#include "mlir/Dialect/StandardOps/Ops.h" // TF:local_config_mlir +#include "mlir/IR/Attributes.h" // TF:local_config_mlir #include "mlir/IR/Builders.h" // TF:local_config_mlir #include "mlir/IR/Matchers.h" // TF:local_config_mlir #include "mlir/IR/OpImplementation.h" // TF:local_config_mlir #include "mlir/IR/PatternMatch.h" // TF:local_config_mlir #include "mlir/IR/StandardTypes.h" // TF:local_config_mlir -#include "mlir/StandardOps/Ops.h" // TF:local_config_mlir -#include "mlir/Support/TypeUtilities.h" // TF:local_config_mlir +#include "mlir/IR/TypeUtilities.h" // TF:local_config_mlir +#include "mlir/Support/LLVM.h" // TF:local_config_mlir #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" namespace mlir { @@ -54,13 +62,21 @@ inline bool IsTrailingDimensions(ArrayRef a, ArrayRef b) { return std::equal(a.rbegin(), a.rend(), b.rbegin()); } +// Returns true if it is a shaped type of f32 elements. +inline bool IsF32ShapedType(Type t) { + if (auto shaped_type = t.dyn_cast_or_null()) { + return shaped_type.getElementType().isF32(); + } + return false; +} + // Performs const folding `calculate` with broadcast behavior on the two // attributes `operand1` and `operand2` and returns the result if possible. // The two operands are expected to both be scalar values. template > + llvm::function_ref> Attribute ConstFoldBinaryOpScalarScalar(Type result_type, Attribute operand1, Attribute operand2, const CalculationT &calculate) { @@ -75,100 +91,68 @@ Attribute ConstFoldBinaryOpScalarScalar(Type result_type, Attribute operand1, calculate(lhs.getValue(), rhs.getValue())); } -// TODO: We have multiple functions to handle different attriubte kinds in the -// following. Consider add methods to ElementsAttr to unify these functions. - -// Performs const folding `calculate` with broadcast behavior on the two -// attributes `operand1` and `operand2` and returns the result if possible. -// This function assumes that both operands are `AttrElementT` attributes. -template > -Attribute ConstFoldBinaryOpSplatSplat(Type result_type, Attribute operand1, - Attribute operand2, - const CalculationT &calculate) { - auto type = result_type.cast(); - auto elem_type = type.getElementType(); - - auto element_result = ConstFoldBinaryOpScalarScalar( - elem_type, operand1, operand2, calculate); - if (!element_result) return {}; - - return DenseElementsAttr::get(type, element_result); -} - /// Performs const folding `calculate` with broadcast behavior on the two /// attributes `operand1` and `operand2` and returns the result if possible. -/// This function assumes the first operand is a DenseElementsAttr and the -/// second one is a SplatElementsAttr, and both are verified to have value +/// This function assumes the both operands are verified to have value /// attributes of broadcastable types. 
template > -Attribute ConstFoldBinaryOpDenseSplat(Type result_type, Attribute operand1, - Attribute operand2, + llvm::function_ref> +Attribute ConstFoldBinaryOpDenseDense(Type result_type, DenseElementsAttr lhs, + DenseElementsAttr rhs, const CalculationT &calculate) { - auto lhs = operand1.cast(); - - // TODO: Support broadcast behavior - if (lhs.getType() != result_type || operand2.getType() != result_type) - return {}; - - auto rhs = operand2.cast().getSplatValue(); auto type = result_type.cast(); - SmallVector new_values; - new_values.reserve(lhs.rawSize()); - - // Add the splat value to each of the values in the dense elements - // attribute. - auto rhs_val = rhs.cast().getValue(); - for (auto old_val : lhs.getValues()) { - new_values.push_back(calculate(old_val, rhs_val)); - } - - return DenseElementsAttr::get(type, new_values); -} - -/// Performs const folding `calculate` with broadcast behavior on the two -/// attributes `operand1` and `operand2` and returns the result if possible. -/// This function assumes the both operands are DenseElementsAttr and verified -/// to have value attributes of broadcastable types. -template > -Attribute ConstFoldBinaryOpDenseDense(Type result_type, Attribute operand1, - Attribute operand2, - const CalculationT &calculate) { - auto lhs = operand1.cast(); - auto rhs = operand2.cast(); - if (lhs.getType() != rhs.getType()) { // We only support the case that one of the operand's dimensions are // a perfect suffix of the other. // TODO: support the general broadcast behavior. auto lhs_shape = lhs.getType().getShape(); auto rhs_shape = rhs.getType().getShape(); - if (!IsTrailingDimensions(lhs_shape, rhs_shape) && - !IsTrailingDimensions(rhs_shape, lhs_shape)) + if (IsTrailingDimensions(lhs_shape, rhs_shape)) { + if (!type.hasStaticShape()) type = rhs.getType(); + } else if (IsTrailingDimensions(rhs_shape, lhs_shape)) { + if (!type.hasStaticShape()) type = lhs.getType(); + } else { return {}; + } + } else if (!type.hasStaticShape()) { + type = lhs.getType(); + } + + const bool rhs_is_splat = rhs.isSplat(); + const bool lhs_is_splat = lhs.isSplat(); + + // If both of them are splat, compute and return. + if (lhs_is_splat && rhs_is_splat) { + auto element_result = AttrElementT::get( + type.getElementType(), calculate(lhs.getSplatValue(), + rhs.getSplatValue())); + if (!element_result) return {}; + + return DenseElementsAttr::get(type, element_result); } auto lhs_num_elements = lhs.getType().getNumElements(); auto rhs_num_elements = rhs.getType().getNumElements(); - - auto type = result_type.cast(); - auto num_elements = type.getNumElements(); + auto num_elements = std::max(lhs_num_elements, rhs_num_elements); // We assume the arguments have broadcast-compatible types. Make sure again. 
assert(std::max(lhs_num_elements, rhs_num_elements) == num_elements); assert(num_elements % std::min(lhs_num_elements, rhs_num_elements) == 0); - SmallVector lhs_old_values(lhs.getValues()); - SmallVector rhs_old_values(rhs.getValues()); + SmallVector lhs_old_values; + SmallVector rhs_old_values; + if (lhs_is_splat) + lhs_old_values.push_back(lhs.getSplatValue()); + else + lhs_old_values = llvm::to_vector<16>(lhs.getValues()); + if (rhs_is_splat) + rhs_old_values.push_back(rhs.getSplatValue()); + else + rhs_old_values = llvm::to_vector<16>(rhs.getValues()); + SmallVector new_values; new_values.reserve(num_elements); @@ -186,8 +170,8 @@ Attribute ConstFoldBinaryOpDenseDense(Type result_type, Attribute operand1, // operand with more elements, since the result has the same number of // elements, we are only going over its elements once. The modulo operation // also works for that. - int lhs_index = i % lhs_num_elements; - int rhs_index = i % rhs_num_elements; + int lhs_index = lhs_is_splat ? 0 : (i % lhs_num_elements); + int rhs_index = rhs_is_splat ? 0 : (i % rhs_num_elements); new_values.push_back( calculate(lhs_old_values[lhs_index], rhs_old_values[rhs_index])); @@ -203,7 +187,7 @@ Attribute ConstFoldBinaryOpDenseDense(Type result_type, Attribute operand1, template > + llvm::function_ref> Attribute ConstFoldBinaryOp(Type result_type, Attribute operand1, Attribute operand2, const CalculationT &calculate, bool is_commutative) { @@ -212,30 +196,11 @@ Attribute ConstFoldBinaryOp(Type result_type, Attribute operand1, if (operand2.dyn_cast_or_null()) return ConstFoldBinaryOpScalarScalar(result_type, operand1, operand2, calculate); - } else if (auto lhs = operand1.dyn_cast_or_null()) { - // Splat op splat case - if (auto rhs = operand2.dyn_cast_or_null()) - return ConstFoldBinaryOpSplatSplat( - result_type, lhs.getSplatValue(), rhs.getSplatValue(), calculate); - - // Splat op dense case - if (auto rhs = operand2.dyn_cast_or_null()) { - if (is_commutative) { - // Swap the two constant values to fall into the following case - return ConstFoldBinaryOpDenseSplat(result_type, operand2, - operand1, calculate); - } - } - } else if (auto lhs = operand1.dyn_cast_or_null()) { - // Dense op splat case - if (auto rhs = operand2.dyn_cast_or_null()) - return ConstFoldBinaryOpDenseSplat(result_type, operand1, - operand2, calculate); - - // Dense op dense case - if (auto rhs = operand2.dyn_cast_or_null()) - return ConstFoldBinaryOpDenseDense(result_type, operand1, - operand2, calculate); + } else if (operand1.dyn_cast_or_null() && + operand2.dyn_cast_or_null()) { + return ConstFoldBinaryOpDenseDense( + result_type, operand1.cast(), + operand2.cast(), calculate); } // TODO: support other attribute kinds @@ -249,8 +214,9 @@ Attribute ConstFoldBinaryOp(Type result_type, Attribute operand1, /// `intCalculate` is chosen to conduct the calculate. Attribute ConstFoldBinaryOp( Type result_type, ArrayRef operands, - std::function float_calculate, - std::function int_calculate, bool is_commutative) { + llvm::function_ref float_calculate, + llvm::function_ref int_calculate, + bool is_commutative) { // Note: All types are wrapped in tensor types in TFlite. E.g., f32 is // represented as tensor. So we are only handling tensor types here. auto type = result_type.dyn_cast(); @@ -269,6 +235,32 @@ Attribute ConstFoldBinaryOp( return {}; } +/// Performs const folding a attributes `operand` and returns the result if +/// possible. +/// The function currently asserts that the `result_type` to be a f32 tensor +/// type. 
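+/// For example, the unary fold hooks below (SqrtOp::fold, SinOp::fold, etc.)
+/// call this helper with a per-element APFloat lambda.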
+/// TODO: Extend this function to handle integral tensor for ops like +/// "tfl.logical_not". +Attribute ConstFoldUnaryOp(Type result_type, Attribute operand, + llvm::function_ref calculate) { + assert(IsF32ShapedType(result_type)); + auto result_shape_type = result_type.cast(); + + if (auto dense_elements = operand.dyn_cast_or_null()) { + SmallVector new_values; + const int num_elements = result_shape_type.getNumElements(); + new_values.reserve(num_elements); + + for (APFloat old_value : dense_elements.getValues()) { + new_values.push_back(calculate(old_value)); + } + + return DenseElementsAttr::get(result_shape_type, new_values); + } + + return {}; +} + void buildComparisonBinOp(Builder *builder, OperationState *result, Value *lhs, Value *rhs) { auto result_type = @@ -410,6 +402,23 @@ static LogicalResult Verify(PackOp op) { if (op.getOperation()->getNumOperands() != op.values_count()) return op.emitOpError("input count should match 'values_count' attribute"); + Value *operand0 = op.getOperand(0); + auto input_type = operand0->getType().cast(); + + // Check axis bounds. + int64_t axis_value = op.axis().getSExtValue(); + if (abs(axis_value) > input_type.getRank()) + return op.emitOpError("op attribute 'axis' is out of bounds, got ") + << axis_value; + + // Make sure all inputs have the same shape and element type. + // TODO(rahulsp): Simplify once b/135032064 is fixed. + for (Value *operand : op.getOperands()) { + auto other_type = operand->getType().cast(); + if (input_type != other_type) + return op.emitOpError("operands should be of the same type"); + } + return success(); } @@ -453,12 +462,87 @@ OpFoldResult ReshapeOp::fold(ArrayRef operands) { // Remove identity reshape. if (getType() == getOperand()->getType()) return getOperand(); + // Constant folding + assert(operands.size() == 1); + if (auto dense_elements = operands[0].dyn_cast_or_null()) { + auto result_shape_type = getType().cast(); + return dense_elements.reshape(result_shape_type); + } + return nullptr; } void ReshapeOp::getCanonicalizationPatterns(OwningRewritePatternList &results, MLIRContext *context) { - results.push_back(llvm::make_unique(context)); + results.insert(context); +} + +//===----------------------------------------------------------------------===// +// SliceOp +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(SliceOp op) { + auto input_type = op.input()->getType().cast(); + auto begin_type = op.begin()->getType().cast(); + auto size_type = op.size()->getType().cast(); + if (input_type.hasStaticShape() && begin_type.hasStaticShape() && + size_type.hasStaticShape()) { + if (input_type.getRank() != begin_type.getNumElements()) { + return op.emitError( + "begin tensor elements size is not equal to input tensor rank"); + } + + if (input_type.getRank() != size_type.getNumElements()) { + return op.emitError( + "size tensor elements size is not equal to input tensor rank"); + } + } + + DenseIntElementsAttr begin; + if (matchPattern(op.begin(), m_Constant(&begin))) { + int axis = 0; + for (auto begin_i : llvm::enumerate(begin)) { + if (begin_i.value().getSExtValue() < 0) { + return op.emitError( + llvm::formatv("begin[{0}] cannot be negative", axis)); + } + axis++; + } + } + + DenseIntElementsAttr size; + if (matchPattern(op.size(), m_Constant(&size))) { + int axis = 0; + for (auto size_i : llvm::enumerate(size)) { + if (size_i.value().getSExtValue() < -1) { + return op.emitError( + llvm::formatv("size[{0}] cannot be negative other than -1", axis)); + 
} + axis++; + } + } + + if (begin && size && input_type.hasStaticShape()) { + const int input_rank = begin.getNumElements(); + for (uint64_t i = 0; i < input_rank; i++) { + int begin_i = + begin.getValue({i}).cast().getValue().getSExtValue(); + int size_i = + size.getValue({i}).cast().getValue().getSExtValue(); + int dim_i = input_type.getShape()[i]; + if (begin_i >= dim_i) { + return op.emitOpError(llvm::formatv( + "begin[{0}] cannot exceed dimension length: {1}", i, dim_i)); + } + if (size_i >= 0 && begin_i + size_i > dim_i) { + return op.emitError(llvm::formatv( + "begin[{0}] + size[{0}] cannot exceed dimension length: {1}", i, + dim_i)); + } + } + } + + return success(); } //===----------------------------------------------------------------------===// @@ -486,7 +570,7 @@ static void BuildTopKOp(Builder *builder, OperationState *result, Value *input, if (matchPattern(k, m_Constant(&cst))) // These casts should all be valid due to how Tensor constants are stored. // TODO(jpienaar): This should use a helper function. - const_k = cst.getValue({}).cast().getValue().getSExtValue(); + const_k = cst.getValue({}).getValue().getSExtValue(); auto val_type = input->getType().cast(); // If value is unranked, then so is results. @@ -543,7 +627,7 @@ struct DropFakeQuant : public RewritePattern { void FakeQuantOp::getCanonicalizationPatterns(OwningRewritePatternList &results, MLIRContext *context) { - results.push_back(llvm::make_unique(context)); + results.insert(context); } //===----------------------------------------------------------------------===// @@ -562,12 +646,422 @@ static LogicalResult Verify(UnpackOp op) { return success(); } +//===----------------------------------------------------------------------===// +// SplitOp +//===----------------------------------------------------------------------===// + +// Extracts and returns the signed integer constant in a 0-rank integer tensor +// if 'value' is a constant. +static llvm::Optional ExtractConstantIntFromTensor(Value *value) { + ElementsAttr attr; + if (!matchPattern(value, m_Constant(&attr))) return {}; + + IntegerAttr int_attr = attr.getValue(llvm::None).cast(); + return int_attr.getValue().getSExtValue(); +} + +static LogicalResult Verify(SplitOp op) { + int64_t num_splits = op.num_splits().getSExtValue(); + if (op.getOperation()->getNumResults() != num_splits) + return op.emitOpError("output count should match 'num_splits' attribute"); + + // If 'split_dim' is not a constant, there are no other checks. + llvm::Optional split_dim_opt = + ExtractConstantIntFromTensor(op.split_dim()); + if (!split_dim_opt) return success(); + + // If 'input' is not a ranked tensor, there are no other checks. + auto input_type = op.value()->getType().dyn_cast(); + if (!input_type) return success(); + + int64_t split_dim = split_dim_opt.getValue(); + const int64_t rank = input_type.getRank(); + if (split_dim < 0) split_dim += rank; + if (split_dim < 0 || split_dim >= rank) + return op.emitOpError("'split_dim' should be in [-rank, rank)"); + + // If the 'split_dim' dimension of the 'input' tensor has a dynamic size, + // there are no other checks. + const int64_t dim_size = input_type.getDimSize(split_dim); + if (ShapedType::isDynamic(dim_size)) return success(); + + if (dim_size % num_splits != 0) + return op.emitOpError("'num_splits' should evenly divide 'split_dim' axis"); + + // Creates sliced tensor type. 
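+  // For example, splitting a tensor<2x6xf32> along dimension 1 with
+  // num_splits = 3 requires every result to have type tensor<2x2xf32>.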
+ auto slice_shape = input_type.getShape().vec(); + slice_shape[split_dim] = dim_size / num_splits; + RankedTensorType slice_type = + RankedTensorType::get(slice_shape, input_type.getElementType()); + + // Verifies result tensor types. + for (int64_t i = 0; i < num_splits; ++i) { + Value *result = op.getResult(i); + auto result_type = result->getType().dyn_cast(); + if (!result_type || result_type != slice_type) + return op.emitOpError() << "output #" << i << " should be " << slice_type; + } + + return success(); +} + //===----------------------------------------------------------------------===// // MeanOp //===----------------------------------------------------------------------===// // TODO(b/133854225): Implement shape inference to Mean +//===----------------------------------------------------------------------===// +// LSTMOp +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(LSTMOp op) { + auto operands = op.GetStatefulOperands(); + if (operands.size() == 2 && operands[0] == 18 && operands[1] == 19) { + return success(); + } + return op.emitError("LSTMOp expected to have two stateful operands"); +} + +//===----------------------------------------------------------------------===// +// UnidirectionalSequenceLSTMOp +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(UnidirectionalSequenceLSTMOp op) { + auto operands = op.GetStatefulOperands(); + if (operands.size() == 2 && operands[0] == 18 && operands[1] == 19) { + return success(); + } + return op.emitError( + "UnidirectionalSequenceLSTMOp expected to have two stateful operands"); +} + +//===----------------------------------------------------------------------===// +// UnidirectionalSequenceRNNOp +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(UnidirectionalSequenceRNNOp op) { + auto operands = op.GetStatefulOperands(); + if (operands.size() == 1 && operands[0] == 4) { + return success(); + } + return op.emitError( + "UnidirectionalSequenceRNNOp expected to have one stateful operand"); +} + +//===----------------------------------------------------------------------===// +// SvdfOp +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(SVDFOp op) { + auto operands = op.GetStatefulOperands(); + if (operands.size() == 1 && operands[0] == 4) { + return success(); + } + return op.emitError("SvdfOp expected to have one stateful operand"); +} + +//===----------------------------------------------------------------------===// +// AbsOp +//===----------------------------------------------------------------------===// + +OpFoldResult AbsOp::fold(ArrayRef operands) { + Type result_type = getType(); + // Only constant fold for tensor of f32 is implemented. + if (!IsF32ShapedType(result_type)) return nullptr; + + auto compute = [](APFloat value) -> APFloat { return llvm::abs(value); }; + return ConstFoldUnaryOp(result_type, operands[0], compute); +} + +//===----------------------------------------------------------------------===// +// SinOp +//===----------------------------------------------------------------------===// + +OpFoldResult SinOp::fold(ArrayRef operands) { + Type result_type = getType(); + // Only constant fold for tensor of f32 is implemented. 
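+  // For example, a splat f32 constant 0.0 folds to a splat 0.0 result.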
+ if (!IsF32ShapedType(result_type)) return nullptr; + + auto compute = [](APFloat value) -> APFloat { + float f = value.convertToFloat(); + float result = std::sin(f); + return APFloat(result); + }; + return ConstFoldUnaryOp(result_type, operands[0], compute); +} + +//===----------------------------------------------------------------------===// +// CosOp +//===----------------------------------------------------------------------===// + +OpFoldResult CosOp::fold(ArrayRef operands) { + Type result_type = getType(); + // Only constant fold for tensor of f32 is implemented. + if (!IsF32ShapedType(result_type)) return nullptr; + + auto compute = [](APFloat value) -> APFloat { + float f = value.convertToFloat(); + float result = std::cos(f); + return APFloat(result); + }; + return ConstFoldUnaryOp(result_type, operands[0], compute); +} + +//===----------------------------------------------------------------------===// +// LogOp +//===----------------------------------------------------------------------===// + +OpFoldResult LogOp::fold(ArrayRef operands) { + Type result_type = getType(); + // Only constant fold for tensor of f32 is implemented. + if (!IsF32ShapedType(result_type)) return nullptr; + + auto compute = [](APFloat value) -> APFloat { + float f = value.convertToFloat(); + float result = std::log(f); + return APFloat(result); + }; + return ConstFoldUnaryOp(result_type, operands[0], compute); +} + +//===----------------------------------------------------------------------===// +// SqrtOp +//===----------------------------------------------------------------------===// + +OpFoldResult SqrtOp::fold(ArrayRef operands) { + Type result_type = getType(); + // Only constant fold for tensor of f32 is implemented. + if (!IsF32ShapedType(result_type)) return nullptr; + + auto compute = [](APFloat value) -> APFloat { + float f = value.convertToFloat(); + float result = std::sqrt(f); + return APFloat(result); + }; + return ConstFoldUnaryOp(result_type, operands[0], compute); +} + +//===----------------------------------------------------------------------===// +// RsqrtOp +//===----------------------------------------------------------------------===// + +OpFoldResult RsqrtOp::fold(ArrayRef operands) { + Type result_type = getType(); + // Only constant fold for tensor of f32 is implemented. + if (!IsF32ShapedType(result_type)) return nullptr; + + auto compute = [](APFloat value) -> APFloat { + float f = value.convertToFloat(); + float result = 1.f / std::sqrt(f); + return APFloat(result); + }; + return ConstFoldUnaryOp(result_type, operands[0], compute); +} + +//===----------------------------------------------------------------------===// +// SquareOp +//===----------------------------------------------------------------------===// + +OpFoldResult SquareOp::fold(ArrayRef operands) { + Type result_type = getType(); + // Only constant fold for tensor of f32 is implemented. 
+ if (!IsF32ShapedType(result_type)) return nullptr; + + auto compute = [](APFloat value) -> APFloat { return value * value; }; + return ConstFoldUnaryOp(result_type, operands[0], compute); +} + +//===----------------------------------------------------------------------===// +// RankOp +//===----------------------------------------------------------------------===// + +OpFoldResult RankOp::fold(ArrayRef operands) { + assert(operands.size() == 1); + auto result_type = getType().cast(); + if (auto elements_attr = operands[0].dyn_cast_or_null()) { + auto rank = static_cast(elements_attr.getType().getRank()); + return DenseElementsAttr::get(result_type, {rank}); + } + + // Also fold if `input` has a known rank. + auto input_type = input()->getType().cast(); + // Do not fold if rank is zero because the TFLite converter doesn't + // distinguish between unranked input and scalar input due to b/138865275. + // TODO(b/138865275): Remove `input_type.getRank() != 0` in the following + // predicate and fold the op when rank is zero. + if (input_type.hasRank() && input_type.getRank() != 0) { + auto rank = static_cast(input_type.getRank()); + return DenseElementsAttr::get(result_type, {rank}); + } + + return nullptr; +} + +//===----------------------------------------------------------------------===// +// ConstOp +//===----------------------------------------------------------------------===// + +OpFoldResult ConstOp::fold(ArrayRef operands) { + assert(operands.empty() && "constant has no operands"); + + // Return the held attribute value. + return value(); +} + +//===----------------------------------------------------------------------===// +// RangeOp +//===----------------------------------------------------------------------===// + +namespace { + +// Compute the length of a range (1-D) tensor given `start`, `limit`, `delta`. +// Template parameter `FloatOrInt` must be standard C integer or floating-point +// types. +template +int GetLengthOfRange(FloatOrInt start, FloatOrInt limit, FloatOrInt delta) { + // Refer to the implementation in + // tensorflow/lite/kernels/range.cc. + return std::is_integral::value + ? ((std::abs(limit - start) + std::abs(delta) - 1) / + std::abs(delta)) + : std::ceil(std::abs((limit - start) / delta)); +} + +// Builds a constant range tensor of `result_elem_type` elements. +// Template parameter `FloatOrIntAtrr` must be mlir::IntegerAttr or +// mlir::FloatAttr. +template +DenseElementsAttr BuildConstRangeTensor(Type result_elem_type, int num_elements, + FloatOrIntAtrr start_attr, + FloatOrIntAtrr delta_attr) { + using ValueType = typename FloatOrIntAtrr::ValueType; // APInt or APFloat + ValueType start = start_attr.getValue(); + ValueType delta = delta_attr.getValue(); + + SmallVector new_values; + new_values.reserve(num_elements); + ValueType new_value = start; + for (int i = 0; i < num_elements; ++i) { + new_values.push_back(new_value); + new_value = new_value + delta; + } + // Result is always a 1-D tensor. 
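+  // For example, start = 3, delta = 4 and num_elements = 3 give the 1-D
+  // constant [3, 7, 11].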
+ auto new_result_type = + RankedTensorType::get({num_elements}, result_elem_type); + return DenseElementsAttr::get(new_result_type, new_values); +} +} // namespace + +OpFoldResult RangeOp::fold(ArrayRef operands) { + assert(operands.size() == 3); + auto start_tensor = operands[0].dyn_cast_or_null(); + auto limit_tensor = operands[1].dyn_cast_or_null(); + auto delta_tensor = operands[2].dyn_cast_or_null(); + if (start_tensor && limit_tensor && delta_tensor) { + // Operands should all be scalars + assert(start_tensor.getType().getRank() == 0 && + limit_tensor.getType().getRank() == 0 && + delta_tensor.getType().getRank() == 0); + Type elem_type = getType().cast().getElementType(); + if (elem_type.isa()) { + auto start_attr = start_tensor.getValue({}); + auto limit_attr = limit_tensor.getValue({}); + auto delta_attr = delta_tensor.getValue({}); + const int num_elements = GetLengthOfRange( + start_attr.getInt(), limit_attr.getInt(), delta_attr.getInt()); + return BuildConstRangeTensor(elem_type, num_elements, start_attr, + delta_attr); + } else if (elem_type.isa()) { + auto start_attr = start_tensor.getValue({}); + auto limit_attr = limit_tensor.getValue({}); + auto delta_attr = delta_tensor.getValue({}); + const int num_elements = GetLengthOfRange(start_attr.getValueAsDouble(), + limit_attr.getValueAsDouble(), + delta_attr.getValueAsDouble()); + return BuildConstRangeTensor(elem_type, num_elements, start_attr, + delta_attr); + } + } + + return nullptr; +} + +//===----------------------------------------------------------------------===// +// TransposeOp +//===----------------------------------------------------------------------===// + +namespace { + +// Computes the permutation of a constant `input_tensor` according to `perm`. +// The function recursively traverses the dimensions of the output tensor in +// a row-major order and writes the value in the output tensor into +// `new_values`. +void ComputePermutation(ElementsAttr input_tensor, ArrayRef perm, + ArrayRef output_shape, int num_dimensions, + int output_axis, std::vector *input_indices, + std::vector *new_values) { + // Refer to the implementation of `Transpose` function in + // tensorflow/lite/kernels/internal/reference/reference_ops.h + assert(output_axis < num_dimensions); + const int input_axis = perm[output_axis]; + for (int i = 0; i < output_shape[output_axis]; ++i) { + // Update the input indices on `input_axis`. + input_indices->at(input_axis) = i; + // Write the value from `input_tensor` if it is the last axis or + // recurse into the next axis. + const bool is_last_axis = output_axis == num_dimensions - 1; + if (is_last_axis) { + new_values->push_back(input_tensor.getValue(*input_indices)); + } else { + ComputePermutation(input_tensor, perm, output_shape, num_dimensions, + output_axis + 1, input_indices, new_values); + } + } +} + +} // namespace + +OpFoldResult TransposeOp::fold(ArrayRef operands) { + assert(operands.size() == 2); + auto input_tensor = operands[0].dyn_cast_or_null(); + auto perm_tensor = operands[1].dyn_cast_or_null(); + if (!input_tensor || !perm_tensor) return nullptr; + + // Do not try to fold elements attr of a quant type because + // DenseElementsAttr does not support it. 
+ if (!getType().cast().getElementType().isIntOrFloat()) + return nullptr; + + assert(perm_tensor.getType().getRank() == 1); + const int num_dimensions = input_tensor.getType().getRank(); + assert(perm_tensor.getType().getNumElements() == num_dimensions); + + ArrayRef input_shape = input_tensor.getType().getShape(); + auto output_type = getType().cast(); + + SmallVector perm; + SmallVector output_shape; + for (int i = 0; i < num_dimensions; ++i) { + perm.push_back( + perm_tensor.getValue({static_cast(i)}).getInt()); + output_shape.push_back(input_shape[perm[i]]); + + // Check that the derived output shape matches the static shape. + assert(!output_type.hasStaticShape() || + output_type.getShape()[i] == output_shape[i]); + } + + std::vector new_values; + new_values.reserve(input_tensor.getType().getNumElements()); + std::vector input_indices(num_dimensions); + ComputePermutation(input_tensor, perm, output_shape, num_dimensions, + /*output_axis=*/0, &input_indices, &new_values); + auto result_type = + RankedTensorType::get(output_shape, output_type.getElementType()); + return DenseElementsAttr::get(result_type, new_values); +} + //===----------------------------------------------------------------------===// // TableGen'd op method definitions //===----------------------------------------------------------------------===// @@ -575,5 +1069,16 @@ static LogicalResult Verify(UnpackOp op) { #define GET_OP_CLASSES #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.cc.inc" +Operation *TensorFlowLiteDialect::materializeConstant(OpBuilder &builder, + Attribute value, + Type type, Location loc) { + // If this is an opaque elements attribute or the result type doesn't match + // the attribute type, then generate a tfl.pseudo_const. + if (value.isa() || + (value.isa() && value.getType() != type)) + return builder.create(loc, type, value.cast()); + return nullptr; +} + } // namespace TFL } // namespace mlir diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.h b/tensorflow/compiler/mlir/lite/ir/tfl_ops.h index 5eac0511ab7..c60a17a24da 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.h +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.h @@ -28,6 +28,7 @@ limitations under the License. #include "mlir/Support/Functional.h" // TF:local_config_mlir #include "mlir/Support/LLVM.h" // TF:local_config_mlir #include "tensorflow/compiler/mlir/lite/ir/tfl_traits.h" +#include "tensorflow/compiler/mlir/lite/quantization/quantization_traits.h" #include "tensorflow/lite/schema/schema_generated.h" namespace mlir { @@ -36,6 +37,11 @@ namespace TFL { class TensorFlowLiteDialect : public Dialect { public: explicit TensorFlowLiteDialect(MLIRContext *context); + + // Registered hook to materialize a constant operation from a given attribute + // value with the desired resultant type. + Operation *materializeConstant(OpBuilder &builder, Attribute value, Type type, + Location loc) override; }; #define GET_OP_CLASSES diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td index 8c78f7a9dc8..458ff270e91 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td @@ -24,7 +24,7 @@ limitations under the License. 
include "mlir/IR/OpBase.td" #endif // OP_BASE -include "mlir/Dialect/QuantOps/QuantPredicates.td" +include "tensorflow/compiler/mlir/lite/quantization/quantization.td" def TFL_Dialect : Dialect { let name = "tfl"; @@ -95,49 +95,6 @@ def TFL_MirrorPaddingAttr : StrEnumAttr<"Padding", "Mirror pad enum", [ TFL_MIRRORPAD_Reflect, TFL_MIRRORPAD_Symmetric ]>; -//===----------------------------------------------------------------------===// -// Min-max range pair definitions. -//===----------------------------------------------------------------------===// - -// A pair of floating point values which defines the min and max of a value -// range for quantization. The attribute is allowed to be empty or -// have 2 elements. -def MinMaxAttr : Attr().size() == 0">, - CPred<"$_self.cast().size() == 2">]>, - "min-max range pair"> { - let storageType = [{ ArrayAttr }]; - let returnType = [{ ArrayRef }]; -} - -//===----------------------------------------------------------------------===// -// QuantizedType definitions. -//===----------------------------------------------------------------------===// - -// The base class of a quantized type. -class TFL_QuantizedType params, bit signed> - : Type()">, - CPred<"$_self.cast()" # - ".getStorageTypeIntegralWidth() == " # !head(params)>]>, - "Q" # !if (signed, "I", "UI") # !head(params) # " type"> { - string name = n; - string asTraitArgsStr = - StrJoinInt.result # !if(signed, ", true", ", false"); -} - -// Uniform quantized types. Two integers "smantissa" and "sexp" are used to -// express the Mantissa and Exponent components of the floating-point scale so -// the scale of the quantized type is "smantissa * 10 ^ sexp". -class TFL_UInt8UniformQuantizedType - : TFL_QuantizedType<"Uniform", - [8, zero_pt, smantissa, sexp, 0, 255], 0>; -class TFL_Int8UniformQuantizedType - : TFL_QuantizedType<"Uniform", - [8, zero_pt, smantissa, sexp, -128, 127], 1>; - -// 8-bits quantized types. The definitions can be used to specify tensor types. -def TFL_QUI8 : TFL_QuantizedType<"Uniform", [8], 0>; -def TFL_QI8 : TFL_QuantizedType<"Uniform", [8], 1>; - //===----------------------------------------------------------------------===// // TensorType attribute definitions. //===----------------------------------------------------------------------===// @@ -163,20 +120,12 @@ def TFL_IntTensor : TypeAlias; // This is used to represent the type of "ref tensors" or tensors that are // used as variables to track state. -// TODO(ashwinm): This is a placeholder until we have first class support -// for variables. def TFL_StatefulTensor : TypeAlias; // Tensor or None type. class TFL_TensorOfOrNone allowedTypes, string description = ""> : AnyTypeOf<[TensorOf, NoneType], description>; -// Type Constraint operand `idx`'s type is NOT `type`. -// TODO(b/131936589): Once this bug is fixed, we should be able to use -// Neg>> and can remove this. -class TFL_TCopIsNot : - NeggetType().isa<" # type # ">()">>; - def TFL_FpOrI32OrI64Tensor : TensorOf<[AnyFloat, TFL_Int32Or64]>; //===----------------------------------------------------------------------===// @@ -258,31 +207,11 @@ def TFL_ComparisonBinaryBuilder : OpBuilder< }]>; //===----------------------------------------------------------------------===// -// TFL native op traits (for quantization). -// -// Ops in this link should have those traits specified: -// https://www.tensorflow.org/lite/performance/quantization_spec -//===----------------------------------------------------------------------===// +// TFL native op trait for stateful operands. 
-// Specify this trait if the op has a fixed output value range. -class TFL_FixedResultScale : NativeOpTrait::Impl")>; +class StatefulOperands operands> + : ParamNativeOpTrait<"TFL::StatefulOperands", StrJoinInt.result>; -// Specify this trait if the op requires same inputs and outputs quantization -// scales. -def TFL_SameOperandsAndResultsScale : NativeOpTrait< - "TFL::SameOperandsAndResultsScale">; - -// Specify this trait if the b-th input of the op is a bias input, which needs -// a scale based on the scales of op1 and op2. -class TFL_AccumulatorUniformScale : NativeOpTrait< - !strconcat("TFL::AccumulatorUniformScale<", - StrJoinInt<[bias, op1, op2]>.result, - ">::Impl")>; - -// Specify this trait if the op doesn't have quantizable ouput. We shouldn't -// apply quantization on this op. -def TFL_NoQuantizableResult : NativeOpTrait<"TFL::NoQuantizableResult">; //===----------------------------------------------------------------------===// // TFL op base class. @@ -310,7 +239,7 @@ class TFL_Op traits = []> : } class TFL_ConvOp : - TFL_Op]> { + TFL_Op]> { let summary = opSummary # " operator"; let description = [{ @@ -325,7 +254,7 @@ class TFL_ConvOp : let arguments = ( ins AnyTensor:$input, AnyTensor:$filter, - AnyTensor:$bias, + TFL_TensorOfOrNone<[AnyType]>:$bias, I32Attr:$dilation_h_factor, I32Attr:$dilation_w_factor, TFL_AFAttr:$fused_activation_function, @@ -355,6 +284,8 @@ an output element, this operation computes \\(y = |x|\\). let arguments = (ins AnyTensor:$x); let results = (outs AnyTensor:$y); + + let hasFolder = 1; } def TFL_AddOp : TFL_Op<"add", [Broadcastable, NoSideEffect, Commutative]> { @@ -400,8 +331,35 @@ def TFL_AddNOp : TFL_Op<"add_n", [Commutative, NoSideEffect]> { ); } +def TFL_ReduceAnyOp : TFL_Op<"reduce_any", [NoSideEffect]> { + let summary = [{ +Computes the "logical or" of elements across dimensions of a tensor. + }]; + + let description = [{ +Reduces `input` along the dimensions given in `axis`. Unless +`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in +`axis`. If `keep_dims` is true, the reduced dimensions are +retained with length 1. + }]; + + let arguments = (ins + I1Tensor:$input, + I32Tensor:$reduction_indices, + + DefaultValuedAttr:$keep_dims + ); + + let results = (outs + I1Tensor:$output + ); + + let hasOptions = 1; + let customOption = "ReducerOptions"; +} + def TFL_AveragePool2DOp: - TFL_Op<"average_pool_2d", [NoSideEffect, TFL_SameOperandsAndResultsScale]> { + TFL_Op<"average_pool_2d", [NoSideEffect, SameOperandsAndResultsScale]> { let summary = "Average_pool_2d operator"; let description = [{ @@ -424,6 +382,32 @@ def TFL_AveragePool2DOp: let customOption = "Pool2DOptions"; } +def TFL_ArgMaxOp : TFL_Op<"arg_max", [NoSideEffect]> { + let summary = "ArgMax operator"; + + let description = [{ + Returns the index with the largest value across dimensions of a tensor. + }]; + + let arguments = ( + // TODO: Add support for uint8. + ins TensorOf<[F32, I32, I8]>:$input, + TFL_I32OrI64Tensor:$dim + ); + + let results = (outs + TFL_I32OrI64Tensor:$output + ); + + let hasOptions = 1; + + DerivedTFLiteTypeAttr output_type = DerivedTFLiteTypeAttr<[{ + return getResult()->getType().cast().getElementType(). + cast().getWidth() > 32 ? 
tflite::TensorType_INT64 : + tflite::TensorType_INT32; + }]>; +} + def TFL_ArgMinOp : TFL_Op<"arg_min", [NoSideEffect]> { let summary = "ArgMin operator"; @@ -443,6 +427,14 @@ def TFL_ArgMinOp : TFL_Op<"arg_min", [NoSideEffect]> { let results = (outs TFL_I32OrI64Tensor:$output ); + + let hasOptions = 1; + + DerivedTFLiteTypeAttr output_type = DerivedTFLiteTypeAttr<[{ + return getResult()->getType().cast().getElementType(). + cast().getWidth() > 32 ? tflite::TensorType_INT64 : + tflite::TensorType_INT32; + }]>; } def TFL_CeilOp: TFL_Op<"ceil", [NoSideEffect, SameOperandsAndResultType]> { @@ -462,7 +454,7 @@ def TFL_ConcatenationOp : TFL_Op<"concatenation", NoSideEffect, PredOpTrait<"values and output must have same element type", TFL_TCresVTEtIsSameAsOp<0, 0>>, - TFL_SameOperandsAndResultsScale + SameOperandsAndResultsScale ]> { let summary = "Concatenation operator"; @@ -472,14 +464,14 @@ def TFL_ConcatenationOp : TFL_Op<"concatenation", let arguments = ( ins Variadic>:$values, + [F32, I64, I32, I16, I8, QI8, QUI8, TFL_Uint8]>>:$values, I32Attr:$axis, TFL_AFAttr:$fused_activation_function ); let results = (outs TensorOf< - [F32, I64, I32, I16, I8, TFL_QI8, TFL_QUI8, TFL_Uint8]>:$output + [F32, I64, I32, I16, I8, QI8, QUI8, TFL_Uint8]>:$output ); let hasOptions = 1; @@ -500,6 +492,8 @@ def TFL_ConstOp : Op; @@ -514,6 +508,8 @@ def TFL_CosOp: TFL_Op<"cos", [NoSideEffect, SameOperandsAndResultType]> { let arguments = (ins TFL_FpTensor:$x); let results = (outs TFL_FpTensor:$y); + + let hasFolder = 1; } def TFL_DepthwiseConv2DOp : @@ -532,13 +528,14 @@ def TFL_FullyConnectedOptionsWeightFormatAttr : // TODO(jpienaar): Update post discussion on semantics of FC OP. // TODO(jpienaar): Include more shape verification. -def TFL_FullyConnectedOp : TFL_Op<"fully_connected", [NoSideEffect]> { +def TFL_FullyConnectedOp : TFL_Op<"fully_connected", [ + NoSideEffect, AccumulatorUniformScale<2, 0, 1>]> { let summary = "Fully connected op"; let arguments = (ins - TensorOf<[F32]>:$input, - TensorOf<[F32]>:$filter, - TFL_TensorOfOrNone<[F32]>:$bias, + TensorOf<[F32, QI8, QUI8, QI16, QUI16]>:$input, + TensorOf<[F32, QI8, QUI8, QI16, QUI16]>:$filter, + TFL_TensorOfOrNone<[F32, QI32, QUI32]>:$bias, TFL_AFAttr:$fused_activation_function, TFL_FullyConnectedOptionsWeightFormatAttr:$weights_format, @@ -547,7 +544,7 @@ def TFL_FullyConnectedOp : TFL_Op<"fully_connected", [NoSideEffect]> { // Depending on the weights format, this op can have one or two outputs. let results = (outs - Variadic>:$output + Variadic>:$output ); let hasOptions = 1; @@ -555,6 +552,7 @@ def TFL_FullyConnectedOp : TFL_Op<"fully_connected", [NoSideEffect]> { def TFL_GatherOp : TFL_Op<"gather", [ NoSideEffect, + SameOperandsAndResultsScale, TFL_OperandHasAtleastRank<0, 1>, PredOpTrait<"params and output must have same element type", TCresVTEtIsSameAsOp<0, 0>> @@ -566,7 +564,7 @@ def TFL_GatherOp : TFL_Op<"gather", [ }]; let arguments = (ins - TensorOf<[F32, I8, I32, I64, TFL_Str]>:$params, + TensorOf<[F32, I8, I32, I64, TFL_Str, QI8, QUI8]>:$params, TensorOf<[I32, I64]>:$indices, I32Attr:$axis ); @@ -579,7 +577,7 @@ def TFL_GatherOp : TFL_Op<"gather", [ ]; let results = (outs - TensorOf<[F32, I16, I32, I64, TFL_Str]>:$output + TensorOf<[F32, I16, I32, I64, TFL_Str, QI8, QUI8]>:$output ); let hasOptions = 1; @@ -592,19 +590,19 @@ def TFL_GatherNdOp : TFL_Op<"gather_nd", [NoSideEffect]> { Gather slices from `params` into a Tensor with shape specified by `indices`. }]; - // TODO: missing Uint8. 
let arguments = (ins - TensorOf<[F32, I8, I64, I32]>:$params, + TensorOf<[F32, I8, I64, I32, TFL_Uint8]>:$params, TFL_I32OrI64Tensor:$indices ); let results = (outs - TensorOf<[F32, I8, I64, I32]>:$output + TensorOf<[F32, I8, I64, I32, TFL_Uint8]>:$output ); } // Same type check of lhs and rhs is handled by the Broadcastable trait. -def TFL_LessEqualOp : TFL_Op<"less_equal", [Broadcastable, NoSideEffect]> { +def TFL_LessEqualOp : TFL_Op<"less_equal", [ + Broadcastable, NoSideEffect, NoQuantizableResult]> { let summary = "Less_equal operator"; let description = [{ @@ -612,8 +610,8 @@ def TFL_LessEqualOp : TFL_Op<"less_equal", [Broadcastable, NoSideEffect]> { }]; let arguments = ( - ins TensorOf<[F32, I32, I64, I8, TFL_Uint8]>:$lhs, - TensorOf<[F32, I32, I64, I8, TFL_Uint8]>:$rhs); + ins TensorOf<[F32, I32, I64, I8, QI8, QUI8, TFL_Uint8]>:$lhs, + TensorOf<[F32, I32, I64, I8, QI8, QUI8, TFL_Uint8]>:$rhs); let results = (outs TFL_BoolTensor:$output); @@ -645,7 +643,7 @@ convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imag }]; let arguments = (ins - TensorOf<[F32]>:$input, + TensorOf<[F32, QI8, QUI8]>:$input, I32Attr:$radius, F32Attr:$bias, F32Attr:$alpha, @@ -653,13 +651,14 @@ convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imag ); let results = (outs - TensorOf<[F32]>:$output + TensorOf<[F32, QI8, QUI8]>:$output ); let hasOptions = 1; } -def TFL_GreaterEqualOp : TFL_Op<"greater_equal", [Broadcastable, NoSideEffect]> { +def TFL_GreaterEqualOp : TFL_Op<"greater_equal", [ + Broadcastable, NoSideEffect, NoQuantizableResult]> { let summary = "Greater_equal operator"; let description = [{ @@ -682,7 +681,7 @@ def TFL_GreaterEqualOp : TFL_Op<"greater_equal", [Broadcastable, NoSideEffect]> } def TFL_NotEqualOp : TFL_Op<"not_equal", [ - Broadcastable, Commutative, NoSideEffect, TFL_NoQuantizableResult]> { + Broadcastable, Commutative, NoSideEffect, NoQuantizableResult]> { let summary = "Not_equal operator"; let description = [{ @@ -747,8 +746,28 @@ def TFL_EluOp: TFL_Op<"elu", [NoSideEffect, SameOperandsAndResultType]> { let hasOptions = 0; } +def TFL_EmbeddingLookupOp: TFL_Op<"embedding_lookup", + [NoSideEffect, + PredOpTrait<"value and output must have same element type", + TCresVTEtIsSameAsOp<0, 1>> + ]> { + let summary = "Embedding lookup operator"; + + let description = [{ + Looks up ids in a list of embedding tensors. + }]; + + let arguments = (ins + TensorOf<[I32]>:$lookup, + TensorOf<[F32, I8, TFL_Uint8]>:$value + ); + + let results = (outs TensorOf<[F32, I8, TFL_Uint8]>:$output); +} + def TFL_EqualOp: TFL_Op<"equal", [Commutative, Broadcastable, - PredOpTrait<"Operands have same value type", TCopVTEtIsSameAs<0, 1>>]> { + NoQuantizableResult, + PredOpTrait<"Operands have same value type", TCopVTEtIsSameAs<0, 1>>]> { let summary = "Equal operator"; let description = [{ @@ -757,8 +776,8 @@ def TFL_EqualOp: TFL_Op<"equal", [Commutative, Broadcastable, let arguments = ( ins - TensorOf<[I1, F32, I32, I64, I8, TFL_Uint8]>:$x, - TensorOf<[I1, F32, I32, I64, I8, TFL_Uint8]>:$y + TensorOf<[I1, F32, I32, I64, I8, QI8, QUI8, TFL_Uint8]>:$x, + TensorOf<[I1, F32, I32, I64, I8, QI8, QUI8, TFL_Uint8]>:$y ); let results = (outs TFL_BoolTensor:$output); @@ -773,9 +792,9 @@ def TFL_ExpOp: TFL_Op<"exp", [NoSideEffect, SameOperandsAndResultType]> { Performs element-wise natural exponentiation operation on input. 
}]; - let arguments = (ins AnyTensor:$x); + let arguments = (ins TFL_FpTensor:$x); - let results = (outs AnyTensor:$y); + let results = (outs TFL_FpTensor:$y); let hasOptions = 0b1; } @@ -825,7 +844,7 @@ size 1. } def TFL_SqueezeOp: TFL_Op<"squeeze", [NoSideEffect, - TFL_SameOperandsAndResultsScale]> { + SameOperandsAndResultsScale]> { let summary = "Removes dimensions of size 1 from the shape of a tensor."; let description = [{ @@ -917,17 +936,15 @@ def TFL_FloorModOp : TFL_Op<"floor_mod", [Broadcastable, NoSideEffect]> { }]; let arguments = ( - ins AnyTensor:$lhs, - AnyTensor:$rhs); + ins TensorOf<[I32, I64, F32]>:$lhs, + TensorOf<[I32, I64, F32]>:$rhs); - let results = (outs AnyTensor:$output); + let results = (outs TensorOf<[I32, I64, F32]>:$output); - let parser = [{ return mlir::impl::parseBinaryOp(parser, result); }]; - - let printer = [{ return mlir::impl::printBinaryOp(getOperation(), p); }]; + let builders = [TFL_BroadcastableBinaryBuilder]; } -def TFL_GreaterOp : TFL_Op<"greater", [NoSideEffect, TFL_NoQuantizableResult]> { +def TFL_GreaterOp : TFL_Op<"greater", [NoSideEffect, NoQuantizableResult]> { let summary = "Greater operator"; let description = [{ @@ -962,6 +979,25 @@ def TFL_InputOp : Op { let results = (outs AnyTensor:$output); } +def TFL_L2NormalizationOp : TFL_Op<"l2_normalization", [NoSideEffect]> { + let summary = "L2 Normalize Operator"; + + let description = [{ + L2Normalization Op + }]; + + let arguments = (ins + TensorOf<[F32, QUI8, QI8, I8]>:$input, + TFL_AFAttr:$fused_activation_function + ); + + let results = (outs TensorOf<[F32, QUI8, QI8, I8]>:$output); + + let hasOptions = 1; + + let customOption = "L2NormOptions"; +} + def TFL_LeakyReluOp: TFL_Op<"leaky_relu", [NoSideEffect, SameOperandsAndResultType]> { let summary = "Leaky Relu operator"; @@ -983,7 +1019,7 @@ def TFL_LeakyReluOp: TFL_Op<"leaky_relu", [NoSideEffect, SameOperandsAndResultTy let hasOptions = 0b1; } -def TFL_LessOp : TFL_Op<"less", [NoSideEffect, TFL_NoQuantizableResult]> { +def TFL_LessOp : TFL_Op<"less", [NoSideEffect, NoQuantizableResult]> { let summary = "Less operator"; let description = [{ @@ -1051,6 +1087,24 @@ def TFL_LogicalOrOp : TFL_Op<"logical_or", [NoSideEffect]> { let printer = [{ return mlir::impl::printBinaryOp(getOperation(), p); }]; } +def TFL_LogisticOp: TFL_Op<"logistic", [ + NoSideEffect, + SameOperandsAndResultShape, + // zero_point = 0 + // scale = 1. / (max_value + 1) + FixedResultScale>, + FixedResultScale>]> { + let summary = "Logistic operator"; + + let description = [{ + Computes element-wise Sigmoid of input + }]; + + let arguments = (ins TensorOf<[AnyFloat, QI8, QUI8]>:$x); + + let results = (outs TensorOf<[AnyFloat, QI8, QUI8]>:$y); +} + def TFL_LogOp: TFL_Op<"log", [NoSideEffect, SameOperandsAndResultType]> { let summary = "Natural logarithm operator"; @@ -1061,6 +1115,8 @@ def TFL_LogOp: TFL_Op<"log", [NoSideEffect, SameOperandsAndResultType]> { let arguments = (ins AnyTensor:$x); let results = (outs AnyTensor:$y); + + let hasFolder = 1; } // TODO(b/130643170): Adds some constraint for the input/output element types. 
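The FixedResultScale trait applied to ops such as tfl.logistic above expresses the fixed result scale as an (smantissa, sexp) pair, with scale = smantissa * 10 ^ sexp. A minimal sketch of a definition using that encoding follows; the op name and numeric values are illustrative assumptions, not taken from this patch. A result fixed to zero_point = 0 and scale = 1/256 is written with smantissa = 390625 and sexp = -8, since 390625 * 10^-8 = 0.00390625 = 1/256.

def TFL_ExampleFixedScaleOp : TFL_Op<"example_fixed_scale", [
    NoSideEffect,
    SameOperandsAndResultShape,
    // zero_point = 0
    // scale = 1. / (max_value + 1) = 1 / 256 = 390625 * 10 ^ -8
    FixedResultScale<UInt8UniformQuantizedType<0, 390625, -8>>]> {
  let summary = "Example operator with a fixed quantized result scale";

  let arguments = (ins TensorOf<[AnyFloat, QUI8]>:$x);

  let results = (outs TensorOf<[AnyFloat, QUI8]>:$y);
}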
@@ -1069,8 +1125,8 @@ def TFL_LogSoftmaxOp : TFL_Op<"log_softmax", [ SameOperandsAndResultShape, // zero_point = max_value // scale = -log_softmax_output_min / (max_value + 1) - TFL_FixedResultScale>, - TFL_FixedResultScale>]> { + FixedResultScale>, + FixedResultScale>]> { let summary = "Log softmax operator"; let description = [{ @@ -1096,13 +1152,13 @@ def MaxPoolOperandAndResultConstraints : PredOpTrait<"MaxPool2D operand and " And<[ // The input and output tensors should have the same elemental type // and they should be one of the specified types below. - TCopVTEtIs<0, AnyTypeOf<[F32, TFL_QI8, TFL_QUI8]>>, + TCopVTEtIs<0, AnyTypeOf<[F32, QI8, QUI8]>>, TFL_TCresVTEtIsSameAsOp<0, 0>]>>; def TFL_MaxPool2DOp : TFL_Op<"max_pool_2d", [ NoSideEffect, MaxPoolOperandAndResultConstraints, - TFL_SameOperandsAndResultsScale]> { + SameOperandsAndResultsScale]> { let summary = "Max Pool 2D op"; let description = [{ @@ -1129,25 +1185,28 @@ def TFL_MaxPool2DOp : TFL_Op<"max_pool_2d", [ let customOption = "Pool2DOptions"; } -def TFL_MaximumOp : TFL_Op<"maximum", [Broadcastable, NoSideEffect, Commutative]> { +def TFL_MaximumOp : TFL_Op<"maximum", [ + Broadcastable, NoSideEffect, Commutative, SameOperandsAndResultsScale]> { let summary = "Max operator"; let description = [{ Element-wise max operation. }]; let arguments = ( - ins TFL_FpOrI32OrI64Tensor:$lhs, - TFL_FpOrI32OrI64Tensor:$rhs + ins TensorOf<[AnyFloat, TFL_Int32Or64, QI8, QUI8]>:$lhs, + TensorOf<[AnyFloat, TFL_Int32Or64, QI8, QUI8]>:$rhs ); - let results = (outs TFL_FpOrI32OrI64Tensor:$max); + let results = (outs + TensorOf<[AnyFloat, TFL_Int32Or64, QI8, QUI8]>:$max + ); let builders = [TFL_BroadcastableBinaryBuilder]; let hasOptions = 0; } -def TFL_MeanOp : TFL_Op<"mean", [NoSideEffect]> { +def TFL_MeanOp : TFL_Op<"mean", [NoSideEffect, SameOperandsAndResultsScale]> { let summary = "Mean operator"; let description = [{ @@ -1159,12 +1218,13 @@ def TFL_MeanOp : TFL_Op<"mean", [NoSideEffect]> { }]; let arguments = (ins - TensorOf<[F32, I8, I32, I64, TFL_Uint8]>:$input, + TensorOf<[F32, I8, I32, I64, QI8, QUI8, TFL_Uint8]>:$input, TensorOf<[I32, I64]>:$axis, BoolAttr:$keep_dims ); - let results = (outs TensorOf<[F32, I32, I64, I8]>:$output); + let results = (outs + TensorOf<[F32, I32, I64, I8, QI8, QUI8, TFL_Uint8]>:$output); let hasOptions = 1; let customOption = "ReducerOptions"; @@ -1198,7 +1258,24 @@ def TFL_OneHotOp : TFL_Op<"one_hot", [NoSideEffect]> { let hasOptions = 1; } -def TFL_SliceOp : TFL_Op<"slice", [NoSideEffect]> { +def TFL_RoundOp: TFL_Op<"round", [NoSideEffect, SameOperandsAndResultType]> { + let summary = "Round operator"; + + let description = [{ +Rounds the values of a tensor to the nearest integer, element-wise. + }]; + + let arguments = (ins + TensorOf<[F32]>:$x + ); + + let results = (outs + TensorOf<[F32]>:$y + ); +} + +def TFL_SliceOp : TFL_Op<"slice", [ + NoSideEffect, SameOperandsAndResultsScale]> { let summary = "Return a slice from 'input'."; let description = [{ @@ -1206,6 +1283,11 @@ The output tensor is a tensor with dimensions described by 'size' whose values are extracted from 'input' starting at the offsets in 'begin'. +`begin` is zero-based; `size` is one-based. If size[i] is -1, all remaining +elements in dimension i are included in the slice. 
In other words, this is +equivalent to setting: + size[i] = input.dim_size(i) - begin[i] + *Requirements*: 0 <= begin[i] <= begin[i] + size[i] <= Di for i in [0, n) }]; @@ -1219,6 +1301,8 @@ whose values are extracted from 'input' starting at the offsets in let results = (outs AnyTensor:$output ); + + let verifier = [{ return Verify(*this); }]; } def TFL_SumOp: TFL_Op<"sum", [NoSideEffect]> { @@ -1230,7 +1314,7 @@ def TFL_SumOp: TFL_Op<"sum", [NoSideEffect]> { let arguments = (ins AnyTensor:$input, - TFL_I32OrI64Tensor:$axes, + I32Tensor:$axes, BoolAttr:$keep_dims ); @@ -1249,7 +1333,7 @@ def TFL_ReduceMinOp: TFL_Op<"reduce_min", [NoSideEffect]> { let arguments = (ins AnyTensor:$input, - TFL_I32OrI64Tensor:$axes, + I32Tensor:$axes, BoolAttr:$keep_dims ); @@ -1268,7 +1352,7 @@ def TFL_ReduceMaxOp: TFL_Op<"reduce_max", [NoSideEffect]> { let arguments = (ins AnyTensor:$input, - TFL_I32OrI64Tensor:$axes, + I32Tensor:$axes, BoolAttr:$keep_dims ); @@ -1287,7 +1371,7 @@ def TFL_ReduceProdOp: TFL_Op<"reduce_prod", [NoSideEffect]> { let arguments = (ins TensorOf<[F32, I8, I32, I64]>:$input, - TFL_I32OrI64Tensor:$axes, + I32Tensor:$axes, BoolAttr:$keep_dims ); @@ -1297,18 +1381,21 @@ def TFL_ReduceProdOp: TFL_Op<"reduce_prod", [NoSideEffect]> { let customOption = "ReducerOptions"; } -def TFL_MinimumOp : TFL_Op<"minimum", [Broadcastable, NoSideEffect, Commutative]> { +def TFL_MinimumOp : TFL_Op<"minimum", [ + Broadcastable, NoSideEffect, Commutative, SameOperandsAndResultsScale]> { let summary = "Min operator"; let description = [{ Element-wise min operation. }]; let arguments = ( - ins TFL_FpOrI32OrI64Tensor:$lhs, - TFL_FpOrI32OrI64Tensor:$rhs + ins TensorOf<[AnyFloat, TFL_Int32Or64, QI8, QUI8]>:$lhs, + TensorOf<[AnyFloat, TFL_Int32Or64, QI8, QUI8]>:$rhs ); - let results = (outs TFL_FpOrI32OrI64Tensor:$min); + let results = (outs + TensorOf<[AnyFloat, TFL_Int32Or64, QI8, QUI8]>:$min + ); let builders = [TFL_BroadcastableBinaryBuilder]; @@ -1402,6 +1489,7 @@ def TFL_PackOp : TFL_Op<"pack", [NoSideEffect]> { def TFL_PadOp : TFL_Op<"pad", [ NoSideEffect, + SameOperandsAndResultsScale, TFL_OperandHasRank<1, 2>, TFL_OperandRankEquals1DimOfOperand<0, 1>]> { let summary = "Padding operator"; @@ -1431,16 +1519,17 @@ def TFL_PadOp : TFL_Op<"pad", [ }]; let arguments = ( - ins TensorOf<[F32, I8, I32, I64]>:$input, + ins TensorOf<[F32, I8, I32, I64, QI8, QUI8]>:$input, TFL_I32OrI64Tensor:$padding); - let results = (outs TensorOf<[F32, I8, I32, I64]>:$output); + let results = (outs TensorOf<[F32, I8, I32, I64, QI8, QUI8]>:$output); let hasOptions = 1; } def TFL_PadV2Op : TFL_Op<"padv2", [ NoSideEffect, + SameOperandsAndResultsScale, TFL_OperandHasRank<1, 2>, TFL_OperandHasRank<2, 0>, TFL_OperandRankEquals1DimOfOperand<0, 1>, @@ -1475,11 +1564,11 @@ def TFL_PadV2Op : TFL_Op<"padv2", [ }]; let arguments = ( - ins TensorOf<[F32, I8, I32, I64]>:$input, + ins TensorOf<[F32, I8, I32, I64, QI8, QUI8]>:$input, TFL_I32OrI64Tensor:$padding, TensorOf<[F32, I8, I32, I64]>:$constant_values); - let results = (outs TensorOf<[F32, I8, I32, I64]>:$output); + let results = (outs TensorOf<[F32, I8, I32, I64, QI8, QUI8]>:$output); let hasOptions = 1; } @@ -1511,9 +1600,13 @@ def TFL_RankOp: TFL_Op<"rank", [NoSideEffect]> { let arguments = (ins AnyTensor:$input); let results = (outs TFL_IntTensor:$output); + + let hasFolder = 1; } -def TFL_ReluOp: TFL_Op<"relu", [NoSideEffect, SameOperandsAndResultType]> { +def TFL_ReluOp: TFL_Op<"relu", [NoSideEffect, + SameOperandsAndResultShape, + SameOperandsAndResultsScale]> { let summary = 
"Relu operator"; let description = [{ @@ -1526,7 +1619,9 @@ def TFL_ReluOp: TFL_Op<"relu", [NoSideEffect, SameOperandsAndResultType]> { let results = (outs AnyTensor:$y); } -def TFL_Relu6Op: TFL_Op<"relu6", [NoSideEffect, SameOperandsAndResultType]> { +def TFL_Relu6Op: TFL_Op<"relu6", [NoSideEffect, + SameOperandsAndResultShape, + SameOperandsAndResultsScale]> { let summary = "Relu6 operator"; let description = [{ @@ -1540,7 +1635,7 @@ def TFL_Relu6Op: TFL_Op<"relu6", [NoSideEffect, SameOperandsAndResultType]> { } def TFL_ReshapeOp: TFL_Op<"reshape", [ - NoSideEffect, TFL_SameOperandsAndResultsScale]> { + NoSideEffect, SameOperandsAndResultsScale]> { let summary = "Reshape operator"; let description = [{ @@ -1577,9 +1672,8 @@ slice `i`, with the first `seq_lengths[i]` slices along dimension `seq_dim` reversed. }]; - // Missing Uint8. let arguments = (ins - TensorOf<[F32, I16, I32, I64]>:$input, + TensorOf<[F32, I16, I32, I64, TFL_Uint8]>:$input, TFL_I32OrI64Tensor:$seq_lengths, I32Attr:$seq_dim, @@ -1587,7 +1681,7 @@ slice `i`, with the first `seq_lengths[i]` slices along dimension ); let results = (outs - TensorOf<[F32, I16, I32, I64]>:$output + TensorOf<[F32, I16, I32, I64, TFL_Uint8]>:$output ); let hasOptions = 1; @@ -1603,9 +1697,11 @@ def TFL_RsqrtOp: TFL_Op<"rsqrt", [NoSideEffect, SameOperandsAndResultType]> { let arguments = (ins AnyTensor:$x); let results = (outs AnyTensor:$y); + + let hasFolder = 1; } -def TFL_ShapeOp: TFL_Op<"shape", [NoSideEffect, TFL_NoQuantizableResult]> { +def TFL_ShapeOp: TFL_Op<"shape", [NoSideEffect, NoQuantizableResult]> { let summary = "Shape operator"; let description = [{ @@ -1623,20 +1719,12 @@ def TFL_ShapeOp: TFL_Op<"shape", [NoSideEffect, TFL_NoQuantizableResult]> { let hasOptions = 1; } -def TFL_LogisticOp: TFL_Op<"logistic", [NoSideEffect, SameOperandsAndResultType]> { - let summary = "Logistic operator"; - - let description = [{ - Computes element-wise Sigmoid of input - }]; - - let arguments = (ins TFL_FpTensor:$x); - - let results = (outs TFL_FpTensor:$y); -} - // TODO(jpienaar): Flesh this out. -def TFL_RangeOp: TFL_Op<"range", [NoSideEffect]> { +def TFL_RangeOp: TFL_Op<"range", [NoSideEffect, TFL_OperandHasRank<0, 0>, + TFL_OperandHasRank<1, 0>, TFL_OperandHasRank<2, 0>, + PredOpTrait<"operands and output must have same element type", + And<[TCresVTEtIsSameAsOp<0, 0>, TCresVTEtIsSameAsOp<0, 1>, + TCresVTEtIsSameAsOp<0, 2>]>>]> { let summary = "Range operator"; let description = [{ @@ -1650,6 +1738,8 @@ def TFL_RangeOp: TFL_Op<"range", [NoSideEffect]> { AnyTensor:$delta); let results = (outs AnyTensor:$result); + + let hasFolder = 1; } def TFL_ReverseV2Op: TFL_Op<"reverse_v2", @@ -1703,9 +1793,8 @@ def TFL_SelectOp : TFL_Op<"select", [NoSideEffect, let arguments = (ins TFL_BoolTensor:$condition, - // TODO: Missing uint8. - TensorOf<[F32, I1, I8, I16, I32, I64]>:$x, - TensorOf<[F32, I1, I8, I16, I32, I64]>:$y); + TensorOf<[F32, I1, I8, I16, I32, I64, TFL_Uint8]>:$x, + TensorOf<[F32, I1, I8, I16, I32, I64, TFL_Uint8]>:$y); let results = (outs AnyTensor:$output); // TODO(jpienaar): autogenerate this. @@ -1730,6 +1819,8 @@ def TFL_SinOp: TFL_Op<"sin", [NoSideEffect, SameOperandsAndResultType]> { let arguments = (ins TFL_FpTensor:$x); let results = (outs TFL_FpTensor:$y); + + let hasFolder = 1; } // TODO(b/130643170): Adds some constraint for the input/output element types. @@ -1738,8 +1829,8 @@ def TFL_SoftmaxOp : TFL_Op<"softmax", [ SameOperandsAndResultShape, // zero_point = 0 // scale = 1. 
/ (max_value + 1) - TFL_FixedResultScale>, - TFL_FixedResultScale>]> { + FixedResultScale>, + FixedResultScale>]> { let summary = "Softmax operator"; let description = [{ @@ -1765,9 +1856,11 @@ def TFL_SqrtOp: TFL_Op<"sqrt", [NoSideEffect, SameOperandsAndResultType]> { Computes element-wise Square root of input }]; - let arguments = (ins AnyTensor:$x); + let arguments = (ins TFL_FpTensor:$x); - let results = (outs AnyTensor:$y); + let results = (outs TFL_FpTensor:$y); + + let hasFolder = 1; } def TFL_SquareOp: TFL_Op<"square", [NoSideEffect, SameOperandsAndResultType]> { @@ -1777,11 +1870,13 @@ def TFL_SquareOp: TFL_Op<"square", [NoSideEffect, SameOperandsAndResultType]> { Computes element-wise Square of input }]; - let arguments = (ins AnyTensor:$x); + let arguments = (ins TensorOf<[AnyFloat, QI8, QUI8]>:$x); - let results = (outs AnyTensor:$y); + let results = (outs TensorOf<[AnyFloat, QI8, QUI8]>:$y); let hasOptions = 0b1; + + let hasFolder = 1; } def TFL_SubOp : TFL_Op<"sub", [Broadcastable, NoSideEffect]> { @@ -1833,22 +1928,21 @@ def TFL_SquaredDifferenceOp : TFL_Op<"squared_difference", [Broadcastable, NoSid def TFL_TanhOp: TFL_Op<"tanh", [ NoSideEffect, - SameOperandsAndResultType, + SameOperandsAndResultShape, // central_value = min_value / 2 + (max_value - 1) / 2 + 1 // zero_point = central_value // scale = 1. / (central_value - min_value) - TFL_FixedResultScale>, - TFL_FixedResultScale>]> { + FixedResultScale>, + FixedResultScale>]> { let summary = "Hyperbolic tangent operator"; let description = [{ Computes element-wise Hyperbolic tangent of input }]; - // TODO(haoliang): missing Uint8. - let arguments = (ins TensorOf<[F32, I16, I8]>:$x); + let arguments = (ins TensorOf<[F32, I16, I8, QI8, QUI8, TFL_Uint8]>:$x); - let results = (outs TensorOf<[F32, I16, I8]>:$y); + let results = (outs TensorOf<[F32, I16, I8, QI8, QUI8, TFL_Uint8]>:$y); } def TFL_TileOp: TFL_Op<"tile", [NoSideEffect, @@ -1865,9 +1959,11 @@ def TFL_TileOp: TFL_Op<"tile", [NoSideEffect, For example, tiling [a b c d] by [2] produces [a b c d a b c d]. }]; - let arguments = (ins AnyTensor:$input, TFL_I32OrI64Tensor:$multiples); + let arguments = (ins + TensorOf<[F32, I1, I32, I64, TFL_Uint8]>:$input, + TFL_I32OrI64Tensor:$multiples); - let results = (outs AnyTensor:$output); + let results = (outs TensorOf<[F32, I1, I32, I64, TFL_Uint8]>:$output); let hasOptions = 0; } @@ -1887,8 +1983,7 @@ def TFL_TopKV2Op: TFL_Op<"topk_v2", [NoSideEffect, TFL_OperandHasRank<1,0>, }]; let arguments = (ins - // TODO: Missing uint8 - TensorOf<[F32, I8, I32, I64]>:$input, + TensorOf<[F32, I8, I32, I64, TFL_Uint8]>:$input, I32Tensor:$k); let results = (outs @@ -1906,11 +2001,13 @@ def TFL_TopKV2Op: TFL_Op<"topk_v2", [NoSideEffect, TFL_OperandHasRank<1,0>, // dimensions. def TFL_TransposeOp : TFL_Op<"transpose", [NoSideEffect, + TFL_OperandHasRank<1,1>, // TODO(jpienaar): these are only true dynamically, change so that it works // with unknowns. 
- // TFL_OperandHasRank<1,1>, // TFL_OperandRankEquals1DimOfOperand<0, 1>, - TFL_SameOperandsAndResultsScale]> { + PredOpTrait<"input and output must have same element type", + TCresVTEtIsSameAsOp<0, 0>>, + SameOperandsAndResultsScale]> { let summary = "Transpose operator"; let description = [{ @@ -1919,12 +2016,14 @@ def TFL_TransposeOp : TFL_Op<"transpose", let arguments = ( ins AnyTensor:$x, - AnyTensor:$perm + TensorOf<[I32]>:$perm ); let results = (outs AnyTensor:$y ); + + let hasFolder = 1; } def TFL_UnpackOp : TFL_Op<"unpack", [NoSideEffect]> { @@ -1948,14 +2047,14 @@ def TFL_UnpackOp : TFL_Op<"unpack", [NoSideEffect]> { }]; let arguments = (ins - TensorOf<[F32, I8, I32]>:$input, + TensorOf<[F32, I8, I32, QI8, QUI8]>:$input, I32Attr:$num, I32Attr:$axis ); let results = (outs - Variadic>:$outputs + Variadic>:$outputs ); let verifier = [{ return Verify(*this); }]; @@ -1979,6 +2078,7 @@ def TFL_ZerosLikeOp: TFL_Op<"zeros_like", [NoSideEffect]> { def TFL_BatchToSpaceNdOp: TFL_Op<"batch_to_space_nd", [ NoSideEffect, + SameOperandsAndResultsScale, PredOpTrait<"input and output must have same element type", TCresVTEtIsSameAsOp<0, 0>> ]> { @@ -1989,18 +2089,19 @@ def TFL_BatchToSpaceNdOp: TFL_Op<"batch_to_space_nd", [ }]; let arguments = (ins - TensorOf<[F32, I8, I32, I64]>:$input, + TensorOf<[F32, I8, I32, I64, QI8, QUI8]>:$input, TensorOf<[I32]>:$block_shape, TensorOf<[I32]>:$indices ); let results = (outs - TensorOf<[F32, I16, I32, I64]>:$output + TensorOf<[F32, I16, I32, I64, QI8, QUI8]>:$output ); } def TFL_SpaceToBatchNdOp: TFL_Op<"space_to_batch_nd", [ NoSideEffect, + SameOperandsAndResultsScale, PredOpTrait<"input and output must have same element type", TCresVTEtIsSameAsOp<0, 0>> ]> { @@ -2011,17 +2112,76 @@ def TFL_SpaceToBatchNdOp: TFL_Op<"space_to_batch_nd", [ }]; let arguments = (ins - TensorOf<[F32, I8, I32, I64]>:$input, + TensorOf<[F32, I8, I32, I64, QI8, QUI8]>:$input, TensorOf<[I32]>:$block_shape, TensorOf<[I32]>:$paddings ); let results = (outs - TensorOf<[F32, I16, I32, I64]>:$output + TensorOf<[F32, I16, I32, I64, QI8, QUI8]>:$output ); } -def TFL_SplitOp : TFL_Op<"split", [NoSideEffect]> { +def TFL_SpaceToDepthOp: TFL_Op<"space_to_depth", [ + NoSideEffect, + SameOperandsAndResultsScale, + PredOpTrait<"input and output must have same element type", + TCresVTEtIsSameAsOp<0, 0>> + ]> { + let summary = "SpaceToDepth operator"; + + let description = [{ + Rearranges blocks of spatial data, into depth. More specifically, + this op outputs a copy of the input tensor where values from the `height` + and `width` dimensions are moved to the `depth` dimension. + `block_size` indicates the input block size. + }]; + + let arguments = (ins + TensorOf<[F32, I8, I32, I64, TFL_Uint8, QUI8]>:$input, + I32Attr:$block_size + ); + + let results = (outs + TensorOf<[F32, I8, I32, I64, TFL_Uint8, QUI8]>:$output + ); + + let hasOptions = 1; +} + +def TFL_DepthToSpaceOp: TFL_Op<"depth_to_space", [ + NoSideEffect, + SameOperandsAndResultsScale, + PredOpTrait<"input and output must have same element type", + TCresVTEtIsSameAsOp<0, 0>> + ]> { + let summary = "DepthToSpace operator"; + + let description = [{ + Rearranges data from depth into blocks of spatial data. + This is the reverse transformation of SpaceToDepth. More specifically, + this op outputs a copy of the input tensor where values from the `depth` + dimension are moved in spatial blocks to the `height` and `width` + dimensions. The attr `block_size` indicates the input block size and how + the data is moved. 
+ }]; + + let arguments = (ins + TensorOf<[F32, I8, I32, I64, TFL_Uint8, QUI8]>:$input, + I32Attr:$block_size + ); + + let results = (outs + TensorOf<[F32, I8, I32, I64, TFL_Uint8, QUI8]>:$output + ); + + let hasOptions = 1; +} + +def Rank0I32Tensor : Type]>, + "tensor">; + +def TFL_SplitOp : TFL_Op<"split", [NoSideEffect, SameOperandsAndResultsScale]> { let summary = "Splits a tensor into `num_split` tensors along one dimension."; let description = [{ @@ -2031,19 +2191,21 @@ def TFL_SplitOp : TFL_Op<"split", [NoSideEffect]> { }]; let arguments = (ins - I32Tensor:$split_dim, - TensorOf<[F32, I16, I32, I64]>:$value, - I32Attr:$num_splits + Rank0I32Tensor:$split_dim, + TensorOf<[F32, I16, I32, I64, QI8, QUI8]>:$value, + PositiveI32Attr:$num_splits ); let results = (outs - Variadic>:$outputs + Variadic>:$outputs ); + let verifier = [{ return Verify(*this); }]; + let hasOptions = 1; } -def TFL_SplitVOp : TFL_Op<"split_v", [NoSideEffect]> { +def TFL_SplitVOp : TFL_Op<"split_v", [NoSideEffect, SameOperandsAndResultsScale]> { let summary = "Splits a tensor into `num_split` tensors along one dimension."; let description = [{ @@ -2053,20 +2215,21 @@ def TFL_SplitVOp : TFL_Op<"split_v", [NoSideEffect]> { }]; let arguments = (ins - TensorOf<[F32, I16, I32, I64]>:$value, + TensorOf<[F32, I16, I32, I64, QI8, QUI8]>:$value, I32Tensor:$size_splits, I32Tensor:$split_dim, I32Attr:$num_splits ); let results = (outs - Variadic>:$outputs + Variadic>:$outputs ); let hasOptions = 1; } -def TFL_ResizeBilinearOp: TFL_Op<"resize_bilinear", [NoSideEffect]> { +def TFL_ResizeBilinearOp: TFL_Op<"resize_bilinear", [ + NoSideEffect, SameOperandsAndResultsScale]> { let summary = "ResizeBilinear Op"; let description = [{ @@ -2075,22 +2238,82 @@ def TFL_ResizeBilinearOp: TFL_Op<"resize_bilinear", [NoSideEffect]> { let arguments = (ins // TODO(ycling): Support quantized types. - TensorOf<[F32, I32]>:$input, + TensorOf<[F32, I32, QI8, QUI8]>:$input, TensorOf<[I32]>:$size, BoolAttr:$align_corners); let results = (outs - TensorOf<[F32]>:$output + TensorOf<[F32, QI8, QUI8]>:$output ); let hasOptions = 1; } +def TFL_ResizeNearestNeighborOp : TFL_Op<"resize_nearest_neighbor", + [NoSideEffect, + SameOperandsAndResultsScale]> { + let summary = "ResizeNearestNeighbor Op"; + + let description = [{ + Resize `images` to `size` using nearest neighbor interpolation. + }]; + + let arguments = (ins + TensorOf<[F32, I8, TFL_Uint8, QUI8, QI8]>:$input, + TensorOf<[I32]>:$size, + BoolAttr:$align_corners + ); + + let results = (outs + TensorOf<[F32, I8, TFL_Uint8, QUI8, QI8]>:$output + ); + + let hasOptions = 1; +} + +def TFL_SparseToDenseOp : TFL_Op<"sparse_to_dense", [NoSideEffect]> { + let summary = "Converts a sparse representation into a dense tensor."; + + let description = [{ +Builds an array `dense` with shape `output_shape` such that + +``` +# If sparse_indices is scalar +dense[i] = (i == sparse_indices ? sparse_values : default_value) + +# If sparse_indices is a vector, then for each i +dense[sparse_indices[i]] = sparse_values[i] + +# If sparse_indices is an n by d matrix, then for each i in [0, n) +dense[sparse_indices[i][0], ..., sparse_indices[i][d-1]] = sparse_values[i] +``` + +All other values in `dense` are set to `default_value`. If `sparse_values` is a +scalar, all sparse indices are set to this single value. + +Indices should be sorted in lexicographic order, and indices must not +contain any repeats. If `validate_indices` is true, these properties +are checked during execution. 
+ }]; + + let arguments = (ins + TFL_I32OrI64Tensor:$sparse_indices, + TFL_I32OrI64Tensor:$output_shape, + TensorOf<[I32, I64, I8, TFL_Uint8, F32]>:$sparse_values, + TensorOf<[I32, I64, I8, TFL_Uint8, F32]>:$default_value + ); + + let results = (outs + TensorOf<[I32, I64, I8, TFL_Uint8, F32]>:$dense + ); +} + def TFL_StridedSliceOp: TFL_Op<"strided_slice", [ NoSideEffect, PredOpTrait<"input and output must have same element type", - TCresVTEtIsSameAsOp<0, 0>> + TFL_TCresVTEtIsSameAsOp<0, 0>>, + SameOperandsAndResultsScale ]> { let summary = "StridedSlice Op"; @@ -2099,7 +2322,7 @@ def TFL_StridedSliceOp: TFL_Op<"strided_slice", }]; let arguments = (ins - TensorOf<[F32, I32, I64, I8]>:$input, + TensorOf<[F32, I32, I64, I8, QI8, QUI8]>:$input, TensorOf<[I32]>:$begin, TensorOf<[I32]>:$end, TensorOf<[I32]>:$strides, @@ -2112,7 +2335,7 @@ def TFL_StridedSliceOp: TFL_Op<"strided_slice", ); let results = (outs - TensorOf<[F32, I32, I64, I8]>:$output + TensorOf<[F32, I32, I64, I8, QI8, QUI8]>:$output ); let hasOptions = 1; @@ -2207,7 +2430,7 @@ in the unique output `y`. In other words: // Quantization ops. //===----------------------------------------------------------------------===// def TFL_DequantizeOp: TFL_Op<"dequantize", [ - NoSideEffect, TFL_NoQuantizableResult]> { + NoSideEffect, NoQuantizableResult]> { let summary = "Dequantize operator"; let description = [{ @@ -2243,7 +2466,7 @@ def TFL_FakeQuantOp : TFL_Op<"fake_quant", [NoSideEffect]> { } def TFL_QConstOp : Op { + NoSideEffect, FirstAttrDerivedResultType, NoQuantizableResult]> { let summary = "Quantized constant pseudo op"; let description = [{ @@ -2261,7 +2484,7 @@ def TFL_QConstOp : Op { + NoSideEffect, FirstAttrDerivedResultType, NoQuantizableResult]> { let summary = "Quantize operator"; let description = [{ @@ -2305,7 +2528,7 @@ def LstmProjectionWeightBiasConstraint : PredOpTrait< "projection bias must not be specified", Or<[ And<[TCopVTEtIs<16, NoneType>, TCopVTEtIs<17, NoneType>]>, - TFL_TCopIsNot<16, NoneType>]>>; + Neg>]>>; // TODO(b/137798843): Need to add two additional constraints for both LSTM and // UnidirectionalSequenceLstm @@ -2327,7 +2550,8 @@ def TFL_LSTMOp : [LstmMandatoryInputsConstraint, LstmOptionalPeepholeWeightConstraint, LstmProjectionWeightBiasConstraint, - LstmResultConstraint]> { + LstmResultConstraint, + StatefulOperands<[18, 19]>]> { let summary = "The full lstm operator"; let description = [{ @@ -2405,9 +2629,11 @@ Ba et al. “Layer Normalization” let results = (outs AnyTensor:$output); let hasOptions = 1; + + let verifier = [{ return Verify(*this); }]; } -// UnidirectionalSequenceLstm op . +// UnidirectionalSequenceLstm op. // TODO(ashwinm): Add constraint to validate the combination of operands // that are valid for hybrid vs fully quantized vs float only semantics def TFL_UnidirectionalSequenceLSTMOp : @@ -2415,7 +2641,8 @@ def TFL_UnidirectionalSequenceLSTMOp : [LstmMandatoryInputsConstraint, LstmOptionalPeepholeWeightConstraint, LstmProjectionWeightBiasConstraint, - LstmResultConstraint]> { + LstmResultConstraint, + StatefulOperands<[18, 19]>]> { let summary = "Unidirectional sequence lstm operator"; let description = [{ @@ -2482,6 +2709,129 @@ def TFL_UnidirectionalSequenceLSTMOp : let results = (outs AnyTensor:$output); let hasOptions = 1; + + let verifier = [{ return Verify(*this); }]; +} + +def RnnResultConstraint : PredOpTrait< + "the input and result tensor elemental types must be same", + TCresVTEtIsSameAsOp<0, 0>>; + +// UnidirectionalSequenceRNN op. 
+def TFL_UnidirectionalSequenceRNNOp : + TFL_Op<"unidirectional_sequence_rnn", + [RnnResultConstraint, StatefulOperands<[4]>]> { + + let summary = "Unidirectional sequence rnn operator"; + + let description = [{ + A recurrent neural network specified by an RNN cell. This Op takes in input + in a format {batch_size, seq_len, input_size} or + {seq_len, batch_size, input_size} if it's time-majored. + + It implements the following operation for + each element in the sequence s = 1...sequence_length: + outputs[s] = state = activation(RNNOp(inputs[s])) + + where RNNOp is RNNOp TF Lite Op and the “activation” is the function passed + as the “fused_activation_function” argument (if not “NONE”). + }]; + + let arguments = ( + ins TensorOf<[F32, I8]>:$input, + + // Weights + TensorOf<[F32, I8]>:$input_to_input_weights, + + // Recurrent weights + TensorOf<[F32, I8]>:$recurrent_to_input_weights, + + // Bias + TensorOf<[F32]>:$input_gate_bias, + + // Hidden state. + TFL_StatefulTensor:$hidden_state, + + // Attributes + BoolAttr:$time_major, + TFL_AFAttr:$fused_activation_function + ); + + let results = (outs TensorOf<[F32, I8]>:$output); + + let hasOptions = 1; + + let customOption = "SequenceRNNOptions"; + + let verifier = [{ return Verify(*this); }]; +} + +def TFL_WhereOp : TFL_Op<"where", [NoSideEffect]> { + let summary = "Returns locations of nonzero / true values in a tensor."; + + let description = [{ +This operation returns the coordinates of true elements in `condition`. The +coordinates are returned in a 2-D tensor where the first dimension (rows) +represents the number of true elements, and the second dimension (columns) +represents the coordinates of the true elements. Keep in mind, the shape of +the output tensor can vary depending on how many true values there are in +`condition`. Indices are output in row-major order. + }]; + + let arguments = (ins + I1Tensor:$input + ); + + // TODO(haoliang): TF Lite only support I32 output right now, need to fix + // either here or in the kernel. + let results = (outs + TFL_I32OrI64Tensor:$index + ); +} + +def SVDFResultConstraint: PredOpTrait< + "the input and result tensor elemental types must be same", + TCresVTEtIsSameAsOp<0, 0>>; + +// SVDF op. +def TFL_SVDFOp : + TFL_Op<"svdf", + [SVDFResultConstraint, StatefulOperands<[4]>]> { + + let summary = "Single value decomposition filter operator"; + + let description = [{ + The SVDF op is a decomposition of a densely connected op into low rank + filters. + For details: https://research.google.com/pubs/pub43813.html + https://arxiv.org/abs/1812.02802 + }]; + + let arguments = ( + ins TensorOf<[F32, I8]>:$input, + + // Feature Weights. + TensorOf<[F32, I8]>:$feature_weights, + + // Time weights + TensorOf<[F32, I8]>:$time_weights, + + // Bias + TFL_TensorOfOrNone<[F32]>:$input_gate_bias, + + // Activation state. + TFL_StatefulTensor:$activation_state, + + // Attributes + I32Attr:$rank, + TFL_AFAttr:$fused_activation_function + ); + + let results = (outs TensorOf<[F32, I8]>:$output); + + let hasOptions = 1; + + let verifier = [{ return Verify(*this); }]; } #endif // TFL_OPS diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_traits.h b/tensorflow/compiler/mlir/lite/ir/tfl_traits.h index 807c1100b71..af8c707a04e 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_traits.h +++ b/tensorflow/compiler/mlir/lite/ir/tfl_traits.h @@ -18,108 +18,32 @@ limitations under the License. 
#ifndef TENSORFLOW_COMPILER_MLIR_LITE_IR_TFL_TRAITS_H_ #define TENSORFLOW_COMPILER_MLIR_LITE_IR_TFL_TRAITS_H_ -#include "mlir/Dialect/QuantOps/QuantTypes.h" // TF:local_config_mlir +#include "mlir/IR/OpDefinition.h" #include "mlir/Support/LLVM.h" // TF:local_config_mlir -#include "tensorflow/compiler/mlir/lite/utils/quantization_utils.h" namespace mlir { namespace OpTrait { namespace TFL { -using QuantizedType = mlir::quant::QuantizedType; -using UniformQuantizedType = mlir::quant::UniformQuantizedType; - -// The base class that all the quantization related OpTrait implements. -template class TraitType> -struct QuantizationSpecTraitBase : public TraitBase { - static bool IsBias(int index) { return false; } - static bool IsQuantizable() { return true; } -}; - -// This class provides the API for TFL ops that requires same input and output -// scale as the quantization results. This is used as a trait like this: -// -// class TransposeOp -// : public Op { -// -template -class SameOperandsAndResultsScale - : public QuantizationSpecTraitBase {}; - -// This class provides the API for TFL ops that has a fixed output value range. +// The trait to specify that the specified operands of the TFL op are stateful. // This is used as a trait like this: // -// class SoftmaxOp -// : public Op::Impl> { +// class LSTMOp +// : public Op::Impl> { // -// TODO(fengliuai): create a better way to epxress floating point scale in the -// template argument list. -template -class FixedResultUniformScale { +template +class StatefulOperands { public: template class Impl - : public QuantizationSpecTraitBase< - ConcreteType, FixedResultUniformScale< - BitWidth, ZeroPoint, ScaleMantissa, ScaleExp, - StorageTypeMin, StorageTypeMax, Sign>::Impl> { + : public TraitBase::Impl> { public: - QuantizedType GetResultQuantizedType(int index) { - auto op = this->getOperation(); - auto result_type = - op->getResult(index)->getType().template cast(); - Builder builder(op->getContext()); - IntegerType storage_type = builder.getIntegerType(BitWidth); - const double scale = static_cast(ScaleMantissa) * - ::pow(10.0, static_cast(ScaleExp)); - return UniformQuantizedType::getChecked( - Sign, storage_type, result_type.getElementType(), scale, ZeroPoint, - StorageTypeMin, StorageTypeMax, builder.getUnknownLoc()); - } - }; -}; - -// This class provides the API for TFL ops that has input as bias. This is used -// as a trait like this: -// -// class Conv2DOp -// : public Op::Impl> { -// -// TODO(fengliuai): supports a configurable accumulator bit width. -template -class AccumulatorUniformScale { - public: - template - class Impl - : public QuantizationSpecTraitBase< - ConcreteType, AccumulatorUniformScale::Impl> { - public: - // Whether the index-th operand is a bias. - static bool IsBias(int index) { return index == Bias; } - - // Returns the indexes of all the non-bias operands. - static std::vector GetAllNonBiasOperands() { + static std::vector GetStatefulOperands() { return std::vector({Operands...}); } }; }; -// This class provides the API for TFL ops that shouldn't be quantized. 
This is -// used as a trait like this: -// -// class LessOp : public Op { -// -template -class NoQuantizableResult - : public QuantizationSpecTraitBase { - public: - static bool IsQuantizable() { return false; } -}; - } // namespace TFL } // namespace OpTrait } // namespace mlir diff --git a/tensorflow/compiler/mlir/lite/mlir_tflite_runner.cc b/tensorflow/compiler/mlir/lite/mlir_tflite_runner.cc index ff27ad76136..52a8bd35d36 100644 --- a/tensorflow/compiler/mlir/lite/mlir_tflite_runner.cc +++ b/tensorflow/compiler/mlir/lite/mlir_tflite_runner.cc @@ -28,7 +28,6 @@ limitations under the License. #include "llvm/Support/CommandLine.h" #include "llvm/Support/InitLLVM.h" #include "llvm/Support/MemoryBuffer.h" -#include "llvm/Support/PrettyStackTrace.h" #include "llvm/Support/SMLoc.h" #include "llvm/Support/SourceMgr.h" #include "mlir/IR/Function.h" // TF:local_config_mlir @@ -79,9 +78,7 @@ static std::string TfLiteTensorString(const TfLiteTensor& tensor) { } int main(int argc, char** argv) { - llvm::PrettyStackTraceProgram x(argc, argv); llvm::InitLLVM y(argc, argv); - llvm::cl::ParseCommandLineOptions(argc, argv, "MLIR TFLite runner\n"); auto file_or_err = llvm::MemoryBuffer::getFileOrSTDIN(inputFileName.c_str()); diff --git a/tensorflow/compiler/mlir/lite/operator_writer_gen.cc b/tensorflow/compiler/mlir/lite/operator_converter_gen.cc similarity index 84% rename from tensorflow/compiler/mlir/lite/operator_writer_gen.cc rename to tensorflow/compiler/mlir/lite/operator_converter_gen.cc index fd8325577d9..5db1aa1a3c0 100644 --- a/tensorflow/compiler/mlir/lite/operator_writer_gen.cc +++ b/tensorflow/compiler/mlir/lite/operator_converter_gen.cc @@ -21,7 +21,7 @@ limitations under the License. #include "llvm/Support/CommandLine.h" #include "llvm/Support/FormatVariadic.h" -#include "llvm/Support/PrettyStackTrace.h" +#include "llvm/Support/InitLLVM.h" #include "llvm/Support/Signals.h" #include "llvm/TableGen/Error.h" #include "llvm/TableGen/Main.h" @@ -247,6 +247,51 @@ static void EmitBuildOperator(const std::vector &defs, "}\n"; } +// Emit a function that converts a BuiltinOptionsUnion to a vector of attributes +// Signature: +// void mlir::BuiltinOptionsToAttributes( +// tflite::BuiltinOptionsUnion op_union, +// mlir::Builder builder, +// llvm::SmallVectorImpl &attributes); +static void EmitBuiltinOptionsToAttributes(const RecordKeeper &record_keeper, + const std::vector &defs, + raw_ostream *ostream) { + raw_ostream &os = *ostream; + + // Signature + os << "void mlir::BuiltinOptionsToAttributes(" + "tflite::BuiltinOptionsUnion op_union, " + "mlir::Builder builder, " + "llvm::SmallVectorImpl &attributes) {\n"; + + const auto attr_type = record_keeper.getClass("Attr"); + for (const auto *def : defs) { + if (!def->getValueAsBit("hasOptions")) continue; + auto option_name = GetOperatorOptionName(*def); + os << formatv(" if(const auto *op = op_union.As{0}()) {\n", option_name); + + // We only care about options that are in arguments + auto *arg_values = def->getValueAsDag("arguments"); + for (unsigned i = 0, e = arg_values->getNumArgs(); i != e; ++i) { + auto arg = arg_values->getArg(i); + DefInit *arg_def = dyn_cast(arg); + if (!arg_def) continue; + if (arg_def->getDef()->isSubClassOf(attr_type)) { + StringRef arg_name = arg_values->getArgNameStr(i); + StringRef attr_type = mlir::tblgen::Attribute(arg_def).getAttrDefName(); + os << formatv( + " attributes.emplace_back(builder.getNamedAttr(\"{0}\"," + " Build{1}(op->{0}, builder)));\n", + arg_name, attr_type); + } + } + + os << " return;\n"; + os << " 
}\n"; + } + // Fallthrough case is no attributes + os << "}"; +} // The function below has a non-constant reference as that is required by LLVM's // TableGenMain. // NOLINTNEXTLINE @@ -278,15 +323,14 @@ static bool OperatorWritersMain(raw_ostream &os, RecordKeeper &records) { EmitGetBuiltinOpCode(defs, &os); os << "\n\n"; EmitBuildOperator(defs, &os); + os << "\n\n"; + EmitBuiltinOptionsToAttributes(records, defs, &os); return false; } int main(int argc, char **argv) { - llvm::sys::PrintStackTraceOnErrorSignal(argv[0]); - llvm::PrettyStackTraceProgram X(argc, argv); - - llvm::llvm_shutdown_obj Y; + llvm::InitLLVM y(argc, argv); llvm::cl::ParseCommandLineOptions(argc, argv); return TableGenMain(argv[0], &OperatorWritersMain); } diff --git a/tensorflow/compiler/mlir/lite/python/BUILD b/tensorflow/compiler/mlir/lite/python/BUILD index 0eab2981a83..5094b015f68 100644 --- a/tensorflow/compiler/mlir/lite/python/BUILD +++ b/tensorflow/compiler/mlir/lite/python/BUILD @@ -15,7 +15,10 @@ cc_library( hdrs = [ "graphdef_to_tfl_flatbuffer.h", ], + copts = ["-std=c++14"], deps = [ + "//tensorflow/compiler/mlir/lite:common", + "//tensorflow/compiler/mlir/lite:tf_tfl_passes", "//tensorflow/compiler/mlir/lite:tf_to_tfl_flatbuffer", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:convert_graphdef", diff --git a/tensorflow/compiler/mlir/lite/python/graphdef_to_tfl_flatbuffer.cc b/tensorflow/compiler/mlir/lite/python/graphdef_to_tfl_flatbuffer.cc index 2a60715e13d..b2bca0b4f54 100644 --- a/tensorflow/compiler/mlir/lite/python/graphdef_to_tfl_flatbuffer.cc +++ b/tensorflow/compiler/mlir/lite/python/graphdef_to_tfl_flatbuffer.cc @@ -17,8 +17,10 @@ limitations under the License. #include "mlir/IR/MLIRContext.h" // TF:local_config_mlir #include "mlir/IR/Module.h" // TF:local_config_mlir +#include "tensorflow/compiler/mlir/lite/common/tfl_pass_config.h" +#include "tensorflow/compiler/mlir/lite/tf_tfl_passes.h" #include "tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.h" -#include "tensorflow/compiler/mlir/tensorflow/translate/import_graphdef.h" +#include "tensorflow/compiler/mlir/tensorflow/translate/import_model.h" #include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" #include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/lib/core/errors.h" @@ -129,16 +131,26 @@ Status ConvertGraphDefToTFLiteFlatBuffer(const toco::ModelFlags& model_flags, bool emit_select_tf_ops = toco_flags.enable_select_tf_ops(); bool emit_custom_ops = toco_flags.allow_custom_ops(); specs.prune_unused_nodes = true; + specs.convert_legacy_fed_inputs = true; + specs.graph_as_function = false; WarningUnusedFlags(model_flags, toco_flags); - bool emit_quant_adaptor_ops = false; - bool lower_tensor_list_ops = true; TF_ASSIGN_OR_RETURN( auto module, ConvertGraphdefToMlir(input, debug_info, specs, &context)); - return ConvertTFControlFlowToTFLOrFlatbuffer( + + mlir::PassManager pm; + bool run_quantize = tensorflow::ShouldRunQuantizePasses(module.get()); + mlir::TFL::PassConfig pass_config; + pass_config.emit_builtin_tflite_ops = emit_builtin_tflite_ops; + pass_config.run_quantize = run_quantize; + pass_config.lower_tensor_list_ops = true; + + tensorflow::AddTFToTFLConversionPasses(pass_config, &pm); + + return ConvertTFExecutorToTFLOrFlatbuffer( module.get(), /*export_to_mlir=*/false, emit_builtin_tflite_ops, - emit_select_tf_ops, emit_custom_ops, emit_quant_adaptor_ops, - lower_tensor_list_ops, result); + emit_select_tf_ops, emit_custom_ops, 
/*emit_quant_adaptor_ops=*/false, + /*lower_tensor_list_ops=*/true, result, &pm); } } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/lite/quantization/BUILD b/tensorflow/compiler/mlir/lite/quantization/BUILD new file mode 100644 index 00000000000..57b9a48e5de --- /dev/null +++ b/tensorflow/compiler/mlir/lite/quantization/BUILD @@ -0,0 +1,60 @@ +load("//tensorflow:tensorflow.bzl", "tf_native_cc_binary") + +package( + default_visibility = [ + ":friends", + ], + licenses = ["notice"], # Apache 2.0 +) + +package_group( + name = "friends", + includes = ["@local_config_mlir//:subpackages"], + packages = ["//tensorflow/compiler/mlir/..."], +) + +exports_files(["quantization_traits.h"]) + +filegroup( + name = "quantization_td_files", + srcs = [ + "quantization.td", + "@local_config_mlir//:OpBaseTdFiles", + "@local_config_mlir//:QuantizationOpsTdFiles", + ], +) + +cc_library( + name = "quantization_lib", + srcs = [ + "quantization_driver.cc", + "quantization_utils.cc", + ], + hdrs = [ + "quantization_utils.h", + ], + copts = ["-std=c++14"], + deps = [ + "@com_google_absl//absl/memory", + "@llvm//:support", + "@local_config_mlir//:IR", + "@local_config_mlir//:QuantOps", + "@local_config_mlir//:StandardOps", + "@local_config_mlir//:Support", + # TODO(fengliuai): remove this dependence. + "//tensorflow/compiler/mlir/lite:tensorflow_lite", + "//tensorflow/core:lib_proto_parsing", + ], +) + +tf_native_cc_binary( + name = "op_quant_spec_getters_gen", + srcs = [ + "tools/op_quant_spec_getters_gen.cc", + ], + deps = [ + "@llvm//:support", + "@llvm//:tablegen", + "@local_config_mlir//:TableGen", + ], +) diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization.td b/tensorflow/compiler/mlir/lite/quantization/quantization.td new file mode 100644 index 00000000000..24b299ba39b --- /dev/null +++ b/tensorflow/compiler/mlir/lite/quantization/quantization.td @@ -0,0 +1,105 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This is the quantization definition file for TensorFlow. + +#ifdef TF_Quantization +#else +#define TF_Quantization + +#ifdef OP_BASE +#else +include "mlir/IR/OpBase.td" +#endif // OP_BASE + +include "mlir/Dialect/QuantOps/QuantPredicates.td" + + +//===----------------------------------------------------------------------===// +// Min-max range pair definitions. +//===----------------------------------------------------------------------===// + +// A pair of floating point values which defines the min and max of a value +// range for quantization. The attribute is allowed to be empty or +// have 2 elements. +def MinMaxAttr : Attr().size() == 0">, + CPred<"$_self.cast().size() == 2">]>, + "min-max range pair"> { + let storageType = [{ ArrayAttr }]; + let returnType = [{ ArrayRef }]; +} + +//===----------------------------------------------------------------------===// +// QuantizedType definitions. 
+//===----------------------------------------------------------------------===// + +// The base class of a quantized type. +class QuantizedType params, bit signed> + : Type()">, + CPred<"$_self.cast()" # + ".getStorageTypeIntegralWidth() == " # !head(params)>]>, + "Q" # !if (signed, "I", "UI") # !head(params) # " type"> { + string name = n; + string asTraitArgsStr = + StrJoinInt.result # !if(signed, ", true", ", false"); +} + +// Uniform quantized types. Two integers "smantissa" and "sexp" are used to +// express the Mantissa and Exponent components of the floating-point scale so +// the scale of the quantized type is "smantissa * 10 ^ sexp". +class UInt8UniformQuantizedType + : QuantizedType<"Uniform", + [8, zero_pt, smantissa, sexp, 0, 255], 0>; +class Int8UniformQuantizedType + : QuantizedType<"Uniform", + [8, zero_pt, smantissa, sexp, -128, 127], 1>; + +// General uniform quantized types. The definitions can be used to specify +// operand's tensor types. +def QUI8 : QuantizedType<"Uniform", [8], 0>; +def QI8 : QuantizedType<"Uniform", [8], 1>; +def QUI16 : QuantizedType<"Uniform", [16], 0>; +def QI16 : QuantizedType<"Uniform", [16], 1>; +def QUI32 : QuantizedType<"Uniform", [32], 0>; +def QI32 : QuantizedType<"Uniform", [32], 1>; + +//===----------------------------------------------------------------------===// +// TFL native op traits (for quantization). +// +// Ops in this link should have those traits specified: +// https://www.tensorflow.org/lite/performance/quantization_spec +//===----------------------------------------------------------------------===// + +// Specify this trait if the op has a fixed output value range. +class FixedResultScale : NativeOpTrait::Impl")>; + +// Specify this trait if the op requires same inputs and outputs quantization +// scales. +def SameOperandsAndResultsScale : NativeOpTrait< + "quant::SameOperandsAndResultsScale">; + +// Specify this trait if the b-th input of the op is a bias input, which needs +// a scale based on the scales of op1 and op2. +class AccumulatorUniformScale : NativeOpTrait< + !strconcat("quant::AccumulatorUniformScale<", + StrJoinInt<[bias, op1, op2]>.result, + ">::Impl")>; + +// Specify this trait if the op doesn't have quantizable ouput. We shouldn't +// apply quantization on this op. +def NoQuantizableResult : NativeOpTrait<"quant::NoQuantizableResult">; + +#endif // TF_Quantization diff --git a/tensorflow/compiler/mlir/lite/utils/quantization_driver.cc b/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc similarity index 87% rename from tensorflow/compiler/mlir/lite/utils/quantization_driver.cc rename to tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc index 1ab00ec3634..63c055c1ac8 100644 --- a/tensorflow/compiler/mlir/lite/utils/quantization_driver.cc +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc @@ -17,13 +17,14 @@ limitations under the License. #include #include -#include "absl/memory/memory.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "mlir/Dialect/QuantOps/QuantTypes.h" // TF:local_config_mlir +#include "mlir/Dialect/StandardOps/Ops.h" // TF:local_config_mlir #include "mlir/IR/Attributes.h" // TF:local_config_mlir #include "mlir/IR/Builders.h" // TF:local_config_mlir #include "mlir/IR/Function.h" // TF:local_config_mlir @@ -32,47 +33,15 @@ limitations under the License. 
#include "mlir/IR/Operation.h" // TF:local_config_mlir #include "mlir/IR/StandardTypes.h" // TF:local_config_mlir #include "mlir/IR/Value.h" // TF:local_config_mlir -#include "mlir/StandardOps/Ops.h" // TF:local_config_mlir #include "mlir/Support/LLVM.h" // TF:local_config_mlir #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" #include "tensorflow/compiler/mlir/lite/ir/tfl_traits.h" -#include "tensorflow/compiler/mlir/lite/utils/quantization_utils.h" +#include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h" #include "tensorflow/core/platform/logging.h" namespace mlir { namespace TFL { namespace { - -using QuantParams = quant::QuantizedType; -using AccumulatorScaleFunc = - std::function &)>; -using SignedInteger = std::pair; // bitwidth and sign -using QuantParamsForResults = llvm::SmallVector; - -// Quantization specs of ops, driving the TF Lite quantization algorithm. -struct OpQuantSpec { - // Whether the op has quantizable result. This flag is set to false if the op - // has "TFL::NoQuantizableResult" trait. - bool is_quantizable = true; - - // Whether it requires same inputs and result scale. This flag is set to true - // if the op has "TFL::SameOperandsAndResultScale" trait. - bool requires_same_scale = false; - - // Maps the operand index of a bias input to its quantization specifications, - // including the non-bias operand indexes and the method retrieving - // quantization parameters from list of parameters of the non-bias operands. - // This map is empty if the op doesn't havea bias operand. - std::unordered_map, AccumulatorScaleFunc>> - biases_params; - - // Quantization parameters for value restricted outputs. This is the - // "hard-coded" parameters and should be used unconditionally for the - // quantized op. This vector is empty if the op doesn't have value resctricted - // outputs. - llvm::DenseMap restricted_output_params; -}; - static bool EmptyParams(QuantParams p) { return p == quant::QuantizedType(); } // The state for each op result during the quantization parameters propagation. @@ -125,8 +94,12 @@ struct RequantizeState { // class QuantizationDriver { public: - explicit QuantizationDriver(FuncOp fn, bool is_signed) - : fn_(fn), builder_(fn.getBody()), is_signed_(is_signed) {} + explicit QuantizationDriver(FuncOp fn, bool is_signed, + OpQuantSpecGetter op_quant_spec_getter) + : fn_(fn), + builder_(fn.getBody()), + is_signed_(is_signed), + op_quant_spec_getter_(op_quant_spec_getter) {} // The entry point of the quantization parameters propagation. void Run(); @@ -146,17 +119,19 @@ class QuantizationDriver { // result. void Finalize(); - // Whether the constant is used as a bias input of another op. Here we assume - // bias is used immediately by the user. This assumption is always correct - // after constant folding. - bool UsedAsBias(ConstantOp cst) { - Value *value = cst.getResult(); - for (auto &use : value->getUses()) { - auto biases = GetQuantSpec(use.getOwner())->biases_params; - if (biases.find(use.getOperandNumber()) != biases.end()) return true; - } - return false; - } + // The quantization parameters of bias operand are usually determined by + // other operands, so if a constant is used by different ops as bias, it needs + // to be duplicated, thus each op can assign its own quantization parameter + // for this bias. Also this methods add all the non-bias constants to a set + // for looking up later. + void PreprocessConstantOps(); + + // Setup all the data structures for quantization propagation. 
+ void SetupAllStates(); + + // Whether the constant is a weight, which shouldn't be shared by different + // ops. + bool IsWeight(Operation *cst) { return llvm::is_contained(weights_, cst); } // Returns all the related quantization constraints of the op. std::unique_ptr GetQuantSpec(Operation *op); @@ -294,6 +269,11 @@ class QuantizationDriver { OpBuilder builder_; bool is_signed_; + // We should distinguish weights and bias constants. Biases are specified by + // the quantization spec or are the operands of ops with same scale spec. The + // rest are weights. + llvm::DenseSet weights_; + // All the ops needs to propagate the quantization parameters to. std::vector work_list_; std::unordered_set quantized_; @@ -316,14 +296,13 @@ class QuantizationDriver { // This vector is to preserve the arguments order, so the newly inserted // quantized ops for the arguments are deterministically ordered. llvm::SmallVector args_; -}; -#include "tensorflow/compiler/mlir/lite/utils/generated_op_quant_spec_getters.inc" + OpQuantSpecGetter op_quant_spec_getter_; +}; } // namespace -// TODO(fengliuai): cache the quantization parameters. std::unique_ptr QuantizationDriver::GetQuantSpec(Operation *op) { - return GetOpQuantSpec(op); + return op_quant_spec_getter_(op); } bool QuantizationDriver::IsQuantized(Operation *op) { @@ -354,10 +333,10 @@ bool QuantizationDriver::SetConstantResultParams(Operation *op) { if (!matchPattern(res, m_Constant(&attr))) { return false; } - // TODO(fengliuai): the bit width should be determined by its user. + // TODO(fengliuai): make storage_type_width and narrow_range configurable. auto final_type = - GetUniformQuantizedTypeForElementsAttr( - attr, /*storage_type_width=*/8, is_signed_, /*narrow_range_=*/false) + GetUniformQuantizedTypeForElementsAttr(attr, /*storage_type_width=*/8, + is_signed_, /*narrow_range_=*/true) .dyn_cast_or_null(); if (!final_type) return false; return SetResultParams(op, 0, final_type); @@ -432,6 +411,9 @@ void QuantizationDriver::QuantizeValue(Value *value, QuantParams params, Location loc) { Type expressed_type = value->getType(); Type new_type = params.castFromExpressedType(expressed_type); + // This value isn't an expressed type (float), skip. + if (!new_type) return; + TypeAttr type_attr = builder_.getTypeAttr(new_type); auto quantize = builder_.create(loc, new_type, value, type_attr); @@ -482,10 +464,15 @@ void QuantizationDriver::RequantizeValue(Value *value, RequantizeState *state, } else { Type expressed_type = quant::QuantizedType::castToExpressedType(value->getType()); + if (!expressed_type) return; + // The value needs to be requantized. A Quantize op will be created to use // it as the operand and replace its uses. new_type = state->params.castFromExpressedType(expressed_type); } + // This value isn't an expressed type (float), skip. + if (!new_type) return; + TypeAttr type_attr = builder_.getTypeAttr(new_type); auto requantize_op = builder_.create(loc, new_type, value, type_attr); @@ -560,12 +547,39 @@ QuantParams QuantizationDriver::GetQuantParamsForSameScaleConstraint( return {}; } -// This method scans the operations in the function to setup the initial -// states for quantization parameter propagation. -// TODO(fengliuai): This algorithm assumes there are only one pair of -// tfl.quantize and tfl.dequantize ops between two quantizable ops. A sanity -// check should be applied. 
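// As a minimal illustration of the bias duplication performed by
// PreprocessConstantOps below (hypothetical shapes; attributes and result
// types elided), a constant feeding the bias operand of two ops,
//
//   %bias = constant dense<0.0> : tensor<16xf32>
//   %0 = "tfl.conv_2d"(%in0, %w0, %bias) ...
//   %1 = "tfl.conv_2d"(%in1, %w1, %bias) ...
//
// keeps the original constant for its first user and gets a copy for each
// additional user, so every op can later assign its own bias quantization
// parameters:
//
//   %bias  = constant dense<0.0> : tensor<16xf32>
//   %bias1 = constant dense<0.0> : tensor<16xf32>
//   %0 = "tfl.conv_2d"(%in0, %w0, %bias) ...
//   %1 = "tfl.conv_2d"(%in1, %w1, %bias1) ...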
-void QuantizationDriver::Initialize() { +void QuantizationDriver::PreprocessConstantOps() { + fn_.walk([&](ConstantOp cst) { + // Non-float tensors are neither weights or require quantization. + if (!cst.getType().cast().getElementType().isa()) { + return; + } + + Value *value = cst.getResult(); + SmallVector, 4> bias_users; + for (auto &use : value->getUses()) { + auto spec = GetQuantSpec(use.getOwner()); + auto biases = spec->biases_params; + Operation *user = use.getOwner(); + int operand_num = use.getOperandNumber(); + + // The user doesn't use this value as a bias operand nor require same + // scale. + if (biases.find(operand_num) == biases.end() && + !spec->requires_same_scale) { + weights_.insert(cst); + } else { + bias_users.push_back({user, operand_num}); + } + } + builder_.setInsertionPoint(cst); + for (int i = 1; i < bias_users.size(); ++i) { + auto copied = builder_.create(cst.getLoc(), cst.getValue()); + bias_users[i].first->setOperand(bias_users[i].second, copied.getResult()); + } + }); +} + +void QuantizationDriver::SetupAllStates() { llvm::DenseMap value_to_state; fn_.walk([&](Operation *op) { @@ -603,6 +617,21 @@ void QuantizationDriver::Initialize() { }); } +// This method scans the operations in the function to setup the initial +// states for quantization parameter propagation. +// TODO(fengliuai): This algorithm assumes there are only one pair of +// tfl.quantize and tfl.dequantize ops between two quantizable ops. A sanity +// check should be applied. +void QuantizationDriver::Initialize() { + // Duplicate the bias constant, so the states can be setup correctly. + // TODO(fengliuai): Function definition should also be duplicated if there are + // multiple call sites. + PreprocessConstantOps(); + + // Setup all the internal states. + SetupAllStates(); +} + bool QuantizationDriver::PropagateParams() { // TODO(fengliuai): uses a typed indicator instead of a bool value. bool changed = false; @@ -610,8 +639,8 @@ bool QuantizationDriver::PropagateParams() { Operation *op = work_list_.back(); work_list_.pop_back(); - // This op has been quantized, so we should consider it again. - if (quantized_.find(op) != quantized_.end()) continue; + // This op has been quantized, so we should not consider it again. + if (llvm::is_contained(quantized_, op)) continue; quantized_.insert(op); auto spec = GetQuantSpec(op); @@ -621,9 +650,8 @@ bool QuantizationDriver::PropagateParams() { if (!spec->is_quantizable) continue; if (auto cst = llvm::dyn_cast(op)) { - // This constant is used as a bias in another op, then the quantization - // parameters are determined by that op. - if (UsedAsBias(cst) || IsQuantized(op)) continue; + // If it isn't a weight or has been quantized, skip. + if (!IsWeight(cst) || IsQuantized(op)) continue; // The quantization parameters are determined by the content of the // constant. @@ -648,11 +676,13 @@ bool QuantizationDriver::PropagateParams() { for (int res = 0, e = op->getNumResults(); res != e; ++res) changed |= SetResultParams(op, res, params); } + // TODO(fengliuai): make the bit width configurable. 
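// The (bit width, sign) pair below keys into restricted_output_params, whose
// entries come from the FixedResultUniformScale op traits; the bit width is
// fixed at 8 until the TODO above is addressed.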
auto key = std::make_pair(8, is_signed_); auto &restricted_outputs = spec->restricted_output_params[key]; - for (int i = 0, e = restricted_outputs.size(); i != e; ++i) + for (int i = 0, e = restricted_outputs.size(); i != e; ++i) { changed |= SetResultParams(op, i, restricted_outputs[i]); + } for (auto &it : spec->biases_params) { auto params = @@ -712,8 +742,9 @@ void QuantizationDriver::Run() { } } -void ApplyQuantizationParamsPropagation(mlir::FuncOp func, bool is_signed) { - QuantizationDriver(func, is_signed).Run(); +void ApplyQuantizationParamsPropagation( + mlir::FuncOp func, bool is_signed, OpQuantSpecGetter op_quant_spec_getter) { + QuantizationDriver(func, is_signed, op_quant_spec_getter).Run(); } } // namespace TFL diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_traits.h b/tensorflow/compiler/mlir/lite/quantization/quantization_traits.h new file mode 100644 index 00000000000..b64776ddee7 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_traits.h @@ -0,0 +1,126 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file defines the op traits used in the MLIR TensorFlow Lite dialect. + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_QUANTIZATION_TRAITS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_QUANTIZATION_TRAITS_H_ + +#include "mlir/Dialect/QuantOps/QuantTypes.h" // TF:local_config_mlir +#include "mlir/Support/LLVM.h" // TF:local_config_mlir + +namespace mlir { +namespace OpTrait { +namespace quant { + +using QuantizedType = mlir::quant::QuantizedType; +using UniformQuantizedType = mlir::quant::UniformQuantizedType; + +// The base class that all the quantization related OpTrait implements. +template class TraitType> +struct QuantizationSpecTraitBase : public TraitBase { + static bool IsBias(int index) { return false; } + static bool IsQuantizable() { return true; } +}; + +// This class provides the API for TFL ops that requires same input and output +// scale as the quantization results. This is used as a trait like this: +// +// class TransposeOp +// : public Op { +// +template +class SameOperandsAndResultsScale + : public QuantizationSpecTraitBase {}; + +// This class provides the API for TFL ops that has a fixed output value range. +// This is used as a trait like this: +// +// class SoftmaxOp +// : public Op::Impl> { +// +// TODO(fengliuai): create a better way to epxress floating point scale in the +// template argument list. 
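// As a concrete, illustrative instantiation (values follow the TFLite
// quantization spec for a signed 8-bit softmax result, scale 1/256 and zero
// point -128; the exact op declaration may differ):
//
//   FixedResultUniformScale<8, -128, 390625, -8, -128, 127, true>
//
// where the scale is encoded as ScaleMantissa * 10^ScaleExp, i.e.
// 390625 * 10^-8 == 0.00390625 == 1/256.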
+template +class FixedResultUniformScale { + public: + template + class Impl + : public QuantizationSpecTraitBase< + ConcreteType, FixedResultUniformScale< + BitWidth, ZeroPoint, ScaleMantissa, ScaleExp, + StorageTypeMin, StorageTypeMax, Sign>::Impl> { + public: + QuantizedType GetResultQuantizedType(int index) { + auto op = this->getOperation(); + auto result_type = + op->getResult(index)->getType().template cast(); + Builder builder(op->getContext()); + IntegerType storage_type = builder.getIntegerType(BitWidth); + const double scale = static_cast(ScaleMantissa) * + ::pow(10.0, static_cast(ScaleExp)); + return UniformQuantizedType::getChecked( + Sign, storage_type, result_type.getElementType(), scale, ZeroPoint, + StorageTypeMin, StorageTypeMax, builder.getUnknownLoc()); + } + }; +}; + +// This class provides the API for TFL ops that has input as bias. This is used +// as a trait like this: +// +// class Conv2DOp +// : public Op::Impl> { +// +// TODO(fengliuai): supports a configurable accumulator bit width. +template +class AccumulatorUniformScale { + public: + template + class Impl + : public QuantizationSpecTraitBase< + ConcreteType, AccumulatorUniformScale::Impl> { + public: + // Whether the index-th operand is a bias. + static bool IsBias(int index) { return index == Bias; } + + // Returns the indexes of all the non-bias operands. + static std::vector GetAllNonBiasOperands() { + return std::vector({Operands...}); + } + }; +}; + +// This class provides the API for TFL ops that shouldn't be quantized. This is +// used as a trait like this: +// +// class LessOp : public Op { +// +template +class NoQuantizableResult + : public QuantizationSpecTraitBase { + public: + static bool IsQuantizable() { return false; } +}; + +} // namespace quant +} // namespace OpTrait +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_QUANTIZATION_TRAITS_H_ diff --git a/tensorflow/compiler/mlir/lite/utils/quantization_utils.cc b/tensorflow/compiler/mlir/lite/quantization/quantization_utils.cc similarity index 90% rename from tensorflow/compiler/mlir/lite/utils/quantization_utils.cc rename to tensorflow/compiler/mlir/lite/quantization/quantization_utils.cc index da797db4cd4..31a7a181124 100644 --- a/tensorflow/compiler/mlir/lite/utils/quantization_utils.cc +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_utils.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "tensorflow/compiler/mlir/lite/utils/quantization_utils.h" +#include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h" #include "mlir/Dialect/QuantOps/FakeQuantSupport.h" // TF:local_config_mlir #include "mlir/Dialect/QuantOps/QuantTypes.h" // TF:local_config_mlir @@ -61,6 +61,17 @@ TypeAttr GetQuantizedTypeAttr(Builder builder, Type input_type, Attribute min, narrow_range.getValue(), /*is_signed=*/false); } +TypeAttr CastQuantizedTypeAttrFromExpressedType(Builder builder, + TypeAttr source, Type target) { + if (!source || !source.getValue().isa()) return {}; + auto ele_type = source.getValue().cast().getElementType(); + if (auto quantized_type = ele_type.dyn_cast()) { + Type final_type = quantized_type.castFromExpressedType(target); + if (final_type) return builder.getTypeAttr(final_type); + } + return {}; +} + Type GetUniformQuantizedTypeForElementsAttr(ElementsAttr attr, unsigned storage_type_width, bool is_signed, bool narrow_range) { diff --git a/tensorflow/compiler/mlir/lite/utils/quantization_utils.h b/tensorflow/compiler/mlir/lite/quantization/quantization_utils.h similarity index 52% rename from tensorflow/compiler/mlir/lite/utils/quantization_utils.h rename to tensorflow/compiler/mlir/lite/quantization/quantization_utils.h index 941ce636bc1..e101893b06d 100644 --- a/tensorflow/compiler/mlir/lite/utils/quantization_utils.h +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_utils.h @@ -16,18 +16,54 @@ limitations under the License. // This header file defines common utils used by TFLite transformation // passes to work with op attributes. -#ifndef TENSORFLOW_COMPILER_MLIR_LITE_UTILS_QUANTIZATION_UTILS_H_ -#define TENSORFLOW_COMPILER_MLIR_LITE_UTILS_QUANTIZATION_UTILS_H_ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_QUANTIZATION_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_QUANTIZATION_UTILS_H_ + +#include #include "mlir/Dialect/QuantOps/QuantTypes.h" // TF:local_config_mlir +#include "mlir/Dialect/StandardOps/Ops.h" // TF:local_config_mlir #include "mlir/IR/BlockAndValueMapping.h" // TF:local_config_mlir #include "mlir/IR/PatternMatch.h" // TF:local_config_mlir #include "mlir/IR/StandardTypes.h" // TF:local_config_mlir -#include "mlir/StandardOps/Ops.h" // TF:local_config_mlir namespace mlir { namespace TFL { +using QuantParams = quant::QuantizedType; +using SignedInteger = std::pair; // bitwidth and sign +using QuantParamsForResults = llvm::SmallVector; +using AccumulatorScaleFunc = + std::function&)>; + +// Quantization spec of an op, driving the quantization algorithm. +struct OpQuantSpec { + // Whether the op has quantizable result. This flag is set to false if the op + // has "TFL::NoQuantizableResult" trait. + bool is_quantizable = true; + + // Whether it requires same inputs and result scale. This flag is set to true + // if the op has "TFL::SameOperandsAndResultScale" trait. + bool requires_same_scale = false; + + // Maps the operand index of a bias input to its quantization specifications, + // including the non-bias operand indexes and the method retrieving + // quantization parameters from list of parameters of the non-bias operands. + // This map is empty if the op doesn't havea bias operand. + std::unordered_map, AccumulatorScaleFunc>> + biases_params; + + // Quantization parameters for value restricted outputs. This is the + // "hard-coded" parameters and should be used unconditionally for the + // quantized op. 
This vector is empty if the op doesn't have value resctricted + // outputs. + llvm::DenseMap restricted_output_params; +}; + +// A function signature for getting the particular OpQuantSpec for the provided +// op. +typedef std::unique_ptr (*OpQuantSpecGetter)(Operation* op); + // A generic rewrite pattern which matches any N-in-1-out operations with // quantization parameters propagated to all the operands and results values. // The quantization parameters are annotated by the Q/DQ op pairs. Each matched @@ -49,11 +85,11 @@ struct GenericFullQuantizationPattern : public RewritePattern { return matchFailure(); } auto quantize_op = cast(op); - auto quantized_op = quantize_op.input()->getDefiningOp(); + Operation* quantized_op = quantize_op.input()->getDefiningOp(); // If it is a block argument, requantize op, or has more than one result, we // shouldn't rewrite this op. if (!quantized_op || llvm::isa(quantized_op) || - llvm::isa(quantized_op) || quantized_op->getNumResults() != 1) { + llvm::isa(quantized_op)) { return matchFailure(); } @@ -61,21 +97,66 @@ struct GenericFullQuantizationPattern : public RewritePattern { // inputs. SmallVector inputs; inputs.reserve(quantized_op->getNumOperands()); - for (int i = 0, e = quantized_op->getNumOperands(); i != e; ++i) { - auto* operand = quantized_op->getOperand(i); + for (auto operand : quantized_op->getOperands()) { + Type operand_type = operand->getType(); + if (operand_type.isa()) { + inputs.push_back(operand); + continue; + } + auto operand_ele_type = + operand->getType().cast().getElementType(); if (auto op_inst = dyn_cast_or_null(operand->getDefiningOp())) { inputs.push_back(op_inst.input()); + } else if (operand_ele_type.isa()) { + // If the operand is an integer tensor, then it doesn't require the + // DQ op in the pattern. + inputs.push_back(operand); } else { return matchFailure(); } } + + // Collect all the quantized outputs and replace them by the results of the + // new quantized op. + llvm::SmallDenseMap outputs_replaced; + SmallVector output_types; + output_types.reserve(quantized_op->getNumResults()); + for (auto enumerated_result : llvm::enumerate(quantized_op->getResults())) { + Value* result = enumerated_result.value(); + Type result_type = result->getType(); + // Add this to the test coverage once we create test ops with none type + // results. + if (result_type.isa()) { + outputs_replaced.insert({result, enumerated_result.index()}); + output_types.push_back(result_type); + continue; + } + if (!result->hasOneUse()) return matchFailure(); + Type result_ele_type = + result->getType().cast().getElementType(); + if (auto user = dyn_cast_or_null(*result->user_begin())) { + outputs_replaced.insert({user.output(), enumerated_result.index()}); + output_types.push_back(user.getType()); + } else if (result_ele_type.template isa()) { + // If the result is an integer tensor, then it doesn't require the + // D op in the pattern. + outputs_replaced.insert({result, enumerated_result.index()}); + output_types.push_back(result_ele_type); + } else { + return matchFailure(); + } + } + // Use OpBuilder so we can use op name to create the new op. 
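// For orientation, a minimal sketch of this rewrite (op name, scale, and zero
// point are illustrative only): the matched float op surrounded by
// dequantize/quantize,
//
//   %dq = "tfl.dequantize"(%qin)
//       : (tensor<4x!quant.uniform<u8:f32, 0.1:128>>) -> tensor<4xf32>
//   %f  = "tfl.relu"(%dq) : (tensor<4xf32>) -> tensor<4xf32>
//   %q  = "tfl.quantize"(%f) {qtype = tensor<4x!quant.uniform<u8:f32, 0.1:128>>}
//       : (tensor<4xf32>) -> tensor<4x!quant.uniform<u8:f32, 0.1:128>>
//
// is replaced by one op built below that consumes and produces the quantized
// types directly:
//
//   %q = "tfl.relu"(%qin)
//       : (tensor<4x!quant.uniform<u8:f32, 0.1:128>>)
//       -> tensor<4x!quant.uniform<u8:f32, 0.1:128>>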
OpBuilder builder(quantized_op); - OperationState new_state( - quantized_op->getLoc(), quantized_op->getName().getStringRef(), inputs, - op->getResult(0)->getType(), quantized_op->getAttrs()); + OperationState new_state(quantized_op->getLoc(), + quantized_op->getName().getStringRef(), inputs, + output_types, quantized_op->getAttrs()); Operation* new_op = builder.createOperation(new_state); - rewriter.replaceOp(op, {new_op->getResult(0)}); + for (auto output : outputs_replaced) { + output.getFirst()->replaceAllUsesWith( + new_op->getResult(output.getSecond())); + } return matchSuccess(); } }; @@ -95,6 +176,16 @@ TypeAttr GetQuantizedTypeAttr(Builder builder, Type input_type, Attribute min, Attribute max, IntegerAttr num_bits, BoolAttr narrow_range); +// Casts the `target` type to a quantized type by using the quantization +// parameters from the type in the `source` type attribute. +// Examples: +// f32 -> !quant.uniform +// tensor<4xf32> -> tensor<4x!quant.uniform> +// The result is wrapped by a type attribute. Returns nullptr if the cast isn't +// valid. +TypeAttr CastQuantizedTypeAttrFromExpressedType(Builder builder, + TypeAttr source, Type target); + // Quantizes the elements in the attribute `real_value` by the quantization // parameters in `tensor_type`. Returns empty Attribute if the // `tensor_type` is not a QuantizedType or the quantization fails. @@ -119,9 +210,10 @@ quant::QuantizedType GetUniformQuantizedTypeForBias( // quantization parameters are stored as adjacent quantize and dequantize ops // and the propagation results are materialized by inserting pairs of quantize // and dequantize ops to this function. -void ApplyQuantizationParamsPropagation(mlir::FuncOp func, bool is_signed); +void ApplyQuantizationParamsPropagation(mlir::FuncOp func, bool is_signed, + OpQuantSpecGetter op_quant_spec_getter); } // end namespace TFL } // end namespace mlir -#endif // TENSORFLOW_COMPILER_MLIR_LITE_UTILS_QUANTIZATION_UTILS_H_ +#endif // TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_QUANTIZATION_UTILS_H_ diff --git a/tensorflow/compiler/mlir/lite/tools/op_quant_spec_getters_gen.cc b/tensorflow/compiler/mlir/lite/quantization/tools/op_quant_spec_getters_gen.cc similarity index 90% rename from tensorflow/compiler/mlir/lite/tools/op_quant_spec_getters_gen.cc rename to tensorflow/compiler/mlir/lite/quantization/tools/op_quant_spec_getters_gen.cc index 9be4a0bf9d7..b381a5fa898 100644 --- a/tensorflow/compiler/mlir/lite/tools/op_quant_spec_getters_gen.cc +++ b/tensorflow/compiler/mlir/lite/quantization/tools/op_quant_spec_getters_gen.cc @@ -38,9 +38,10 @@ static bool OpQuantSpecWriter(raw_ostream &os, RecordKeeper &records) { llvm::Regex acc_uniform_trait_regex{"AccumulatorUniformScale<([0-9]*),"}; llvm::Regex fixed_uniform_trait_regex{ "FixedResultUniformScale<([0-9]+).*(true|false)>"}; - emitSourceFileHeader("TensorFlow Lite Ops Quant Spec Getters", os); + emitSourceFileHeader("Generated Ops Quant Spec Getters", os); - // Retrieve all the definitions derived from TFL_Op and sort by record name. + // Retrieve all the definitions derived from Op defintion and sort by record + // name. std::vector defs = records.getAllDerivedDefinitions("Op"); llvm::sort(defs, LessRecord()); @@ -53,9 +54,7 @@ static bool OpQuantSpecWriter(raw_ostream &os, RecordKeeper &records) { for (const auto t : op.getTraits()) { if (auto opTrait = llvm::dyn_cast(&t)) { auto trait = opTrait->getTrait(); - // We only handle TFL specific native op traits. 
- if (!trait.startswith("TFL::")) continue; - trait.consume_front("TFL::"); + if (!trait.consume_front("OpTrait::quant::")) continue; OUT(2) << "if (auto tfl = llvm::dyn_cast<" << op.getQualCppClassName() << ">(op)) {\n"; @@ -74,7 +73,7 @@ static bool OpQuantSpecWriter(raw_ostream &os, RecordKeeper &records) { OUT(4) << "for (int i = 0, e = op->getNumResults(); i != e; ++i)\n"; OUT(6) << "spec->restricted_output_params[std::make_pair(" << matches[1] << ", " << matches[2] - << ")].push_back(tfl.OpTrait::TFL::" << trait << "<" + << ")].push_back(tfl.OpTrait::quant::" << trait << "<" << op.getQualCppClassName() << ">::GetResultQuantizedType(i));\n"; matches.clear(); @@ -98,7 +97,6 @@ static bool OpQuantSpecWriter(raw_ostream &os, RecordKeeper &records) { } int main(int argc, char **argv) { - llvm::PrettyStackTraceProgram X(argc, argv); llvm::InitLLVM y(argc, argv); llvm::cl::ParseCommandLineOptions(argc, argv); return TableGenMain(argv[0], &OpQuantSpecWriter); diff --git a/tensorflow/compiler/mlir/lite/tests/const-fold.mlir b/tensorflow/compiler/mlir/lite/tests/const-fold.mlir index da779c14ea8..68a9fb7bc3e 100644 --- a/tensorflow/compiler/mlir/lite/tests/const-fold.mlir +++ b/tensorflow/compiler/mlir/lite/tests/const-fold.mlir @@ -1,4 +1,4 @@ -// RUN: tf-opt %s -test-constant-fold | FileCheck %s +// RUN: tf-opt %s -test-constant-fold | FileCheck %s --dump-input-on-failure // CHECK-LABEL: @add_float func @add_float() -> (tensor, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) { @@ -109,6 +109,36 @@ func @mul_float() -> (tensor, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) return %5, %6, %7, %8 : tensor, tensor<4xf32>, tensor<4xf32>, tensor<4xf32> } +// CHECK-LABEL: @elementwise_unary_ops +func @elementwise_unary_ops() -> (tensor, tensor, tensor, tensor, tensor, tensor, tensor) { + %0 = constant dense<-1.0> : tensor + %1 = constant dense<1.0> : tensor + %2 = constant dense<1.0> : tensor + %3 = constant dense<1.0> : tensor + %4 = constant dense<4.0> : tensor + %5 = constant dense<4.0> : tensor + %6 = constant dense<2.0> : tensor + + // CHECK-DAG: [[cst0:%.*]] = constant dense<1.000000e+00> : tensor + // CHECK-DAG: [[cst1:%.*]] = constant dense<0.841470957> : tensor + // CHECK-DAG: [[cst2:%.*]] = constant dense<0.540302277> : tensor + // CHECK-DAG: [[cst3:%.*]] = constant dense<0.000000e+00> : tensor + // CHECK-DAG: [[cst4:%.*]] = constant dense<2.000000e+00> : tensor + // CHECK-DAG: [[cst5:%.*]] = constant dense<5.000000e-01> : tensor + // CHECK-DAG: [[cst6:%.*]] = constant dense<4.000000e+00> : tensor + // CHECK: return [[cst0]], [[cst1]], [[cst2]], [[cst3]], [[cst4]], [[cst5]], [[cst6]] + + %7 = "tfl.abs"(%0) : (tensor) -> tensor + %8 = "tfl.sin"(%1) : (tensor) -> tensor + %9 = "tfl.cos"(%2) : (tensor) -> tensor + %10 = "tfl.log"(%3) : (tensor) -> tensor + %11 = "tfl.sqrt"(%4) : (tensor) -> tensor + %12 = "tfl.rsqrt"(%5) : (tensor) -> tensor + %13 = "tfl.square"(%6) : (tensor) -> tensor + + return %7, %8, %9, %10, %11, %12, %13 : tensor, tensor, tensor, tensor, tensor, tensor, tensor +} + // CHECK-LABEL: @mul_int func @mul_int() -> (tensor, tensor<4xi32>, tensor<4xi32>, tensor<4xi32>) { %0 = constant dense<8> : tensor @@ -273,3 +303,179 @@ func @add_dense_dense_float_mixfng_1_n() -> tensor<2x2xf32> { // CHECK: %0 = "tfl.add" // CHECK: return %0 } + +// CHECK-LABEL: @rank +func @rank() -> tensor<1xi32> { + %cst = constant dense<[[1], [2]]> : tensor<2x1xi32> + + // CHECK: [[cst:%.*]] = constant dense<2> : tensor<1xi32> + // CHECK: return [[cst]] + %0 = "tfl.rank"(%cst) : 
(tensor<2x1xi32>) -> tensor<1xi32> + return %0 : tensor<1xi32> +} + +// CHECK-LABEL: @rank_input_known_rank +func @rank_input_known_rank(%arg0 : tensor<2x1xi32>) -> tensor<1xi32> { + // CHECK: [[cst:%.*]] = constant dense<2> : tensor<1xi32> + // CHECK: return [[cst]] + %0 = "tfl.rank"(%arg0) : (tensor<2x1xi32>) -> tensor<1xi32> + return %0 : tensor<1xi32> +} + +// CHECK-LABEL: @reshape +func @reshape() -> tensor<1x2xi32> { + %cst = constant dense<[1, 2]> : tensor<2xi32> + + // CHECK: [[cst:%.*]] = constant dense<{{\[\[}}1, 2]]> : tensor<1x2xi32> + // CHECK: return [[cst]] + %0 = "tfl.reshape"(%cst) : (tensor<2xi32>) -> tensor<1x2xi32> + return %0 : tensor<1x2xi32> +} +// CHECK-LABEL: @pseudo_const +func @pseudo_const() -> tensor { + // CHECK: [[cst:%.*]] = constant dense<1> : tensor + // CHECK: return [[cst]] + %0 = "tfl.pseudo_const"() {value = dense<1> : tensor} : () -> tensor + return %0 : tensor +} + + +// CHECK-LABEL: @range_int +func @range_int() -> tensor { + %cst = constant dense<0> : tensor + %cst_1 = constant dense<4> : tensor + %cst_2 = constant dense<1> : tensor + + // CHECK: [[cst:%.*]] = "tfl.pseudo_const"() {value = dense<[0, 1, 2, 3]> : tensor<4xi32>} : () -> tensor + // CHECK: return [[cst]] + %0 = "tfl.range"(%cst, %cst_1, %cst_2) : (tensor, tensor, tensor) -> tensor + return %0 : tensor +} + +// CHECK-LABEL: @range_float +func @range_float() -> tensor { + %cst = constant dense<0.0> : tensor + %cst_1 = constant dense<4.0> : tensor + %cst_2 = constant dense<1.0> : tensor + + // CHECK: [[cst:%.*]] = "tfl.pseudo_const"() {value = dense<[0.000000e+00, 1.000000e+00, 2.000000e+00, 3.000000e+00]> : tensor<4xf32>} : () -> tensor + // CHECK: return [[cst]] + %0 = "tfl.range"(%cst, %cst_1, %cst_2) : (tensor, tensor, tensor) -> tensor + return %0 : tensor +} + + +// CHECK-LABEL: @range_float_neg_delta +func @range_float_neg_delta() -> tensor { + %cst = constant dense<0.0> : tensor + %cst_1 = constant dense<-4.0> : tensor + %cst_2 = constant dense<-1.0> : tensor + + // CHECK: [[cst:%.*]] = "tfl.pseudo_const"() {value = dense<[0.000000e+00, -1.000000e+00, -2.000000e+00, -3.000000e+00]> : tensor<4xf32>} : () -> tensor + // CHECK: return [[cst]] + %0 = "tfl.range"(%cst, %cst_1, %cst_2) : (tensor, tensor, tensor) -> tensor + return %0 : tensor +} + +// CHECK-LABEL: @range_float_nonzero_base +func @range_float_nonzero_base() -> tensor { + %cst = constant dense<2.0> : tensor + %cst_1 = constant dense<7.0> : tensor + %cst_2 = constant dense<1.5> : tensor + + // CHECK: [[cst:%.*]] = "tfl.pseudo_const"() {value = dense<[2.000000e+00, 3.500000e+00, 5.000000e+00, 6.500000e+00]> : tensor<4xf32>} : () -> tensor + // CHECK: return [[cst]] + %0 = "tfl.range"(%cst, %cst_1, %cst_2) : (tensor, tensor, tensor) -> tensor + return %0 : tensor +} + +// CHECK-LABEL: @transpose_no_fold +func @transpose_no_fold(%arg0 : tensor<2xi32>) -> tensor<2x2xi32> { + %cst = constant dense<[[0, 1], [2, 3]]> : tensor<2x2xi32> + + // CHECK: tfl.transpose + %0 = "tfl.transpose"(%cst, %arg0) : (tensor<2x2xi32>, tensor<2xi32>) -> tensor<2x2xi32> + return %0 : tensor<2x2xi32> +} + +// CHECK-LABEL: @transpose_1d +// Basic 1D identity +func @transpose_1d() -> tensor<3xi32> { + %cst = constant dense<[1, 2, 3]> : tensor<3xi32> + %cst_perm = constant dense<0> : tensor<1xi32> + + // CHECK: [[cst:%.*]] = constant dense<{{\[}}1, 2, 3]> : tensor<3xi32> + // CHECK: return [[cst]] + %0 = "tfl.transpose"(%cst, %cst_perm) : (tensor<3xi32>, tensor<1xi32>) -> tensor<3xi32> + return %0 : tensor<3xi32> +} + +// CHECK-LABEL: @transpose_dynamic 
+func @transpose_dynamic() -> tensor { + %cst = constant dense<[1, 2, 3]> : tensor<3xi32> + %cst_perm = constant dense<0> : tensor<1xi32> + + // CHECK: [[cst:%.*]] = "tfl.pseudo_const"() {value = dense<{{\[}}1, 2, 3]> : tensor<3xi32>} : () -> tensor + // CHECK: return [[cst]] + %0 = "tfl.transpose"(%cst, %cst_perm) : (tensor<3xi32>, tensor<1xi32>) -> tensor + return %0 : tensor +} + +// CHECK-LABEL: @transpose_2d +func @transpose_2d() -> tensor<2x2xi32> { + %cst = constant dense<[[0, 1], [2, 3]]> : tensor<2x2xi32> + %cst_perm = constant dense<[1, 0]> : tensor<2xi32> + + // CHECK: [[cst:%.*]] = constant dense<{{\[\[}}0, 2], {{\[}}1, 3]]> : tensor<2x2xi32> + // CHECK: return [[cst]] + %0 = "tfl.transpose"(%cst, %cst_perm) : (tensor<2x2xi32>, tensor<2xi32>) -> tensor<2x2xi32> + return %0 : tensor<2x2xi32> +} + +// CHECK-LABEL: @transpose_2d_identity +func @transpose_2d_identity() -> tensor<2x2xi32> { + %cst = constant dense<[[0, 1], [2, 3]]> : tensor<2x2xi32> + %cst_perm = constant dense<[0, 1]> : tensor<2xi32> + + // CHECK: [[cst:%.*]] = constant dense<{{\[\[}}0, 1], {{\[}}2, 3]]> : tensor<2x2xi32> + // CHECK: return [[cst]] + %0 = "tfl.transpose"(%cst, %cst_perm) : (tensor<2x2xi32>, tensor<2xi32>) -> tensor<2x2xi32> + return %0 : tensor<2x2xi32> +} + +// CHECK-LABEL: @transpose_3d +// A test case adopted from TransposeTest.Test3DInputConstTensor in +// tensorflow/lite/kernels/transpose_test.cc +func @transpose_3d() -> tensor<4x2x3xi32> { + %cst = constant dense<[[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]], [[12, 13, 14, 15], [16, 17, 18, 19], [20, 21, 22, 23]]]> : tensor<2x3x4xi32> + %cst_perm = constant dense<[2, 0, 1]> : tensor<3xi32> + + // CHECK: [[cst:%.*]] = constant dense<{{\[\[\[}}0, 4, 8], {{\[}}12, 16, 20]], {{\[\[}}1, 5, 9], {{\[}}13, 17, 21]], {{\[\[}}2, 6, 10], {{\[}}14, 18, 22]], {{\[\[}}3, 7, 11], {{\[}}15, 19, 23]]]> : tensor<4x2x3xi32> + // CHECK: return [[cst]] + %0 = "tfl.transpose"(%cst, %cst_perm) : (tensor<2x3x4xi32>, tensor<3xi32>) -> tensor<4x2x3xi32> + return %0 : tensor<4x2x3xi32> +} + +// CHECK-LABEL: @ConstantFoldBinaryOpDynamicOutput +func @ConstantFoldBinaryOpDynamicOutput() -> tensor { + %cst = constant dense<10> : tensor + %cst_0 = "tfl.pseudo_const"() {value = dense<[5, 10]> : tensor<2xi32>} : () -> tensor + %87 = "tfl.sub"(%cst_0, %cst) {fused_activation_function = "NONE"} : (tensor, tensor) -> tensor + return %87 : tensor + + // CHECK: [[cst:%.*]] = "tfl.pseudo_const"() {value = dense<[-5, 0]> : tensor<2xi32>} : () -> tensor + // CHECK: return [[cst]] +} + +// CHECK-LABEL: @add_dense_dense_int_same_shape_dynamic +func @add_dense_dense_int_same_shape_dynamic() -> tensor { + %0 = constant dense<[15, 23, -44, -2]> : tensor<4xi32> + %1 = constant dense<[-10, -1, 42, 100]> : tensor<4xi32> + + %2 = "tfl.add"(%0, %1) {fused_activation_function = "NONE"} : (tensor<4xi32>, tensor<4xi32>) -> tensor + + return %2 : tensor + + // CHECK: [[cst:%.*]] = "tfl.pseudo_const"() {value = dense<[5, 22, -2, 98]> : tensor<4xi32>} : () -> tensor + // CHECK: return [[cst]] +} diff --git a/tensorflow/compiler/mlir/lite/tests/debuginfo/v1_1.0_224_frozen.wrong_attr.line.part.pbtxt b/tensorflow/compiler/mlir/lite/tests/debuginfo/v1_1.0_224_frozen.wrong_attr.line.part.pbtxt index 1bf0b075baf..c1bb797ebee 100644 --- a/tensorflow/compiler/mlir/lite/tests/debuginfo/v1_1.0_224_frozen.wrong_attr.line.part.pbtxt +++ b/tensorflow/compiler/mlir/lite/tests/debuginfo/v1_1.0_224_frozen.wrong_attr.line.part.pbtxt @@ -1,4 +1,4 @@ -# RUN: tf_tfl_translate -mlir-pretty-debuginfo 
-tf-input-arrays=input -tf-input-data-types=DT_FLOAT -tf-input-shapes=1,224,224,3 -tf-output-arrays=MobilenetV1/MobilenetV1/Conv2d_0/BatchNorm/FusedBatchNorm -tf-debug-info=%s.debug %s -o - 2>&1 | FileCheck %s; test ${PIPESTATUS[1]} -eq 0 +# RUN: tf_tfl_translate -mlir-pretty-debuginfo -tf-input-arrays=input -tf-input-data-types=DT_FLOAT -tf-input-shapes=1,224,224,3 -tf-output-arrays=MobilenetV1/MobilenetV1/Conv2d_0/BatchNorm/FusedBatchNorm -tf-debug-info=%s.debug %s -o - 2>&1 | FileCheck %s; test ${PIPESTATUS[0]} -ne 0 # CHECK: fake/user/code/file_C.py:27:1: error: 'tf.Conv2D' op attribute 'data_format' failed to satisfy constraint: 'NHWC' or 'NCHW' convnet data format diff --git a/tensorflow/compiler/mlir/lite/tests/debuginfo/v1_1.0_224_frozen.wrong_attr.stack.part.pbtxt b/tensorflow/compiler/mlir/lite/tests/debuginfo/v1_1.0_224_frozen.wrong_attr.stack.part.pbtxt index edad75c4fc2..d3dcbc65719 100644 --- a/tensorflow/compiler/mlir/lite/tests/debuginfo/v1_1.0_224_frozen.wrong_attr.stack.part.pbtxt +++ b/tensorflow/compiler/mlir/lite/tests/debuginfo/v1_1.0_224_frozen.wrong_attr.stack.part.pbtxt @@ -1,4 +1,4 @@ -# RUN: tf_tfl_translate -mlir-pretty-debuginfo -tf-input-arrays=input -tf-input-data-types=DT_FLOAT -tf-input-shapes=1,224,224,3 -tf-output-arrays=MobilenetV1/MobilenetV1/Conv2d_0/BatchNorm/FusedBatchNorm -tf-debug-info=%s.debug %s -o - 2>&1 | FileCheck %s; test ${PIPESTATUS[1]} -eq 0 +# RUN: tf_tfl_translate -mlir-pretty-debuginfo -tf-input-arrays=input -tf-input-data-types=DT_FLOAT -tf-input-shapes=1,224,224,3 -tf-output-arrays=MobilenetV1/MobilenetV1/Conv2d_0/BatchNorm/FusedBatchNorm -tf-debug-info=%s.debug %s -o - 2>&1 | FileCheck %s; test ${PIPESTATUS[0]} -ne 0 # CHECK: fake/user/code/file_C.py:27:1: error: 'tf.Conv2D' op attribute 'data_format' failed to satisfy constraint: 'NHWC' or 'NCHW' convnet data format # CHECK: fake/user/code/file_D.py:28:1: note: called from diff --git a/tensorflow/compiler/mlir/lite/tests/extract-ophint.mlir b/tensorflow/compiler/mlir/lite/tests/extract-ophint.mlir new file mode 100644 index 00000000000..5cbcb1e1cb8 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/tests/extract-ophint.mlir @@ -0,0 +1,155 @@ +// RUN: tf-opt -tfl-extract-ophint %s -split-input-file -verify-diagnostics | FileCheck %s + +// CHECK-LABEL: extractSimpleOphint +func @extractSimpleOphint() { +// CHECK: %[[OP_HINT_CALL:[0-9]*]] = call @d4b1eb00b81211e99426dc4a3e957995(%0) : (tensor<1x16x16x1xf32>) -> tensor<1x16x16x1xf32> +// CHECK: %[[OUTPUT:[0-9]*]] = "tf.Identity"(%[[OP_HINT_CALL]]) {T = "tfdtype$DT_FLOAT", _tflite_function_name = "cool_activation", _tflite_function_output_index = 0 : i64, _tflite_function_uuid = "d4b1eb00b81211e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "OutputHint-cool_activation-d4b1eb00b81211e99426dc4a3e957995-0-None-None"} : (tensor<1x16x16x1xf32>) -> tensor<1x16x16x1xf32> + + %0 = "tf.Placeholder"() {dtype = "tfdtype$DT_FLOAT", name = "Placeholder", shape = "tfshape$dim { size: 1 } dim { size: 16 } dim { size: 16 } dim { size: 1 }"} : () -> tensor<1x16x16x1xf32> + %1 = "tf.Identity"(%0) {T = "tfdtype$DT_FLOAT", _tflite_function_input_index = 0 : i64, _tflite_function_name = "cool_activation", _tflite_function_uuid = "d4b1eb00b81211e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "InputHint-cool_activation-d4b1eb00b81211e99426dc4a3e957995-0-None-None"} : (tensor<1x16x16x1xf32>) -> tensor<1x16x16x1xf32> + %2 = "tf.Sigmoid"(%1) {T = "tfdtype$DT_FLOAT", name = "Sigmoid"} : (tensor<1x16x16x1xf32>) -> tensor<1x16x16x1xf32> + %3 = 
"tf.Mul"(%2, %1) {T = "tfdtype$DT_FLOAT", name = "mul"} : (tensor<1x16x16x1xf32>, tensor<1x16x16x1xf32>) -> tensor<1x16x16x1xf32> + %4 = "tf.Identity"(%3) {T = "tfdtype$DT_FLOAT", _tflite_function_name = "cool_activation", _tflite_function_output_index = 0 : i64, _tflite_function_uuid = "d4b1eb00b81211e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "OutputHint-cool_activation-d4b1eb00b81211e99426dc4a3e957995-0-None-None"} : (tensor<1x16x16x1xf32>) -> tensor<1x16x16x1xf32> + return +} + +// CHECK-LABEL: extractPackedInputOphint +func @extractPackedInputOphint() { +// CHECK: %[[PACK:[0-9]*]] = "tfl.pack"(%0, %1) {axis = 0 : i32, values_count = 2 : i32} : (tensor<1x16x1xf32>, tensor<1x16x1xf32>) -> tensor<2x1x16x1xf32> +// CHECK: %[[OP_HINT_CALL:[0-9]*]] = call @47393154b9af11e99426dc4a3e957995(%[[PACK]]) : (tensor<2x1x16x1xf32>) -> tensor<1x16x1xf32> +// CHECK: %[[OUTPUT:[0-9]*]] = "tf.Identity"(%[[OP_HINT_CALL]]) {T = "tfdtype$DT_FLOAT", _tflite_function_name = "cool_activation_stack", _tflite_function_output_index = 0 : i64, _tflite_function_uuid = "47393154b9af11e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "OutputHint-cool_activation_stack-47393154b9af11e99426dc4a3e957995-0-None-None"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> + + %0 = "tf.Placeholder"() {dtype = "tfdtype$DT_FLOAT", name = "Placeholder", shape = "tfshape$dim { size: 1 } dim { size: 16 } dim { size: 1 }"} : () -> tensor<1x16x1xf32> + %1 = "tf.Identity"(%0) {T = "tfdtype$DT_FLOAT", _tflite_function_aggregate = "stack", _tflite_function_input_index = 0 : i64, _tflite_function_name = "cool_activation_stack", _tflite_function_sort_index = 0 : i64, _tflite_function_uuid = "47393154b9af11e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "InputHint-cool_activation_stack-47393154b9af11e99426dc4a3e957995-0-0-None"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> + %2 = "tf.Sigmoid"(%1) {T = "tfdtype$DT_FLOAT", name = "Sigmoid"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> + %3 = "tf.Placeholder"() {dtype = "tfdtype$DT_FLOAT", name = "Placeholder_1", shape = "tfshape$dim { size: 1 } dim { size: 16 } dim { size: 1 }"} : () -> tensor<1x16x1xf32> + %4 = "tf.Identity"(%3) {T = "tfdtype$DT_FLOAT", _tflite_function_aggregate = "stack", _tflite_function_input_index = 0 : i64, _tflite_function_name = "cool_activation_stack", _tflite_function_sort_index = 1 : i64, _tflite_function_uuid = "47393154b9af11e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "InputHint-cool_activation_stack-47393154b9af11e99426dc4a3e957995-0-1-None"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> + %5 = "tf.Sigmoid"(%4) {T = "tfdtype$DT_FLOAT", name = "Sigmoid_1"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> + %6 = "tf.Mul"(%2, %5) {T = "tfdtype$DT_FLOAT", name = "mul"} : (tensor<1x16x1xf32>, tensor<1x16x1xf32>) -> tensor<1x16x1xf32> + %7 = "tf.Identity"(%6) {T = "tfdtype$DT_FLOAT", _tflite_function_name = "cool_activation_stack", _tflite_function_output_index = 0 : i64, _tflite_function_uuid = "47393154b9af11e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "OutputHint-cool_activation_stack-47393154b9af11e99426dc4a3e957995-0-None-None"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> + return +} + +// CHECK-LABEL: extractFirstInputOphint +func @extractFirstInputOphint() { +// CHECK: %[[OP_HINT_CALL:[0-9]*]] = call @b703f0f4b9ec11e99426dc4a3e957995(%0) : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> +// CHECK: %[[OUTPUT:[0-9]*]] = "tf.Identity"(%[[OP_HINT_CALL]]) {T = "tfdtype$DT_FLOAT", _tflite_function_name = 
"cool_activation_first", _tflite_function_output_index = 0 : i64, _tflite_function_uuid = "b703f0f4b9ec11e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "OutputHint-cool_activation_first-b703f0f4b9ec11e99426dc4a3e957995-0-None-None"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> + + %0 = "tf.Placeholder"() {dtype = "tfdtype$DT_FLOAT", name = "Placeholder", shape = "tfshape$dim { size: 1 } dim { size: 16 } dim { size: 1 }"} : () -> tensor<1x16x1xf32> + %1 = "tf.Identity"(%0) {T = "tfdtype$DT_FLOAT", _tflite_function_aggregate = "first", _tflite_function_input_index = 0 : i64, _tflite_function_name = "cool_activation_first", _tflite_function_sort_index = 0 : i64, _tflite_function_uuid = "b703f0f4b9ec11e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "InputHint-cool_activation_first-b703f0f4b9ec11e99426dc4a3e957995-0-0-None"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> + %2 = "tf.Sigmoid"(%1) {T = "tfdtype$DT_FLOAT", name = "Sigmoid"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> + %3 = "tf.Placeholder"() {dtype = "tfdtype$DT_FLOAT", name = "Placeholder_1", shape = "tfshape$dim { size: 1 } dim { size: 16 } dim { size: 1 }"} : () -> tensor<1x16x1xf32> + %4 = "tf.Identity"(%3) {T = "tfdtype$DT_FLOAT", _tflite_function_aggregate = "first", _tflite_function_input_index = 0 : i64, _tflite_function_name = "cool_activation_first", _tflite_function_sort_index = 1 : i64, _tflite_function_uuid = "b703f0f4b9ec11e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "InputHint-cool_activation_first-b703f0f4b9ec11e99426dc4a3e957995-0-1-None"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> + %5 = "tf.Sigmoid"(%4) {T = "tfdtype$DT_FLOAT", name = "Sigmoid_1"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> + %6 = "tf.Mul"(%2, %5) {T = "tfdtype$DT_FLOAT", name = "mul"} : (tensor<1x16x1xf32>, tensor<1x16x1xf32>) -> tensor<1x16x1xf32> + %7 = "tf.Identity"(%6) {T = "tfdtype$DT_FLOAT", _tflite_function_name = "cool_activation_first", _tflite_function_output_index = 0 : i64, _tflite_function_uuid = "b703f0f4b9ec11e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "OutputHint-cool_activation_first-b703f0f4b9ec11e99426dc4a3e957995-0-None-None"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> + return +} + +// CHECK-LABEL: extractLastInputOphint +func @extractLastInputOphint() { +// CHECK: %[[OP_HINT_CALL:[0-9]*]] = call @e31fcf90b9ed11e99426dc4a3e957995(%1) : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> +// CHECK: %[[OUTPUT:[0-9]*]] = "tf.Identity"(%[[OP_HINT_CALL]]) {T = "tfdtype$DT_FLOAT", _tflite_function_name = "cool_activation_last", _tflite_function_output_index = 0 : i64, _tflite_function_uuid = "e31fcf90b9ed11e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "OutputHint-cool_activation_last-e31fcf90b9ed11e99426dc4a3e957995-0-None-None"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> + + %0 = "tf.Placeholder"() {dtype = "tfdtype$DT_FLOAT", name = "Placeholder", shape = "tfshape$dim { size: 1 } dim { size: 16 } dim { size: 1 }"} : () -> tensor<1x16x1xf32> + %1 = "tf.Identity"(%0) {T = "tfdtype$DT_FLOAT", _tflite_function_aggregate = "last", _tflite_function_input_index = 0 : i64, _tflite_function_name = "cool_activation_last", _tflite_function_sort_index = 0 : i64, _tflite_function_uuid = "e31fcf90b9ed11e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "InputHint-cool_activation_last-e31fcf90b9ed11e99426dc4a3e957995-0-0-None"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> + %2 = "tf.Sigmoid"(%1) {T = "tfdtype$DT_FLOAT", name = "Sigmoid"} : (tensor<1x16x1xf32>) -> 
tensor<1x16x1xf32> + %3 = "tf.Placeholder"() {dtype = "tfdtype$DT_FLOAT", name = "Placeholder_1", shape = "tfshape$dim { size: 1 } dim { size: 16 } dim { size: 1 }"} : () -> tensor<1x16x1xf32> + %4 = "tf.Identity"(%3) {T = "tfdtype$DT_FLOAT", _tflite_function_aggregate = "last", _tflite_function_input_index = 0 : i64, _tflite_function_name = "cool_activation_last", _tflite_function_sort_index = 1 : i64, _tflite_function_uuid = "e31fcf90b9ed11e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "InputHint-cool_activation_last-e31fcf90b9ed11e99426dc4a3e957995-0-1-None"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> + %5 = "tf.Sigmoid"(%4) {T = "tfdtype$DT_FLOAT", name = "Sigmoid_1"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> + %6 = "tf.Mul"(%2, %5) {T = "tfdtype$DT_FLOAT", name = "mul"} : (tensor<1x16x1xf32>, tensor<1x16x1xf32>) -> tensor<1x16x1xf32> + %7 = "tf.Identity"(%6) {T = "tfdtype$DT_FLOAT", _tflite_function_name = "cool_activation_last", _tflite_function_output_index = 0 : i64, _tflite_function_uuid = "e31fcf90b9ed11e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "OutputHint-cool_activation_last-e31fcf90b9ed11e99426dc4a3e957995-0-None-None"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> + return +} + +// CHECK-LABEL: extractPackOneInputOphint +func @extractPackOneInputOphint() { +// CHECK: %[[RESHAPE:[0-9]*]] = "tfl.reshape"(%0) : (tensor<1x16x1xf32>) -> tensor<1x1x16x1xf32> +// CHECK: %[[OP_HINT_CALL:[0-9]*]] = call @33fab028b9ef11e99426dc4a3e957995(%[[RESHAPE]]) : (tensor<1x1x16x1xf32>) -> tensor<1x16x1xf32> +// CHECK: %[[OUTPUT:[0-9]*]] = "tf.Identity"(%[[OP_HINT_CALL]]) {T = "tfdtype$DT_FLOAT", _tflite_function_name = "cool_activation_pack_input_one", _tflite_function_output_index = 0 : i64, _tflite_function_uuid = "33fab028b9ef11e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "OutputHint-cool_activation_pack_input_one-33fab028b9ef11e99426dc4a3e957995-0-None-None"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> + + %0 = "tf.Placeholder"() {dtype = "tfdtype$DT_FLOAT", name = "Placeholder", shape = "tfshape$dim { size: 1 } dim { size: 16 } dim { size: 1 }"} : () -> tensor<1x16x1xf32> + %1 = "tf.Identity"(%0) {T = "tfdtype$DT_FLOAT", _tflite_function_aggregate = "stack", _tflite_function_input_index = 0 : i64, _tflite_function_name = "cool_activation_pack_input_one", _tflite_function_sort_index = 0 : i64, _tflite_function_uuid = "33fab028b9ef11e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "InputHint-cool_activation_pack_input_one-33fab028b9ef11e99426dc4a3e957995-0-0-None"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> + %2 = "tf.Sigmoid"(%1) {T = "tfdtype$DT_FLOAT", name = "Sigmoid"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> + %3 = "tf.Identity"(%2) {T = "tfdtype$DT_FLOAT", _tflite_function_name = "cool_activation_pack_input_one", _tflite_function_output_index = 0 : i64, _tflite_function_uuid = "33fab028b9ef11e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "OutputHint-cool_activation_pack_input_one-33fab028b9ef11e99426dc4a3e957995-0-None-None"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> + return +} + +// CHECK-LABEL: extractStackInputOutputOphint +func @extractStackInputOutputOphint() { +// CHECK: %[[PACK:[0-9]*]] = "tfl.pack"(%0, %1) {axis = 0 : i32, values_count = 2 : i32} : (tensor<1x16x1xf32>, tensor<1x16x1xf32>) -> tensor<2x1x16x1xf32> +// CHECK: %[[OP_HINT_CALL:[0-9]*]] = call @b92ed354b9f011e99426dc4a3e957995(%[[PACK]]) : (tensor<2x1x16x1xf32>) -> tensor<2x1x16x1xf32> +// CHECK: %[[UNPACK:[0-9]*]]:2 = 
"tfl.unpack"(%[[OP_HINT_CALL]]) {axis = 0 : i32, num = 2 : i32} : (tensor<2x1x16x1xf32>) -> (tensor<1x16x1xf32>, tensor<1x16x1xf32>) +// CHECK: %[[OUTPUT1:[0-9]*]] = "tf.Identity"(%[[UNPACK]]#0) {T = "tfdtype$DT_FLOAT", _tflite_function_aggregate = "stack", _tflite_function_name = "cool_activation_stack_input_output", _tflite_function_output_index = 0 : i64, _tflite_function_sort_index = 0 : i64, _tflite_function_uuid = "b92ed354b9f011e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "OutputHint-cool_activation_stack_input_output-b92ed354b9f011e99426dc4a3e957995-0-0-None"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> +// CHECK: %[[OUTPUT2:[0-9]*]] = "tf.Identity"(%[[UNPACK]]#1) {T = "tfdtype$DT_FLOAT", _tflite_function_aggregate = "stack", _tflite_function_name = "cool_activation_stack_input_output", _tflite_function_output_index = 0 : i64, _tflite_function_sort_index = 1 : i64, _tflite_function_uuid = "b92ed354b9f011e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "OutputHint-cool_activation_stack_input_output-b92ed354b9f011e99426dc4a3e957995-0-1-None"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> + + %0 = "tf.Placeholder"() {dtype = "tfdtype$DT_FLOAT", name = "Placeholder", shape = "tfshape$dim { size: 1 } dim { size: 16 } dim { size: 1 }"} : () -> tensor<1x16x1xf32> + %1 = "tf.Identity"(%0) {T = "tfdtype$DT_FLOAT", _tflite_function_aggregate = "stack", _tflite_function_input_index = 0 : i64, _tflite_function_name = "cool_activation_stack_input_output", _tflite_function_sort_index = 0 : i64, _tflite_function_uuid = "b92ed354b9f011e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "InputHint-cool_activation_stack_input_output-b92ed354b9f011e99426dc4a3e957995-0-0-None"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> + %2 = "tf.Sigmoid"(%1) {T = "tfdtype$DT_FLOAT", name = "Sigmoid"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> + %3 = "tf.Placeholder"() {dtype = "tfdtype$DT_FLOAT", name = "Placeholder_1", shape = "tfshape$dim { size: 1 } dim { size: 16 } dim { size: 1 }"} : () -> tensor<1x16x1xf32> + %4 = "tf.Identity"(%3) {T = "tfdtype$DT_FLOAT", _tflite_function_aggregate = "stack", _tflite_function_input_index = 0 : i64, _tflite_function_name = "cool_activation_stack_input_output", _tflite_function_sort_index = 1 : i64, _tflite_function_uuid = "b92ed354b9f011e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "InputHint-cool_activation_stack_input_output-b92ed354b9f011e99426dc4a3e957995-0-1-None"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> + %5 = "tf.Sigmoid"(%4) {T = "tfdtype$DT_FLOAT", name = "Sigmoid_1"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> + %6 = "tf.Mul"(%2, %5) {T = "tfdtype$DT_FLOAT", name = "mul"} : (tensor<1x16x1xf32>, tensor<1x16x1xf32>) -> tensor<1x16x1xf32> + %7 = "tf.Identity"(%6) {T = "tfdtype$DT_FLOAT", _tflite_function_aggregate = "stack", _tflite_function_name = "cool_activation_stack_input_output", _tflite_function_output_index = 0 : i64, _tflite_function_sort_index = 0 : i64, _tflite_function_uuid = "b92ed354b9f011e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "OutputHint-cool_activation_stack_input_output-b92ed354b9f011e99426dc4a3e957995-0-0-None"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> + %8 = "tf.Sigmoid"(%4) {T = "tfdtype$DT_FLOAT", name = "Sigmoid_2"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> + %9 = "tf.Sigmoid"(%4) {T = "tfdtype$DT_FLOAT", name = "Sigmoid_3"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> + %10 = "tf.Add"(%8, %9) {T = "tfdtype$DT_FLOAT", name = "add"} : (tensor<1x16x1xf32>, 
tensor<1x16x1xf32>) -> tensor<1x16x1xf32> + %11 = "tf.Identity"(%10) {T = "tfdtype$DT_FLOAT", _tflite_function_aggregate = "stack", _tflite_function_name = "cool_activation_stack_input_output", _tflite_function_output_index = 0 : i64, _tflite_function_sort_index = 1 : i64, _tflite_function_uuid = "b92ed354b9f011e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "OutputHint-cool_activation_stack_input_output-b92ed354b9f011e99426dc4a3e957995-0-1-None"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> + return +} + +// CHECK-LABEL: extractMultipleInputsOutputsOphint +func @extractMultipleInputsOutputsOphint() { +// CHECK: %[[OP_HINT_CALL:[0-9]*]]:2 = call @a6ca45beb9f411e99426dc4a3e957995(%0, %1) : (tensor<1x16x1xf32>, tensor<1x16x1xf32>) -> (tensor<1x16x1xf32>, tensor<1x16x1xf32>) +// CHECK: %[[OUTPUT1:[0-9]*]] = "tf.Identity"(%[[OP_HINT_CALL]]#0) {T = "tfdtype$DT_FLOAT", _tflite_function_name = "cool_activation_multiple_input_output", _tflite_function_output_index = 0 : i64, _tflite_function_uuid = "a6ca45beb9f411e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "OutputHint-cool_activation_multiple_input_output-a6ca45beb9f411e99426dc4a3e957995-0-None-None"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> +// CHECK: %[[OUTPUT2:[0-9]*]] = "tf.Identity"(%[[OP_HINT_CALL]]#1) {T = "tfdtype$DT_FLOAT", _tflite_function_name = "cool_activation_multiple_input_output", _tflite_function_output_index = 1 : i64, _tflite_function_uuid = "a6ca45beb9f411e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "OutputHint-cool_activation_multiple_input_output-a6ca45beb9f411e99426dc4a3e957995-1-None-None"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> + + %0 = "tf.Placeholder"() {dtype = "tfdtype$DT_FLOAT", name = "Placeholder", shape = "tfshape$dim { size: 1 } dim { size: 16 } dim { size: 1 }"} : () -> tensor<1x16x1xf32> + %1 = "tf.Identity"(%0) {T = "tfdtype$DT_FLOAT", _tflite_function_input_index = 0 : i64, _tflite_function_name = "cool_activation_multiple_input_output", _tflite_function_uuid = "a6ca45beb9f411e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "InputHint-cool_activation_multiple_input_output-a6ca45beb9f411e99426dc4a3e957995-0-None-None"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> + %2 = "tf.Sigmoid"(%1) {T = "tfdtype$DT_FLOAT", name = "Sigmoid"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> + %3 = "tf.Placeholder"() {dtype = "tfdtype$DT_FLOAT", name = "Placeholder_1", shape = "tfshape$dim { size: 1 } dim { size: 16 } dim { size: 1 }"} : () -> tensor<1x16x1xf32> + %4 = "tf.Identity"(%3) {T = "tfdtype$DT_FLOAT", _tflite_function_input_index = 1 : i64, _tflite_function_name = "cool_activation_multiple_input_output", _tflite_function_uuid = "a6ca45beb9f411e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "InputHint-cool_activation_multiple_input_output-a6ca45beb9f411e99426dc4a3e957995-1-None-None"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> + %5 = "tf.Sigmoid"(%4) {T = "tfdtype$DT_FLOAT", name = "Sigmoid_1"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> + %6 = "tf.Mul"(%2, %5) {T = "tfdtype$DT_FLOAT", name = "mul"} : (tensor<1x16x1xf32>, tensor<1x16x1xf32>) -> tensor<1x16x1xf32> + %7 = "tf.Identity"(%6) {T = "tfdtype$DT_FLOAT", _tflite_function_name = "cool_activation_multiple_input_output", _tflite_function_output_index = 0 : i64, _tflite_function_uuid = "a6ca45beb9f411e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "OutputHint-cool_activation_multiple_input_output-a6ca45beb9f411e99426dc4a3e957995-0-None-None"} : (tensor<1x16x1xf32>) -> 
tensor<1x16x1xf32> + %8 = "tf.Sigmoid"(%4) {T = "tfdtype$DT_FLOAT", name = "Sigmoid_2"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> + %9 = "tf.Sigmoid"(%4) {T = "tfdtype$DT_FLOAT", name = "Sigmoid_3"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> + %10 = "tf.Add"(%8, %9) {T = "tfdtype$DT_FLOAT", name = "add"} : (tensor<1x16x1xf32>, tensor<1x16x1xf32>) -> tensor<1x16x1xf32> + %11 = "tf.Identity"(%10) {T = "tfdtype$DT_FLOAT", _tflite_function_name = "cool_activation_multiple_input_output", _tflite_function_output_index = 1 : i64, _tflite_function_uuid = "a6ca45beb9f411e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "OutputHint-cool_activation_multiple_input_output-a6ca45beb9f411e99426dc4a3e957995-1-None-None"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> + return +} + +// CHECK: func @d4b1eb00b81211e99426dc4a3e957995(tensor<1x16x16x1xf32>) -> tensor<1x16x16x1xf32> +// CHECK: attributes {_tflite_function_name = "cool_activation"} +// CHECK: func @47393154b9af11e99426dc4a3e957995(tensor<2x1x16x1xf32>) -> tensor<1x16x1xf32> +// CHECK: attributes {_tflite_function_name = "cool_activation_stack"} +// CHECK: func @b703f0f4b9ec11e99426dc4a3e957995(tensor<1x16x1xf32>) -> tensor<1x16x1xf32> +// CHECK: attributes {_tflite_function_name = "cool_activation_first"} +// CHECK: func @e31fcf90b9ed11e99426dc4a3e957995(tensor<1x16x1xf32>) -> tensor<1x16x1xf32> +// CHECK: attributes {_tflite_function_name = "cool_activation_last"} +// CHECK: func @33fab028b9ef11e99426dc4a3e957995(tensor<1x1x16x1xf32>) -> tensor<1x16x1xf32> +// CHECK: attributes {_tflite_function_name = "cool_activation_pack_input_one"} +// CHECK: func @b92ed354b9f011e99426dc4a3e957995(tensor<2x1x16x1xf32>) -> tensor<2x1x16x1xf32> +// CHECK: attributes {_tflite_function_name = "cool_activation_stack_input_output"} +// CHECK: func @a6ca45beb9f411e99426dc4a3e957995(tensor<1x16x1xf32>, tensor<1x16x1xf32>) -> (tensor<1x16x1xf32>, tensor<1x16x1xf32>) +// CHECK: attributes {_tflite_function_name = "cool_activation_multiple_input_output"} + + +// ----- + +// expected-error@+1 {{Found malformed ophint regions: missing inputs or outputs.}} +module { +func @extractOphintFailure() { + %0 = "tf.Placeholder"() {dtype = "tfdtype$DT_FLOAT", name = "Placeholder", shape = "tfshape$dim { size: 1 } dim { size: 16 } dim { size: 16 } dim { size: 1 }"} : () -> tensor<1x16x16x1xf32> + %1 = call @AnotherFunc(%0) : (tensor<1x16x16x1xf32>) -> tensor<1x16x16x1xf32> + %2 = "tf.Sigmoid"(%1) {T = "tfdtype$DT_FLOAT", name = "Sigmoid"} : (tensor<1x16x16x1xf32>) -> tensor<1x16x16x1xf32> + %3 = "tf.Mul"(%2, %1) {T = "tfdtype$DT_FLOAT", name = "mul"} : (tensor<1x16x16x1xf32>, tensor<1x16x16x1xf32>) -> tensor<1x16x16x1xf32> + %4 = "tf.Identity"(%3) {T = "tfdtype$DT_FLOAT", _tflite_function_name = "cool_activation", _tflite_function_output_index = 0 : i64, _tflite_function_uuid = "d4b1eb00b81211e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "OutputHint-cool_activation-d4b1eb00b81211e99426dc4a3e957995-0-None-None"} : (tensor<1x16x16x1xf32>) -> tensor<1x16x16x1xf32> + return +} + +func @AnotherFunc(%arg0: tensor<1x16x16x1xf32>) -> tensor<1x16x16x1xf32> { + %0 = "tf.Identity"(%arg0) {T = "tfdtype$DT_FLOAT", _tflite_function_input_index = 0 : i64, _tflite_function_name = "cool_activation", _tflite_function_uuid = "d4b1eb00b81211e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "InputHint-cool_activation-d4b1eb00b81211e99426dc4a3e957995-0-None-None"} : (tensor<1x16x16x1xf32>) -> tensor<1x16x16x1xf32> + return %0 : tensor<1x16x16x1xf32> +} +} diff 
--git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/constants.mlir b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/constants.mlir new file mode 100644 index 00000000000..b6231c050b5 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/constants.mlir @@ -0,0 +1,89 @@ +// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_translate --tflite-flatbuffer-to-mlir - -o - | FileCheck --dump-input-on-failure %s +// Ensure constants roundtrip exactly + +func @bool() -> tensor<4xi1> { + // CHECK-LABEL: @bool + // CHECK: value = dense<[false, true, true, false]> : tensor<4xi1> + %0 = "tfl.pseudo_const"() { value = dense<[false, true, true, false]> : tensor<4xi1> } : () -> tensor<4xi1> + return %0 : tensor<4xi1> +} + +func @complex64() -> tensor<4x!tf.complex64> { + // CHECK-LABEL: @complex64 + // CHECK: value = opaque<"tf", "0x746674656E736F722464747970653A2044545F434F4D504C455836342074656E736F725F7368617065207B2064696D207B2073697A653A2034207D207D2074656E736F725F636F6E74656E743A20225C3030305C3030305C3230303F5C3030305C3030305C3230303F5C3030305C3030305C303030405C3030305C3030305C303030405C3030305C30303040405C3030305C30303040405C3030305C3030305C323030405C3030305C3030305C3230304022"> : tensor<4x!tf.complex64> + %0 = "tfl.pseudo_const"() { value = opaque<"tf", "0x746674656E736F722464747970653A2044545F434F4D504C455836342074656E736F725F7368617065207B2064696D207B2073697A653A2034207D207D2074656E736F725F636F6E74656E743A20225C3030305C3030305C3230303F5C3030305C3030305C3230303F5C3030305C3030305C303030405C3030305C3030305C303030405C3030305C30303040405C3030305C30303040405C3030305C3030305C323030405C3030305C3030305C3230304022"> : tensor<4x!tf.complex64> } : () -> tensor<4x!tf.complex64> + return %0 : tensor<4x!tf.complex64> +} + +// TODO(b/138847107) this should work but doesn't +// func @f16() -> tensor<4xf16> { +// %0 = "tfl.pseudo_const"() { value = dense<[1.0, 2.0, 3.0, 4.0]> : tensor<4xf16> } : () -> tensor<4xf16> +// return %0 : tensor<4xf16> +// } + +func @f32() -> tensor<4xf32> { + // CHECK-LABEL: @f32 + // CHECK: value = dense<[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00]> : tensor<4xf32> + %0 = "tfl.pseudo_const"() { value = dense<[1.0, 2.0, 3.0, 4.0]> : tensor<4xf32> } : () -> tensor<4xf32> + return %0 : tensor<4xf32> +} + +func @i8() -> tensor<4xi8> { + // CHECK-LABEL: @i8 + // CHECK: value = dense<[1, 2, 3, 4]> : tensor<4xi8> + %0 = "tfl.pseudo_const" () { value = dense<[1, 2, 3, 4]> : tensor<4xi8> } : () -> tensor<4xi8> + return %0 : tensor<4xi8> +} + +func @i16() -> tensor<4xi16> { + // CHECK-LABEL: @i16 + // CHECK: value = dense<[1, 2, 3, 258]> : tensor<4xi16> + %0 = "tfl.pseudo_const" () { value = dense<[1, 2, 3, 258]> : tensor<4xi16> } : () -> tensor<4xi16> + return %0 : tensor<4xi16> +} + +func @i32() -> tensor<4xi32> { + // CHECK-LABEL: @i32 + // CHECK: value = dense<[1, 2, 3, 16909060]> : tensor<4xi32> + // Check bytes come back in the right order + %0 = "tfl.pseudo_const" () { value = dense<[1, 2, 3, 16909060]> : tensor<4xi32> } : () -> tensor<4xi32> + return %0 : tensor<4xi32> +} + +func @i64() -> tensor<4xi64> { + // CHECK-LABEL: @i64 + // CHECK: value = dense<[1, 2, 3, 72623859790382856]> : tensor<4xi64> + %0 = "tfl.pseudo_const" () { value = dense<[1, 2, 3, 72623859790382856]> : tensor<4xi64> } : () -> tensor<4xi64> + return %0 : tensor<4xi64> +} + +// TODO(krzysd) Add a test for strings. 
This isn't too urgent, since they use +// the same sort of opaque round-trip we get for complex64, but it might be good +// to check + +func @uint8() -> tensor<4x!tf.uint8> { + // CHECK-LABEL: @uint8 + // CHECK: value = opaque<"tf", "0x746674656E736F722464747970653A2044545F55494E54382074656E736F725F7368617065207B2064696D207B2073697A653A2034207D207D2074656E736F725F636F6E74656E743A20225C3333365C3235355C3237365C33353722"> : tensor<4x!tf.uint8> + %0 = "tfl.pseudo_const"() { value = opaque<"tf", "0x746674656E736F722464747970653A2044545F55494E54382074656E736F725F7368617065207B2064696D207B2073697A653A2034207D207D2074656E736F725F636F6E74656E743A20225C3333365C3235355C3237365C33353722"> : tensor<4x!tf.uint8> } : () -> tensor<4x!tf.uint8> + return %0 : tensor<4x!tf.uint8> +} + +func @qi32_per_axis() -> tensor<3x3x!quant.uniform> { + // CHECK-LABEL: @qi32_per_axis + // CHECK: {qtype = tensor<3x3x!quant.uniform>, value = dense<1> : tensor<3x3xi32>} : () -> tensor<3x3x!quant.uniform> + %0 = "tfl.pseudo_qconst"() { qtype = tensor<3x3x!quant.uniform>, value = dense<1> : tensor<3x3xi32>} : () -> tensor<3x3x!quant.uniform> + return %0 : tensor<3x3x!quant.uniform> +} + +func @qu8() -> tensor<3x!quant.uniform:f32, 1.0>> { + // CHECK-LABEL: @qu8 + // CHECK: {qtype = tensor<3x!quant.uniform:f32, 1.000000e+00>>, value = dense<1> : tensor<3xi8>} : () -> tensor<3x!quant.uniform:f32, 1.000000e+00>> + %0 = "tfl.pseudo_qconst"() { qtype = tensor<3x!quant.uniform:f32, 1.0>>, value = dense<1> : tensor<3xi8>} : () -> tensor<3x!quant.uniform:f32, 1.0>> + return %0 : tensor<3x!quant.uniform:f32, 1.0>> +} + +// Identity function to make the exporter happy +func @main(%arg0: tensor<4xi8>) -> tensor<4xi8> { + %0 = "tfl.pseudo_input"(%arg0) : (tensor<4xi8>) -> tensor<4xi8> + return %0 : tensor<4xi8> +} diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/if_op.mlir b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/if_op.mlir new file mode 100644 index 00000000000..3f3cad12b61 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/if_op.mlir @@ -0,0 +1,20 @@ +// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_translate --tflite-flatbuffer-to-mlir - -o - | FileCheck --dump-input-on-failure %s +// Confirm function references in if ops are preserved +func @main(%arg0: tensor<1xf32>, %arg1: tensor<1xf32>) -> tensor<1xf32> { +// CHECK: %{{.*}} = "tf.If"(%{{.*}}, %{{.*}}, %{{.*}}) {else_branch = @cond_false, is_stateless = false, then_branch = @cond_true} : (tensor<1xi1>, tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> + %0 = "tfl.pseudo_input"(%arg0) : (tensor<1xf32>) -> tensor<1xf32> + %1 = "tfl.pseudo_input"(%arg1) : (tensor<1xf32>) -> tensor<1xf32> + %2 = "tfl.less"(%0, %1) : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xi1> + %3 = "tf.If"(%2, %0, %1) {else_branch = @cond_false, then_branch = @cond_true, is_stateless = false} : (tensor<1xi1>, tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> + return %3 : tensor<1xf32> +} + +func @cond_true(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>) -> tensor<*xf32> { + %0 = tfl.add %arg0, %arg1 {fused_activation_function = "NONE"} : tensor<*xf32> + return %0 : tensor<*xf32> +} + +func @cond_false(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>) -> tensor<*xf32> { + %0 = tfl.mul %arg0, %arg1 {fused_activation_function = "NONE"} : tensor<*xf32> + return %0 : tensor<*xf32> +} diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/many_attribute_op.mlir b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/many_attribute_op.mlir new file 
mode 100644 index 00000000000..4cfa8e39969 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/many_attribute_op.mlir @@ -0,0 +1,10 @@ +// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_translate --tflite-flatbuffer-to-mlir - -o - | FileCheck --dump-input-on-failure %s + +// Confirm a wide array of attributes survives the round-trip +func @main(tensor<1x6x6x16xf32>) -> tensor<1x1x1x16xf32> { +^bb0(%arg0: tensor<1x6x6x16xf32>): + // CHECK: "tfl.average_pool_2d"(%{{.*}}) {filter_height = 3 : i32, filter_width = 6 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 3 : i32, stride_w = 1 : i32} : (tensor<1x6x6x16xf32>) -> tensor<1x1x1x16xf32> + %0 = "tfl.pseudo_input"(%arg0) : (tensor<1x6x6x16xf32>) -> tensor<1x6x6x16xf32> loc("Input") + %1 = "tfl.average_pool_2d"(%0) {filter_height = 3 : i32, filter_width = 6 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 3 : i32, stride_w = 1 : i32} : (tensor<1x6x6x16xf32>) -> tensor<1x1x1x16xf32> loc("avgpool") + return %1 : tensor<1x1x1x16xf32> +} diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/math.mlir b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/math.mlir index a92e985c668..c9528aed3e2 100644 --- a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/math.mlir +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/math.mlir @@ -1,13 +1,16 @@ // RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_translate --tflite-flatbuffer-to-mlir - -o - | FileCheck --dump-input-on-failure %s +// Confirm float constants and operators survive a roundtrip func @main(tensor<4xf32>) -> tensor<4xf32> { ^bb0(%arg0: tensor<4xf32>): - // CHECK: func @main(%arg0: tensor<4xf32>) - // CHECK-NEXT: return - // CHECK-NEXT: } - + // CHECK: [[INPUT:%.*]] = "tfl.pseudo_input"(%arg0) : (tensor<4xf32>) -> tensor<4xf32> + // CHECK-NEXT: [[CONST:%.*]] = "tfl.pseudo_const"() {value = dense<1.000000e+00> : tensor<4xf32>} : () -> tensor<4xf32> + // CHECK-NEXT: [[SQDIFF:%.*]] = tfl.squared_difference [[INPUT]], [[CONST]] : tensor<4xf32> + // CHECK-NEXT: %{{.*}} = tfl.mul [[INPUT]], [[SQDIFF]] {fused_activation_function = "NONE"} : tensor<4xf32> %0 = "tfl.pseudo_input" (%arg0) : (tensor<4xf32>) -> tensor<4xf32> loc("Input") %1 = "tfl.pseudo_const" () {value = dense<1.0> : tensor<4xf32>} : () -> tensor<4xf32> loc("Const") + // Confirm that attributes that cannot be stored in the flatbuffer options + // for a given operator are dropped silently.
%2 = "tfl.squared_difference"(%0, %1) {fused_activation_function = "NONE"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> loc("squared_difference") %3 = "tfl.mul"(%0, %2) {fused_activation_function = "NONE"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> loc("mul") %4 = "tfl.div"(%3, %2) {fused_activation_function = "NONE"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> loc("div") diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/optional.mlir b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/optional.mlir new file mode 100644 index 00000000000..ce62aa381f1 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/optional.mlir @@ -0,0 +1,13 @@ +// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_translate --tflite-flatbuffer-to-mlir - -o - | FileCheck --dump-input-on-failure %s +// Test to make sure optional parameters survive a roundtrip + +func @main(%arg0: tensor<40x37xf32>, %arg1: tensor<40x37xf32>) -> tensor<40x40xf32> { +// CHECK: [[NONE:%.*]] = constant unit +// CHECK: "tfl.fully_connected"(%{{.()}}, %{{.*}}, [[NONE]]) +// CHECK-SAME: (tensor<40x37xf32>, tensor<40x37xf32>, none) -> (tensor<40x40xf32>, tensor<40x40xf32>) + %cst = constant unit + %0 = "tfl.pseudo_input"(%arg0) : (tensor<40x37xf32>) -> tensor<40x37xf32> loc("Input") + %1 = "tfl.pseudo_input"(%arg1) : (tensor<40x37xf32>) -> tensor<40x37xf32> loc("Input") + %2:2 = "tfl.fully_connected"(%0, %1, %cst) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<40x37xf32>, tensor<40x37xf32>, none) -> (tensor<40x40xf32>, tensor<40x40xf32>) + return %2 : tensor<40x40xf32> +} diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/quantization.mlir b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/quantization.mlir new file mode 100644 index 00000000000..18e2888dfcd --- /dev/null +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/quantization.mlir @@ -0,0 +1,19 @@ +// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_translate --tflite-flatbuffer-to-mlir - -o - | FileCheck --dump-input-on-failure %s + +func @main(%arg0: tensor<1x224x224x3xf32>) -> tensor<1x1001xf32> { +// CHECK: %{{.*}} = "tfl.quantize"(%{{.*}}) {qtype = tensor<1x224x224x3x!quant.uniform>} : (tensor<1x224x224x3xf32>) -> tensor<1x224x224x3x!quant.uniform> +// The float values here doesn't match exactly because double -> float -> double is lossy +// CHECK-NEXT: %{{.*}} = "tfl.pseudo_qconst"() {qtype = tensor<32x3x3x3x!quant.uniform:f32, 0.021826678{{[0-9]*}}:151>>, value = dense<-76> : tensor<32x3x3x3xi8>} : () -> tensor<32x3x3x3x!quant.uniform:f32, 0.021826678{{[0-9]*}}:151>> +// CHECK-NEXT: %{{.*}} = "tfl.pseudo_qconst"() {qtype = tensor<32x!quant.uniform>, value = dense<0> : tensor<32xi32>} : () -> tensor<32x!quant.uniform> +// CHECK: %{{.*}} = "tfl.dequantize"(%{{.*}}) : (tensor<1x1001x!quant.uniform>) -> tensor<1x1001xf32> + + %0 = "tfl.pseudo_input"(%arg0) : (tensor<1x224x224x3xf32>) -> tensor<1x224x224x3xf32> + %1 = "tfl.quantize"(%0) {qtype = tensor<1x224x224x3x!quant.uniform>} : (tensor<1x224x224x3xf32>) -> tensor<1x224x224x3x!quant.uniform> + %2 = "tfl.pseudo_qconst"() {qtype = tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216:151>>, value = dense<-76> : tensor<32x3x3x3xi8>} : () -> tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216:151>> + %3 = "tfl.pseudo_qconst"() {qtype = tensor<32x!quant.uniform>, value = dense<0> : tensor<32xi32>} : () -> tensor<32x!quant.uniform> + %4 = 
"tfl.conv_2d"(%1, %2, %3) {dilation_h_factor = 2 : i32, dilation_w_factor = 3 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 4 : i32, stride_w = 5 : i32} : (tensor<1x224x224x3x!quant.uniform>, tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216:151>>, tensor<32x!quant.uniform>) -> tensor<1x112x112x32x!quant.uniform> + %5 = "tfl.reshape"(%4) : (tensor<1x112x112x32x!quant.uniform>) -> tensor<1x1001x!quant.uniform> + %6 = "tfl.softmax"(%5) {beta = 1.000000e+00 : f32} : (tensor<1x1001x!quant.uniform>) -> tensor<1x1001x!quant.uniform> + %7 = "tfl.dequantize"(%6) : (tensor<1x1001x!quant.uniform>) -> tensor<1x1001xf32> + return %7 : tensor<1x1001xf32> +} diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/reshape.mlir b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/reshape.mlir new file mode 100644 index 00000000000..85596169508 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/reshape.mlir @@ -0,0 +1,9 @@ +// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_translate --tflite-flatbuffer-to-mlir - -o - | FileCheck --dump-input-on-failure %s +// Confirm we can extract type info from reshape + +func @main() -> tensor<2x2xf32> { + // CHECK: %{{.*}} = "tfl.reshape"(%{{.*}}) : (tensor<4xf32>) -> tensor<2x2xf32> + %0 = "tfl.pseudo_const" () {value = dense<1.0> : tensor<4xf32>} : () -> tensor<4xf32> loc("Const") + %1 = "tfl.reshape" (%0) : (tensor<4xf32>) -> tensor<2x2xf32> loc("reshape") + return %1 : tensor<2x2xf32> +} diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/simple.mlir b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/simple.mlir index 600c7a02ed5..714027d67d1 100644 --- a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/simple.mlir +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/simple.mlir @@ -1,10 +1,17 @@ // RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_translate --tflite-flatbuffer-to-mlir - -o - | FileCheck --dump-input-on-failure %s +// Check a few basic properties of the import-export, +// including constants retaining their shape +// and the module including the TFLite version. 
func @main(tensor<3x2xi32>) -> tensor<3x2xi32> { ^bb0(%arg0: tensor<3x2xi32>): - // CHECK: func @main(%arg0: tensor<3x2xi32>) { - // CHECK-NEXT: return - // CHECK-NEXT: } + // CHECK: module attributes {tfl.description = "MLIR Converted.", tfl.schema_version = 3 : i32} + + // CHECK: %{{.*}} = "tfl.pseudo_const"() {value = dense<{{\[\[1, 2\], \[3, 4\], \[5, 6\]\]}}> : tensor<3x2xi32>} + // CHECK-NEXT: [[SUB:%.*]] = tfl.sub %{{.*}}, %{{.*}} {fused_activation_function = "RELU6"} : tensor<3x2xi32> + // CHECK-NEXT: [[SCALAR:%.*]] = "tfl.pseudo_const"() {value = dense<10> : tensor} : () -> tensor + // CHECK-NEXT: [[ADD:%.*]] = "tfl.add"([[SCALAR]], [[SUB]]) {fused_activation_function = "NONE"} : (tensor, tensor<3x2xi32>) -> tensor<3x2xi32> + // CHECK-NEXT: return [[ADD]] : tensor<3x2xi32> %0 = "tfl.pseudo_input" (%arg0) : (tensor<3x2xi32>) -> tensor<3x2xi32> loc("Input") %1 = "tfl.pseudo_const" () {value = dense<[[1, 2], [3, 4], [5, 6]]> : tensor<3x2xi32>} : () -> tensor<3x2xi32> loc("Const") diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/while_op.mlir b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/while_op.mlir new file mode 100644 index 00000000000..141423f9231 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/while_op.mlir @@ -0,0 +1,27 @@ +// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_translate --tflite-flatbuffer-to-mlir - -o - | FileCheck --dump-input-on-failure %s +// Check to see if function references in while loops are preserved +func @main(%arg0: tensor, %arg1: tensor<1xf32>) -> tensor<1xf32> { +// TODO(b/138222071) Expect first output to be a scalar +// CHECK: %{{.*}}:2 = "tf.While"(%{{.*}}, %{{.*}}) {body = @body, cond = @cond, is_stateless = false} : (tensor, tensor<1xf32>) -> (tensor<*xi32>, tensor<1xf32>) + %0 = "tfl.pseudo_input"(%arg0) : (tensor) -> tensor + %1 = "tfl.pseudo_input"(%arg1) : (tensor<1xf32>) -> tensor<1xf32> + + // While %0 is greater than zero, element-wise add %1 to itself. 
+ %2:2 = "tf.While"(%0, %1) { + cond = @cond, body = @body, is_stateless = false + } : (tensor, tensor<1xf32>) -> (tensor, tensor<1xf32>) + return %2#1 : tensor<1xf32> +} + +func @cond(%arg0: tensor<*xi32>, %arg1: tensor<*xf32>) -> tensor { + %0 = "std.constant" () {value = dense<0> : tensor} : () -> tensor loc("Const") + %1 = "tfl.greater"(%arg0, %0) : (tensor<*xi32>, tensor) -> tensor + return %1 : tensor +} + +func @body(%arg0: tensor<*xi32>, %arg1: tensor<*xf32>) -> (tensor<*xi32>, tensor<*xf32>) { + %0 = "std.constant" () {value = dense<1> : tensor} : () -> tensor loc("Const") + %1 = "tfl.sub"(%arg0, %0) {fused_activation_function = "NONE"} : (tensor<*xi32>, tensor) -> tensor<*xi32> + %2 = tfl.add %arg1, %arg1 {fused_activation_function = "NONE"} : tensor<*xf32> + return %1, %2 : tensor<*xi32>, tensor<*xf32> +} diff --git a/tensorflow/compiler/mlir/lite/tests/legalize-ophint-func-op.mlir b/tensorflow/compiler/mlir/lite/tests/legalize-ophint-func-op.mlir new file mode 100644 index 00000000000..06f304c55ba --- /dev/null +++ b/tensorflow/compiler/mlir/lite/tests/legalize-ophint-func-op.mlir @@ -0,0 +1,26 @@ +// RUN: tf-opt -tfl-legalize-ophint-func-op %s | FileCheck %s + +module { + // CHECK-LABEL: func @testConvertUnidirectionalSequenceRNN + // CHECK-SAME: (%[[ARG_0:[a-z0-9]*]]: tensor<1x3xf32>, %[[ARG_1:[a-z0-9]*]]: tensor<1x3xf32>) + func @testConvertUnidirectionalSequenceRNN(%arg0: tensor<1x3xf32>, %arg1: tensor<1x3xf32>) -> tensor<1x4xf32> { + // CHECK: %[[CST:.*]] = constant dense<0.000000e+00> : tensor<1x4xf32> + // CHECK: %[[CST_0:.*]] = constant dense<0.000000e+00> : tensor<4xf32> + // CHECK: %[[CST_1:.*]] = constant dense<0.000000e+00> : tensor<4x3xf32> + // CHECK: %[[CST_2:.*]] = constant dense<0.000000e+00> : tensor<4x4xf32> + // CHECK: %[[PACKED_INPUT:[a-z0-9]*]] = "tfl.pack"(%[[ARG_0]], %[[ARG_1]]) {axis = 0 : i32, values_count = 2 : i32} : (tensor<1x3xf32>, tensor<1x3xf32>) -> tensor<2x1x3xf32> + // CHECK: %[[FUSED_OUTPUT:[a-z0-9]*]] = "tfl.unidirectional_sequence_rnn"(%[[PACKED_INPUT]], %[[CST_1]], %[[CST_2]], %[[CST_0]], %[[CST]]) {fused_activation_function = "TANH", time_major = true} : (tensor<2x1x3xf32>, tensor<4x3xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<1x4xf32>) -> tensor<2x1x4xf32> + // CHECK: %[[UNPACK:[0-9]*]]:2 = "tfl.unpack"(%[[FUSED_OUTPUT]]) {axis = 0 : i32, num = 2 : i32} : (tensor<2x1x4xf32>) -> (tensor<1x4xf32>, tensor<1x4xf32>) + + %cst = constant dense<0.000000e+00> : tensor<1x4xf32> + %cst0 = constant dense<0.000000e+00> : tensor<4xf32> + %cst1 = constant dense<0.000000e+00> : tensor<4x3xf32> + %cst2 = constant dense<0.000000e+00> : tensor<4x4xf32> + %2 = "tfl.pack"(%arg0, %arg1) {axis = 0 : i32, values_count = 2 : i32} : (tensor<1x3xf32>, tensor<1x3xf32>) -> tensor<2x1x3xf32> + %3 = call @a9211722c23011e9875cdc4a3e957995(%2, %cst1, %cst2, %cst0, %cst) : (tensor<2x1x3xf32>, tensor<4x3xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<1x4xf32>) -> tensor<2x1x4xf32> + %4:2 = "tfl.unpack"(%3) {axis = 0 : i32, num = 2 : i32} : (tensor<2x1x4xf32>) -> (tensor<1x4xf32>, tensor<1x4xf32>) + return %4#0 : tensor<1x4xf32> + } + func @a9211722c23011e9875cdc4a3e957995(tensor<2x1x3xf32>, tensor<4x3xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<1x4xf32>) -> tensor<2x1x4xf32> + attributes {_tflite_function_name = "UnidirectionalSequenceRnn"} +} diff --git a/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir b/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir index 616922ba8d3..9c029bfc1d1 100644 --- a/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir +++ 
b/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir @@ -142,7 +142,7 @@ func @const() -> tensor<2xi32> { return %0: tensor<2xi32> // CHECK-LABEL: @const -// CHECK: %0 = "tfl.pseudo_const"() {value = opaque<"tf", "0x746674656E736F722464747970653A2044545F494E5433320A74656E736F725F7368617065207B0A202064696D207B0A2020202073697A653A20320A20207D0A7D0A74656E736F725F636F6E74656E743A20225C3230305C3030305C3030305C3030305C3230305C3030305C3030305C303030220A"> : tensor<2xi32>} : () -> tensor<2xi32> +// CHECK: "tfl.pseudo_const"() {value = opaque<"tf", "0x746674656E736F722464747970653A2044545F494E5433320A74656E736F725F7368617065207B0A202064696D207B0A2020202073697A653A20320A20207D0A7D0A74656E736F725F636F6E74656E743A20225C3230305C3030305C3030305C3030305C3230305C3030305C3030305C303030220A"> : tensor<2xi32>} : () -> tensor<2xi32> } func @placeholder(%arg0: tensor) -> tensor { @@ -213,6 +213,20 @@ func @sigmoid(%arg0: tensor) -> tensor { // CHECK: %0 = "tfl.logistic"(%arg0) : (tensor) -> tensor } +func @sqrt(%arg0: tensor<8x16xf32>) -> tensor<8x16xf32> { + %0 = "tf.Sqrt"(%arg0) : (tensor<8x16xf32>) -> tensor<8x16xf32> + return %0 : tensor<8x16xf32> +// CHECK-LABEL: sqrt +// CHECK: %0 = "tfl.sqrt"(%arg0) : (tensor<8x16xf32>) -> tensor<8x16xf32> +} + +func @square(%arg0: tensor<8x16xf32>) -> tensor<8x16xf32> { + %0 = "tf.Square"(%arg0) : (tensor<8x16xf32>) -> tensor<8x16xf32> + return %0 : tensor<8x16xf32> +// CHECK-LABEL: square +// CHECK: %0 = "tfl.square"(%arg0) : (tensor<8x16xf32>) -> tensor<8x16xf32> +} + func @log_softmax(%arg0: tensor<8x16xf32>) -> tensor<8x16xf32> { %0 = "tf.LogSoftmax"(%arg0) : (tensor<8x16xf32>) -> tensor<8x16xf32> return %0 : tensor<8x16xf32> @@ -289,6 +303,14 @@ func @abs(%arg0: tensor<8x16xf32>) -> tensor<8x16xf32> { // CHECK: %0 = "tfl.abs"(%arg0) : (tensor<8x16xf32>) -> tensor<8x16xf32> } +func @any(%arg0: tensor<2x2xi1>, %arg1: tensor) -> tensor { + %0 = "tf.Any"(%arg0, %arg1) {keep_dims = false} : (tensor<2x2xi1>, tensor) -> tensor + return %0 : tensor + +// CHECK-LABEL:any +// CHECK: %0 = "tfl.reduce_any"(%arg0, %arg1) {keep_dims = false} : (tensor<2x2xi1>, tensor) -> tensor +} + func @ceil(%arg0: tensor<8x16xf32>) -> tensor<8x16xf32> { %0 = "tf.Ceil"(%arg0) : (tensor<8x16xf32>) -> tensor<8x16xf32> return %0 : tensor<8x16xf32> @@ -442,12 +464,12 @@ func @less_equal(%arg0: tensor<8x16xf32>, %arg1: tensor<8x16xf32>) -> tensor<8x1 // CHECK: return %0 : tensor<8x16xi1> } -func @rank(%arg0: tensor<11x16xf32>) -> tensor<1xi32> { - %0 = "tf.Rank"(%arg0) : (tensor<11x16xf32>) -> tensor<1xi32> +func @rank(%arg0: tensor<*xf32>) -> tensor<1xi32> { + %0 = "tf.Rank"(%arg0) : (tensor<*xf32>) -> tensor<1xi32> return %0 : tensor<1xi32> // CHECK-LABEL:rank -// CHECK: %0 = "tfl.rank"(%arg0) : (tensor<11x16xf32>) -> tensor<1xi32> +// CHECK: %0 = "tfl.rank"(%arg0) : (tensor<*xf32>) -> tensor<1xi32> } func @floor(%arg0: tensor<8x16xf32>) -> tensor<8x16xf32> { @@ -487,6 +509,15 @@ func @select(%arg0: tensor<8xi1>, %arg1: tensor<8xf32>, %arg2: tensor<8xf32>) -> // CHECK: return %0 : tensor<8xf32> } +func @select_v2(%arg0: tensor<8xi1>, %arg1: tensor<8xf32>, %arg2: tensor<8xf32>) -> tensor<8xf32> { + %0 = "tf.SelectV2"(%arg0, %arg1, %arg2) : (tensor<8xi1>, tensor<8xf32>, tensor<8xf32>) -> tensor<8xf32> + return %0: tensor<8xf32> + +// CHECK-LABEL: select_v2 +// CHECK: %0 = "tfl.select"(%arg0, %arg1, %arg2) +// CHECK: return %0 : tensor<8xf32> +} + func @sin(%arg0: tensor) -> tensor { %0 = "tf.Sin"(%arg0) : (tensor) -> tensor return %0 : tensor @@ -629,6 +660,17 @@ func @pad(tensor<2x1x3xf32>, 
tensor<3x2xi32>) -> tensor { // CHECK: return %0 : tensor } +func @tile(tensor<2x3xf32>, tensor<2xi32>) -> tensor<2x6xf32> { +^bb0(%arg0: tensor<2x3xf32>, %arg1: tensor<2xi32>): + %cst = constant dense<[1, 2]> : tensor<2xi32> + %0 = "tf.Tile"(%arg0, %cst) : (tensor<2x3xf32>, tensor<2xi32>) -> tensor<2x6xf32> + return %0 : tensor<2x6xf32> + + // CHECK-LABEL: tile + // CHECK: %0 = "tfl.tile"(%arg0, %cst) : (tensor<2x3xf32>, tensor<2xi32>) -> tensor<2x6xf32> + // CHECK: return %0 : tensor<2x6xf32> +} + func @padv2(tensor<2x1x3xf32>, tensor<3x2xi32>) -> tensor { ^bb0(%arg0: tensor<2x1x3xf32>, %arg1: tensor<3x2xi32>): %cst = constant dense<2.0> : tensor @@ -782,12 +824,12 @@ func @space_to_batch_nd(%arg0: tensor<1x4x4x3xf32>, %arg1: tensor<2xi32>, %arg2: // CHECK: %0 = "tfl.space_to_batch_nd"(%arg0, %arg1, %arg2) : (tensor<1x4x4x3xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor } -func @split(%arg0: tensor<1xi32>, %arg1: tensor<1x4x3x3xf32>) -> tensor<1x4x3xf32> { - %0:3 = "tf.Split"(%arg0, %arg1) {num_split = 3 : i64} : (tensor<1xi32>, tensor<1x4x3x3xf32>) -> (tensor<1x4x3xf32>, tensor<1x4x3xf32>, tensor<1x4x3xf32>) +func @split(%arg0: tensor, %arg1: tensor<1x4x3x3xf32>) -> tensor<1x4x3xf32> { + %0:3 = "tf.Split"(%arg0, %arg1) {num_split = 3 : i64} : (tensor, tensor<1x4x3x3xf32>) -> (tensor<1x4x3xf32>, tensor<1x4x3xf32>, tensor<1x4x3xf32>) return %0#0 : tensor<1x4x3xf32> // CHECK-LABEL: split - // CHECK: %0:3 = "tfl.split"(%arg0, %arg1) {num_splits = 3 : i32} : (tensor<1xi32>, tensor<1x4x3x3xf32>) -> (tensor<1x4x3xf32>, tensor<1x4x3xf32>, tensor<1x4x3xf32>) + // CHECK: %0:3 = "tfl.split"(%arg0, %arg1) {num_splits = 3 : i32} : (tensor, tensor<1x4x3x3xf32>) -> (tensor<1x4x3xf32>, tensor<1x4x3xf32>, tensor<1x4x3xf32>) } func @splitv(%arg0: tensor<1x4x3x3xf32>, %arg1: tensor<2xi32>, %arg2: tensor<1xi32>) -> tensor<1x4x2x3xf32> { @@ -941,3 +983,104 @@ func @OneHot(%arg0: tensor<3xi32>, %arg1: tensor, %arg2: tensor, %arg3 // CHECK-LABEL: OneHot // CHECK: "tfl.one_hot"(%arg0, %arg1, %arg2, %arg3) {axis = -1 : i32} : (tensor<3xi32>, tensor, tensor, tensor) -> tensor<*xf32> } + +func @argmax(%arg0: tensor<3xi32>, %arg1: tensor) -> tensor { + %0 = "tf.ArgMax"(%arg0, %arg1) : (tensor<3xi32>, tensor) -> tensor + return %0 : tensor + +// CHECK-LABEL: argmax +// CHECK: %0 = "tfl.arg_max"(%arg0, %arg1) : (tensor<3xi32>, tensor) -> tensor +} + +func @argmax64(%arg0: tensor<3xi32>, %arg1: tensor) -> tensor { + %0 = "tf.ArgMax"(%arg0, %arg1) : (tensor<3xi32>, tensor) -> tensor + return %0 : tensor + +// CHECK-LABEL: argmax64 +// CHECK: %0 = "tfl.arg_max"(%arg0, %arg1) : (tensor<3xi32>, tensor) -> tensor +} + +func @space_to_depth(%arg0: tensor<1x2x2x1xf32>) -> tensor { + %0 = "tf.SpaceToDepth"(%arg0) {block_size = 2: i64, data_format = "NHWC"}: (tensor<1x2x2x1xf32>) -> tensor + return %0 : tensor + + // CHECK-LABEL: space_to_depth + // CHECK: %[[ARG:.*]]: tensor<1x2x2x1xf32> + // CHECK: "tfl.space_to_depth"(%[[ARG]]) {block_size = 2 : i32} : (tensor<1x2x2x1xf32>) -> tensor +} + +func @round(%arg0: tensor<8x16xf32>) -> tensor<8x16xf32> { + %0 = "tf.Round"(%arg0) : (tensor<8x16xf32>) -> tensor<8x16xf32> + return %0 : tensor<8x16xf32> + + // CHECK-LABEL: round + // CHECK: %[[ARG:.*]]: tensor<8x16xf32> + // CHECK: %[[RESULT:.*]] = "tfl.round"(%[[ARG]]) : (tensor<8x16xf32>) -> tensor<8x16xf32> + // CHECK: return %[[RESULT]] : tensor<8x16xf32> +} + +func @resize_nearest_neighbor(%arg0: tensor<1x100x100x3xf32>, %arg1: tensor<4xi32>) -> tensor { + %0 = "tf.ResizeNearestNeighbor"(%arg0, %arg1) {align_corners = true} : 
(tensor<1x100x100x3xf32>, tensor<4xi32>) -> tensor + return %0 : tensor + // CHECK-LABEL: resize_nearest_neighbor + // CHECK: "tfl.resize_nearest_neighbor"(%arg0, %arg1) {align_corners = true} : (tensor<1x100x100x3xf32>, tensor<4xi32>) -> tensor +} + +// Note: half_pixel_centers isn't supported by TFLite, so it's not legalized. +func @resize_nearest_neighbor_with_half_pixel_centers(%arg0: tensor<1x100x100x3xf32>, %arg1: tensor<4xi32>) -> tensor { + %0 = "tf.ResizeNearestNeighbor"(%arg0, %arg1) {align_corners = true, half_pixel_centers = true} : (tensor<1x100x100x3xf32>, tensor<4xi32>) -> tensor + return %0 : tensor + // CHECK-LABEL: resize_nearest_neighbor_with_half_pixel_centers + // CHECK: "tf.ResizeNearestNeighbor"(%arg0, %arg1) {align_corners = true, half_pixel_centers = true} +} + +func @sparse_to_dense_with_scalar_sparse_indices(%arg0: tensor, %arg1: tensor<3xi32>, %arg2: tensor, %arg3: tensor) -> tensor { + %0 = "tf.SparseToDense"(%arg0, %arg1, %arg2, %arg3) {validate_indices = true}: (tensor, tensor<3xi32>, tensor, tensor) -> tensor + return %0 : tensor + // CHECK-LABEL: sparse_to_dense_with_scalar_sparse_indices + // CHECK: "tfl.sparse_to_dense"(%arg0, %arg1, %arg2, %arg3) : (tensor, tensor<3xi32>, tensor, tensor) -> tensor +} + +func @sparse_to_dense_with_vector_sparse_indices(%arg0: tensor<3xi32>, %arg1: tensor<3xi32>, %arg2: tensor<3xf32>, %arg3: tensor) -> tensor { + %0 = "tf.SparseToDense"(%arg0, %arg1, %arg2, %arg3) {validate_indices = true}: (tensor<3xi32>, tensor<3xi32>, tensor<3xf32>, tensor) -> tensor + return %0 : tensor + // CHECK-LABEL: sparse_to_dense_with_vector_sparse_indices + // CHECK: "tfl.sparse_to_dense"(%arg0, %arg1, %arg2, %arg3) : (tensor<3xi32>, tensor<3xi32>, tensor<3xf32>, tensor) -> tensor +} + +func @sparse_to_dense_with_2d_sparse_indices(%arg0: tensor<3x2xi32>, %arg1: tensor<3xi32>, %arg2: tensor<2xf32>, %arg3: tensor) -> tensor { + %0 = "tf.SparseToDense"(%arg0, %arg1, %arg2, %arg3) {validate_indices = true}: (tensor<3x2xi32>, tensor<3xi32>, tensor<2xf32>, tensor) -> tensor + return %0 : tensor + // CHECK-LABEL: sparse_to_dense_with_2d_sparse_indices + // CHECK: "tfl.sparse_to_dense"(%arg0, %arg1, %arg2, %arg3) : (tensor<3x2xi32>, tensor<3xi32>, tensor<2xf32>, tensor) -> tensor +} + +func @where(%arg0: tensor<3x5xi1>) -> tensor { + %0 = "tf.Where"(%arg0) : (tensor<3x5xi1>) -> tensor + return %0 : tensor + // CHECK-LABEL: where + // CHECK: "tfl.where"(%arg0) : (tensor<3x5xi1>) -> tensor +} + +func @floor_mod(%arg0: tensor<5xf32>, %arg1: tensor<5xf32>) -> tensor<5xf32> { + %0 = "tf.FloorMod"(%arg0, %arg1) : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xf32> + return %0 : tensor<5xf32> + // CHECK-LABEL: floor_mod + // CHECK: "tfl.floor_mod"(%arg0, %arg1) : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xf32> +} + +func @exp(%arg0: tensor<5xf32>) -> tensor<5xf32> { + %0 = "tf.Exp"(%arg0) : (tensor<5xf32>) -> tensor<5xf32> + return %0 : tensor<5xf32> + // CHECK-LABEL: exp + // CHECK: "tfl.exp"(%arg0) : (tensor<5xf32>) -> tensor<5xf32> +} + +func @depth_to_space(%arg0: tensor<1x1x1x4xf32>) -> tensor<1x2x2x1xf32> { + %0 = "tf.DepthToSpace"(%arg0) {block_size = 2: i64, data_format = "NHWC"}: (tensor<1x1x1x4xf32>) -> tensor<1x2x2x1xf32> + return %0 : tensor<1x2x2x1xf32> + + // CHECK-LABEL: depth_to_space + // CHECK: %[[ARG:.*]]: tensor<1x1x1x4xf32> + // CHECK: "tfl.depth_to_space"(%[[ARG]]) {block_size = 2 : i32} : (tensor<1x1x1x4xf32>) -> tensor<1x2x2x1xf32> +} diff --git a/tensorflow/compiler/mlir/lite/tests/lower-static-tensor-list.mlir 
b/tensorflow/compiler/mlir/lite/tests/lower-static-tensor-list.mlir index 1fe6757c0c7..817ced79ced 100644 --- a/tensorflow/compiler/mlir/lite/tests/lower-static-tensor-list.mlir +++ b/tensorflow/compiler/mlir/lite/tests/lower-static-tensor-list.mlir @@ -1,6 +1,6 @@ // RUN: tf-opt -tfl-lower-static-tensor-list %s | FileCheck %s --dump-input-on-failure -func @tensorlistGetItem(tensor<3x10xf32>, tensor<1xi32>, tensor) -> (tensor<10xf32>, tensor<3x10xf32>) { -^bb0(%arg0: tensor<3x10xf32>, %arg1: tensor<1xi32>, %arg2: tensor): + +func @tensorlistGetItem(%arg0: tensor<3x10xf32>, %arg1: tensor<1xi32>, %arg2: tensor) -> (tensor<10xf32>, tensor<3x10xf32>) { %0 = "tf.TensorListFromTensor"(%arg0, %arg1) : (tensor<3x10xf32>, tensor<1xi32>) -> tensor>> %1 = "tf.TensorListGetItem"(%0, %arg2, %arg1) : (tensor>>, tensor, tensor<1xi32>) -> tensor<10xf32> %2 = "tf.TensorListStack"(%0, %arg1) : (tensor>>, tensor<1xi32>) -> tensor<3x10xf32> @@ -11,8 +11,7 @@ func @tensorlistGetItem(tensor<3x10xf32>, tensor<1xi32>, tensor) -> (tensor // CHECK: return %0, %arg0 : tensor<10xf32>, tensor<3x10xf32> } -func @tensorlistGetItemWithUnknownRank(tensor<*xf32>, tensor<1xi32>, tensor) -> (tensor<*xf32>, tensor<*xf32>) { -^bb0(%arg0: tensor<*xf32>, %arg1: tensor<1xi32>, %arg2: tensor): +func @tensorlistGetItemWithUnknownRank(%arg0: tensor<*xf32>, %arg1: tensor<1xi32>, %arg2: tensor) -> (tensor<*xf32>, tensor<*xf32>) { %0 = "tf.TensorListFromTensor"(%arg0, %arg1) : (tensor<*xf32>, tensor<1xi32>) -> tensor>> %1 = "tf.TensorListGetItem"(%0, %arg2, %arg1) : (tensor>>, tensor, tensor<1xi32>) -> tensor<*xf32> %2 = "tf.TensorListStack"(%0, %arg1) : (tensor>>, tensor<1xi32>) -> tensor<*xf32> @@ -23,8 +22,7 @@ func @tensorlistGetItemWithUnknownRank(tensor<*xf32>, tensor<1xi32>, tensor // CHECK: return %0, %arg0 : tensor<*xf32>, tensor<*xf32> } -func @tensorlistSetItem(tensor<3x10xf32>, tensor<1xi32>, tensor, tensor<10xf32>) -> tensor<3x10xf32> { -^bb0(%arg0: tensor<3x10xf32>, %arg1: tensor<1xi32>, %arg2: tensor, %arg3: tensor<10xf32>): +func @tensorlistSetItem(%arg0: tensor<3x10xf32>, %arg1: tensor<1xi32>, %arg2: tensor, %arg3: tensor<10xf32>) -> tensor<3x10xf32> { %0 = "tf.TensorListFromTensor"(%arg0, %arg1) : (tensor<3x10xf32>, tensor<1xi32>) -> tensor>> %1 = "tf.TensorListSetItem"(%0, %arg2, %arg3) : (tensor>>, tensor, tensor<10xf32>) -> tensor>> %2 = "tf.TensorListStack"(%1, %arg1) : (tensor>>, tensor<1xi32>) -> tensor<3x10xf32> @@ -56,8 +54,7 @@ func @tensorlistSetItem(tensor<3x10xf32>, tensor<1xi32>, tensor, tensor<10x // CHECK: return %15 : tensor<3x10xf32> } -func @tensorlistSetItemWithScalarElements(tensor<5xf32>, tensor<0xi32>, tensor, tensor) -> tensor<5xf32> { -^bb0(%arg0: tensor<5xf32>, %arg1: tensor<0xi32>, %arg2: tensor, %arg3: tensor): +func @tensorlistSetItemWithScalarElements(%arg0: tensor<5xf32>, %arg1: tensor<0xi32>, %arg2: tensor, %arg3: tensor) -> tensor<5xf32> { %0 = "tf.TensorListFromTensor"(%arg0, %arg1) : (tensor<5xf32>, tensor<0xi32>) -> tensor>> %1 = "tf.TensorListSetItem"(%0, %arg2, %arg3) : (tensor>>, tensor, tensor) -> tensor>> %2 = "tf.TensorListStack"(%1, %arg1) : (tensor>>, tensor<0xi32>) -> tensor<5xf32> @@ -89,24 +86,23 @@ func @tensorlistSetItemWithScalarElements(tensor<5xf32>, tensor<0xi32>, tensor } -func @tensorlistReserve(tensor<3xi32>, tensor, tensor) -> tensor { -^bb0(%arg0: tensor<3xi32>, %arg1: tensor, %arg2: tensor): +func @tensorlistReserve(%arg0: tensor<3xi32>, %arg1: tensor, %arg2: tensor) -> tensor { %0 = "tf.TensorListReserve"(%arg0, %arg1) : (tensor<3xi32>, tensor) -> tensor>> %1 
= "tf.TensorListGetItem"(%0, %arg2, %arg0) : (tensor>>, tensor, tensor<3xi32>) -> tensor return %1 : tensor // CHECK-LABEL: tensorlistReserve -// CHECK: %cst = constant dense<0> : tensor -// CHECK: %0 = "tf.ExpandDims"(%arg1, %cst) : (tensor, tensor) -> tensor<1xi32> -// CHECK: %1 = "tf.Concat"(%cst, %0, %arg0) {N = 2 : i64} : (tensor, tensor<1xi32>, tensor<3xi32>) -> tensor<4xi32> -// CHECK: %cst_0 = constant dense<0.000000e+00> : tensor -// CHECK: %2 = "tf.Fill"(%1, %cst_0) : (tensor<4xi32>, tensor) -> tensor -// CHECK: %3 = "tf.Gather"(%2, %arg2) {validate_indices = true} : (tensor, tensor) -> tensor -// CHECK: return %3 : tensor +// CHECK-DAG: [[ZERO1:%cst.*]] = constant dense<0> : tensor +// CHECK-DAG: [[ZERO2:%cst.*]] = constant dense<0> : tensor +// CHECK-DAG: [[DIM0:%.*]] = "tf.ExpandDims"(%arg1, [[ZERO1]]) : (tensor, tensor) -> tensor<1xi32> +// CHECK-DAG: [[SHAPE:%.*]] = "tf.Concat"([[ZERO2]], [[DIM0]], %arg0) {N = 2 : i64} : (tensor, tensor<1xi32>, tensor<3xi32>) -> tensor<4xi32> +// CHECK-DAG: [[VALUES:%.*]] = constant dense<0.000000e+00> : tensor +// CHECK: [[LIST:%.*]] = "tf.Fill"([[SHAPE]], [[VALUES]]) : (tensor<4xi32>, tensor) -> tensor +// CHECK: [[RESULT:%.*]] = "tf.Gather"([[LIST]], %arg2) {validate_indices = true} : (tensor, tensor) -> tensor +// CHECK: return [[RESULT]] : tensor } -func @tensorlistReserveUnrankedElements(tensor, tensor, tensor) -> tensor<*xf32> { -^bb0(%arg0: tensor, %arg1: tensor, %arg2: tensor): +func @tensorlistReserveUnrankedElements(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor<*xf32> { %0 = "tf.TensorListReserve"(%arg0, %arg1) : (tensor, tensor) -> tensor>> %1 = "tf.TensorListGetItem"(%0, %arg2, %arg0) : (tensor>>, tensor, tensor) -> tensor<*xf32> return %1 : tensor<*xf32> @@ -117,13 +113,42 @@ func @tensorlistReserveUnrankedElements(tensor, tensor, tensor) // CHECK: return [[RESULT2]] : tensor<*xf32> } -func @tensorlistWhileLoop(tensor<2x3xf32>) -> tensor<*xf32> { -^bb0(%arg0: tensor<2x3xf32>): +func @EmptyTensorList(%arg0: tensor<3xi32>, %arg1: tensor, %arg2: tensor) -> tensor { + %0 = "tf.EmptyTensorList"(%arg0, %arg1) : (tensor<3xi32>, tensor) -> tensor>> + %1 = "tf.TensorListGetItem"(%0, %arg2, %arg0) : (tensor>>, tensor, tensor<3xi32>) -> tensor + return %1 : tensor + +// CHECK-LABEL: EmptyTensorList +// CHECK-SAME: ([[ELEM_SHAPE:%.*]]: tensor<3xi32>, [[MAX_ELEMS:%.*]]: tensor, [[IDX:%.*]]: tensor) +// CHECK-DAG: [[DIM0:%cst.*]] = constant dense<0> : tensor<1xi32> +// CHECK-DAG: [[ZERO:%cst.*]] = constant dense<0> : tensor +// CHECK-DAG: [[SHAPE:%.*]] = "tf.Concat"([[ZERO]], [[DIM0]], [[ELEM_SHAPE]]) {N = 2 : i64} : (tensor, tensor<1xi32>, tensor<3xi32>) -> tensor<4xi32> +// CHECK-DAG: [[VALUES:%.*]] = constant dense<0.000000e+00> : tensor +// CHECK: [[LIST:%.*]] = "tf.Fill"([[SHAPE]], [[VALUES]]) : (tensor<4xi32>, tensor) -> tensor +// CHECK: [[RESULT:%.*]] = "tf.Gather"([[LIST]], [[IDX]]) {validate_indices = true} : (tensor, tensor) -> tensor +// CHECK: return [[RESULT]] : tensor +} + +func @tensorlistPushBack(%arg0: tensor<3x10xf32>, %arg1: tensor<1xi32>, %arg2: tensor<10xf32>) -> tensor { + %0 = "tf.TensorListFromTensor"(%arg0, %arg1) : (tensor<3x10xf32>, tensor<1xi32>) -> tensor>> + %1 = "tf.TensorListPushBack"(%0, %arg2) : (tensor>>, tensor<10xf32>) -> tensor>> + %2 = "tf.TensorListStack"(%1, %arg1) : (tensor>>, tensor<1xi32>) -> tensor + return %2 : tensor + +// CHECK-LABEL: tensorlistPushBack +// CHECK-SAME: ([[INPUT:%.*]]: tensor<3x10xf32>, [[ELEM_SHAPE:%.*]]: tensor<1xi32>, [[ITEM:%.*]]: tensor<10xf32>) +// CHECK: 
[[ZERO:%.*]] = constant dense<0> : tensor +// CHECK: [[EXP_ITEM:%.*]] = "tf.ExpandDims"([[ITEM]], [[ZERO]]) {{.*}} -> tensor<1x10xf32> +// CHECK: [[RESULT:%.*]] = "tf.Concat"(%cst, [[INPUT]], [[EXP_ITEM]]) {N = 2 : i64} : {{.*}} -> tensor +// CHECK: return [[RESULT]] : tensor +} + +func @tensorlistWhileLoop(%arg0: tensor<2x3xf32>) -> tensor<*xf32> { %cst = constant dense<3> : tensor<1xi32> %cst_0 = constant dense<0> : tensor %cst_1 = constant dense<-1> : tensor %0 = "tf.TensorListFromTensor"(%arg0, %cst) : (tensor<2x3xf32>, tensor<1xi32>) -> tensor>> - %1:2 = "tf.While"(%cst_0, %0) {T = ["tfdtype$DT_INT32", "tfdtype$DT_VARIANT"], body = @tensorlistWhileBody, cond = @tensorlistWhileCond} : (tensor, tensor>>) -> (tensor, tensor>>) + %1:2 = "tf.While"(%cst_0, %0) {T = ["tfdtype$DT_INT32", "tfdtype$DT_VARIANT"], body = @tensorlistWhileBody, cond = @tensorlistWhileCond, is_stateless = false} : (tensor, tensor>>) -> (tensor, tensor>>) %2 = "tf.TensorListStack"(%1#1, %cst_1) : (tensor>>, tensor) -> tensor<*xf32> return %2 : tensor<*xf32> @@ -136,8 +161,7 @@ func @tensorlistWhileLoop(tensor<2x3xf32>) -> tensor<*xf32> { // CHECK: return %0#1 : tensor<*xf32> } -func @tensorlistWhileBody(tensor<*xi32>, tensor) -> (tensor<*xi32>, tensor) { -^bb0(%arg0: tensor<*xi32>, %arg1: tensor): +func @tensorlistWhileBody(%arg0: tensor<*xi32>, %arg1: tensor) -> (tensor<*xi32>, tensor) { %cst = constant dense<1> : tensor %0 = "tf.Add"(%arg0, %cst) : (tensor<*xi32>, tensor) -> tensor<*xi32> %1 = "tf.Identity"(%arg1) : (tensor) -> tensor @@ -151,8 +175,7 @@ func @tensorlistWhileBody(tensor<*xi32>, tensor) -> (tensor<*xi32>, // CHECK: return %0, %1 : tensor<*xi32>, tensor<*xf32> } -func @tensorlistWhileCond(tensor<*xi32>, tensor) -> tensor<*xi1> { -^bb0(%arg0: tensor<*xi32>, %arg1: tensor): +func @tensorlistWhileCond(%arg0: tensor<*xi32>, %arg1: tensor) -> tensor<*xi1> { %cst = constant dense<2> : tensor %0 = "tf.Less"(%arg0, %cst) : (tensor<*xi32>, tensor) -> tensor<*xi1> return %0 : tensor<*xi1> diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/disable_builtin.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/disable_builtin.mlir index 6f0882f7260..408fb516dac 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/disable_builtin.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/disable_builtin.mlir @@ -1,4 +1,4 @@ -// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -emit-builtin-tflite-ops=false -o - | flatbuffer_to_string - | FileCheck %s; test ${PIPESTATUS[1]} -eq 1 +// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -emit-builtin-tflite-ops=false -o - | flatbuffer_to_string - | FileCheck %s; test ${PIPESTATUS[0]} -ne 0 # CHECK: loc("disable_builtin.mlir":2:1): is a TFLite builtin op but builtin emission is not enabled # CHECK-NEXT: Verification failed. 
diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/disable_flex.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/disable_flex.mlir index be62118804a..c4dd8b5bacf 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/disable_flex.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/disable_flex.mlir @@ -1,4 +1,4 @@ -// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_to_string - | FileCheck %s; test ${PIPESTATUS[1]} -eq 1 +// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_to_string - | FileCheck %s; test ${PIPESTATUS[0]} -ne 0 # CHECK: loc("disable_flex.mlir":96:8): error: 'tf.div' op is a Flex op but Flex ops are not enabled for emission # CHECK-NEXT: Verification failed. diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/dynamic_shape_constant.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/dynamic_shape_constant.mlir new file mode 100644 index 00000000000..1eae96217a5 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/dynamic_shape_constant.mlir @@ -0,0 +1,25 @@ +// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_to_string - + +func @main(%arg0: tensor<2xi32>) -> tensor<2xi32> { + %cst = "tfl.pseudo_const"() {value = dense<[1, 2]> : tensor<2xi32>} : () -> tensor + %0 = "tfl.pseudo_input" (%arg0) : (tensor<2xi32>) -> tensor<2xi32> + %1 = "tfl.add"(%0, %cst) {fused_activation_function = "NONE"} : (tensor<2xi32>, tensor) -> tensor<2xi32> + return %1 : tensor<2xi32> +} + + +// CHECK: tensors: [ { +// CHECK-NEXT: shape: [ 2 ], +// CHECK-NEXT: type: INT32, +// CHECK-NEXT: buffer: 1, +// CHECK-NEXT: name: "tfl.pseudo_const", +// CHECK-NEXT: quantization: { +// CHECK-NEXT: +// CHECK-NEXT: } + +// CHECK: buffers: [ { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-NEXT: data: [ 1, 0, 0, 0, 2, 0, 0, 0 ] +// CHECK-NEXT: }, { + diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/if_op.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/if_op.mlir index 7702045547e..726441876cd 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/if_op.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/if_op.mlir @@ -1,12 +1,12 @@ -// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_to_string - | FileCheck %s +// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_to_string - | FileCheck %s --dump-input-on-failure + // CHECK: { // CHECK-NEXT: version: 3, // CHECK-NEXT: operator_codes: [ { // CHECK-NEXT: builtin_code: LESS // CHECK-NEXT: }, { -// CHECK-NEXT: builtin_code: CUSTOM, -// CHECK-NEXT: custom_code: "Experimental_If" +// CHECK-NEXT: builtin_code: IF // CHECK-NEXT: }, { // CHECK-EMPTY: // CHECK-NEXT: }, { @@ -52,8 +52,12 @@ // CHECK-NEXT: opcode_index: 1, // CHECK-NEXT: inputs: [ 2, 0, 1 ], // CHECK-NEXT: outputs: [ 3 ], -// CHECK-NEXT: custom_options: [ 116, 104, 101, 110, 95, 115, 117, 98, 103, 114, 97, 112, 104, 95, 105, 110, 100, 101, 120, 0, 101, 108, 115, 101, 95, 115, 117, 98, 103, 114, 97, 112, 104, 95, 105, 110, 100, 101, 120, 0, 2, 21, 42, 2, 1, 2, 2, 1, 4, 4, 4, 36, 1 ] -// CHECK-NEXT: } ] +// CHECK-NEXT: builtin_options_type: IfOptions, +// CHECK-NEXT: builtin_options: { +// CHECK-NEXT: then_subgraph_index: 1, +// CHECK-NEXT: else_subgraph_index: 2 +// CHECK-NEXT: } +// CHECK-NEXT: } ], // CHECK-NEXT: name: "main" // CHECK-NEXT: }, { // CHECK-NEXT: tensors: [ { @@ -88,7 +92,7 @@ // CHECK-NEXT: builtin_options: { // CHECK-EMPTY: // CHECK-NEXT: 
} -// CHECK-NEXT: } ] +// CHECK-NEXT: } ], // CHECK-NEXT: name: "cond_true" // CHECK-NEXT: }, { // CHECK-NEXT: tensors: [ { @@ -123,7 +127,7 @@ // CHECK-NEXT: builtin_options: { // CHECK-EMPTY: // CHECK-NEXT: } -// CHECK-NEXT: } ] +// CHECK-NEXT: } ], // CHECK-NEXT: name: "cond_false" // CHECK-NEXT: } ], // CHECK-NEXT: description: "MLIR Converted.", @@ -156,7 +160,7 @@ func @main(%arg0: tensor<1xf32>, %arg1: tensor<1xf32>) -> tensor<1xf32> { %0 = "tfl.pseudo_input"(%arg0) : (tensor<1xf32>) -> tensor<1xf32> %1 = "tfl.pseudo_input"(%arg1) : (tensor<1xf32>) -> tensor<1xf32> %2 = "tfl.less"(%0, %1) : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xi1> - %3 = "tf.If"(%2, %0, %1) {else_branch = @cond_false, then_branch = @cond_true} : (tensor<1xi1>, tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> + %3 = "tf.If"(%2, %0, %1) {else_branch = @cond_false, then_branch = @cond_true, is_stateless = false} : (tensor<1xi1>, tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> return %3 : tensor<1xf32> } diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/lstm.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/lstm.mlir new file mode 100644 index 00000000000..ddb122f6e37 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/lstm.mlir @@ -0,0 +1,283 @@ +// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_to_string - | FileCheck %s + +func @main(tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>) -> tensor<4 x f32> { +// CHECK: { +// CHECK-NEXT: version: 3, +// CHECK-NEXT: operator_codes: [ { +// CHECK-NEXT: builtin_code: LSTM +// CHECK-NEXT: } ], +// CHECK-NEXT: subgraphs: [ { +// CHECK-NEXT: tensors: [ { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 1, +// CHECK-NEXT: name: "tfl.pseudo_input", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 2, +// CHECK-NEXT: name: "tfl.pseudo_input1", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 3, +// CHECK-NEXT: name: "tfl.pseudo_input2", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 4, +// CHECK-NEXT: name: "tfl.pseudo_input3", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 5, +// CHECK-NEXT: name: "tfl.pseudo_input4", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 6, +// CHECK-NEXT: name: "tfl.pseudo_input5", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 7, +// CHECK-NEXT: name: "tfl.pseudo_input6", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 8, +// CHECK-NEXT: name: "tfl.pseudo_input7", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 9, +// 
CHECK-NEXT: name: "tfl.pseudo_input8", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 10, +// CHECK-NEXT: name: "tfl.pseudo_input9", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 11, +// CHECK-NEXT: name: "tfl.pseudo_input10", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 12, +// CHECK-NEXT: name: "tfl.pseudo_input11", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 13, +// CHECK-NEXT: name: "tfl.pseudo_input12", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 14, +// CHECK-NEXT: name: "tfl.pseudo_input13", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 15, +// CHECK-NEXT: name: "tfl.pseudo_input14", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 16, +// CHECK-NEXT: name: "tfl.pseudo_input15", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 17, +// CHECK-NEXT: name: "tfl.pseudo_input16", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 18, +// CHECK-NEXT: name: "tfl.pseudo_input17", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: name: "Const", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: }, +// CHECK-NEXT: is_variable: true +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: name: "Const1", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: }, +// CHECK-NEXT: is_variable: true +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 21, +// CHECK-NEXT: name: "tfl.pseudo_input18", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 22, +// CHECK-NEXT: name: "tfl.pseudo_input19", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 23, +// CHECK-NEXT: name: "tfl.pseudo_input20", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 24, +// CHECK-NEXT: name: "tfl.pseudo_input21", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 25, +// CHECK-NEXT: name: "tfl.lstm", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: } ], +// CHECK-NEXT: inputs: [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 20, 21, 22, 23 ], +// CHECK-NEXT: outputs: [ 24 ], +// CHECK-NEXT: operators: [ { +// CHECK-NEXT: inputs: [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23 ], +// CHECK-NEXT: outputs: [ 24 ], +// CHECK-NEXT: builtin_options_type: LSTMOptions, +// CHECK-NEXT: builtin_options: { +// 
CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: } ], +// CHECK-NEXT: name: "main" +// CHECK-NEXT: } ], +// CHECK-NEXT: description: "MLIR Converted.", +// CHECK-NEXT: buffers: [ { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-NEXT: data: [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] +// CHECK-NEXT: }, { +// CHECK-NEXT: data: [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: } ] +// CHECK-NEXT: } +// CHECK-EMPTY: + + +^bb0(%arg0: tensor<4 x f32>, %arg1: tensor<4 x f32>, %arg2: tensor<4 x f32>, %arg3: tensor<4 x f32>, %arg4: tensor<4 x f32>, %arg5: tensor<4 x f32>, %arg6: tensor<4 x f32>, %arg7: tensor<4 x f32>, %arg8: tensor<4 x f32>, %arg9: tensor<4 x f32>, %arg10: tensor<4 x f32>, %arg11: tensor<4 x f32>, %arg12: tensor<4 x f32>, %arg13: tensor<4 x f32>, %arg14: tensor<4 x f32>, %arg15: tensor<4 x f32>, %arg16: tensor<4 x f32>, %arg17: tensor<4 x f32>, %arg20: tensor<4 x f32>, %arg21: tensor<4 x f32>, %arg22: tensor<4 x f32>, %arg23: tensor<4 x f32>): + %0 = "tfl.pseudo_input" (%arg0) : (tensor<4 x f32>) -> tensor<4 x f32> + %1 = "tfl.pseudo_input" (%arg1) : (tensor<4 x f32>) -> tensor<4 x f32> + %2 = "tfl.pseudo_input" (%arg2) : (tensor<4 x f32>) -> tensor<4 x f32> + %3 = "tfl.pseudo_input" (%arg3) : (tensor<4 x f32>) -> tensor<4 x f32> + %4 = "tfl.pseudo_input" (%arg4) : (tensor<4 x f32>) -> tensor<4 x f32> + %5 = "tfl.pseudo_input" (%arg5) : (tensor<4 x f32>) -> tensor<4 x f32> + %6 = "tfl.pseudo_input" (%arg6) : (tensor<4 x f32>) -> tensor<4 x f32> + %7 = "tfl.pseudo_input" (%arg7) : (tensor<4 x f32>) -> tensor<4 x f32> + %8 = "tfl.pseudo_input" (%arg8) : (tensor<4 x f32>) -> tensor<4 x f32> + %9 = "tfl.pseudo_input" (%arg9) : (tensor<4 x f32>) -> tensor<4 x f32> + %10 = "tfl.pseudo_input" (%arg10) : (tensor<4 x f32>) -> tensor<4 x f32> + %11 = "tfl.pseudo_input" (%arg11) : (tensor<4 x f32>) -> tensor<4 x f32> + %12 = "tfl.pseudo_input" (%arg12) : (tensor<4 x f32>) -> tensor<4 x f32> + %13 = "tfl.pseudo_input" (%arg13) : (tensor<4 x f32>) -> tensor<4 x f32> + %14 = "tfl.pseudo_input" (%arg14) : (tensor<4 x f32>) -> tensor<4 x f32> + %15 = "tfl.pseudo_input" (%arg15) : (tensor<4 x f32>) -> tensor<4 x f32> + %16 = "tfl.pseudo_input" (%arg16) : (tensor<4 x f32>) -> tensor<4 x f32> + %17 = "tfl.pseudo_input" (%arg17) : (tensor<4 x f32>) -> tensor<4 x f32> + %18 = "tfl.pseudo_const" () {value = dense<0.0> : tensor<4xf32>} : () -> tensor<4xf32> loc("Const") + %19 = "tfl.pseudo_const" () {value = dense<0.0> : tensor<4xf32>} : () -> tensor<4xf32> loc("Const") + %20 = "tfl.pseudo_input" (%arg20) : (tensor<4 x f32>) -> tensor<4 x f32> + %21 = "tfl.pseudo_input" (%arg21) : (tensor<4 x f32>) -> tensor<4 x f32> + %22 
= "tfl.pseudo_input" (%arg22) : (tensor<4 x f32>) -> tensor<4 x f32> + %23 = "tfl.pseudo_input" (%arg23) : (tensor<4 x f32>) -> tensor<4 x f32> + %24 = "tfl.lstm"(%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, %16, %17, %18, %19, %20, %21, %22, %23) {fused_activation_function = "NONE", kernel_type = "FULL"} : (tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + return %24 : tensor<4xf32> +} \ No newline at end of file diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/simple.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/simple.mlir index eb9119d1c46..43ee98934e0 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/simple.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/simple.mlir @@ -1,4 +1,5 @@ // RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_to_string - | FileCheck --dump-input-on-failure %s +// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - -strip-debug-info | flatbuffer_to_string - | FileCheck --dump-input-on-failure %s --check-prefix=STRIP func @main(tensor<3x2xi32>) -> tensor<3x2xi32> attributes {tf.entry_function = {inputs = "input", outputs = "SameNameAsOutput"}} { @@ -16,6 +17,8 @@ func @main(tensor<3x2xi32>) -> tensor<3x2xi32> // CHECK-NEXT: type: INT32, // CHECK-NEXT: buffer: 1, // CHECK-NEXT: name: "input", +// STRIP: buffer: 1, +// STRIP-NEXT: name: "input", // CHECK-NEXT: quantization: { // CHECK-EMPTY: // CHECK-NEXT: } @@ -24,6 +27,8 @@ func @main(tensor<3x2xi32>) -> tensor<3x2xi32> // CHECK-NEXT: type: INT32, // CHECK-NEXT: buffer: 2, // CHECK-NEXT: name: "Const", +// STRIP: buffer: 2, +// STRIP-NEXT: name: "0", // CHECK-NEXT: quantization: { // CHECK-EMPTY: // CHECK-NEXT: } @@ -32,6 +37,8 @@ func @main(tensor<3x2xi32>) -> tensor<3x2xi32> // CHECK-NEXT: type: INT32, // CHECK-NEXT: buffer: 3, // CHECK-NEXT: name: "sub", +// STRIP: buffer: 3, +// STRIP-NEXT: name: "1", // CHECK-NEXT: quantization: { // CHECK-EMPTY: // CHECK-NEXT: } @@ -40,6 +47,8 @@ func @main(tensor<3x2xi32>) -> tensor<3x2xi32> // CHECK-NEXT: type: INT32, // CHECK-NEXT: buffer: 4, // CHECK-NEXT: name: "SameNameAsOutput1", +// STRIP: buffer: 4, +// STRIP-NEXT: name: "2", // CHECK-NEXT: quantization: { // CHECK-EMPTY: // CHECK-NEXT: } @@ -48,6 +57,8 @@ func @main(tensor<3x2xi32>) -> tensor<3x2xi32> // CHECK-NEXT: type: INT32, // CHECK-NEXT: buffer: 5, // CHECK-NEXT: name: "SameNameAsOutput", +// STRIP: buffer: 5, +// STRIP-NEXT: name: "SameNameAsOutput", // CHECK-NEXT: quantization: { // CHECK-EMPTY: // CHECK-NEXT: } diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/svdf.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/svdf.mlir new file mode 100644 index 00000000000..3ab36f554ae --- /dev/null +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/svdf.mlir @@ -0,0 +1,93 @@ +// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_to_string - | FileCheck %s + +func @main(tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>) -> tensor<4 x f32> { +// CHECK: { +// CHECK-NEXT: version: 3, +// CHECK-NEXT: operator_codes: [ { +// CHECK-NEXT: builtin_code: SVDF +// CHECK-NEXT: } ], +// CHECK-NEXT: subgraphs: [ { +// 
CHECK-NEXT: tensors: [ { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 1, +// CHECK-NEXT: name: "tfl.pseudo_input", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 2, +// CHECK-NEXT: name: "tfl.pseudo_input1", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 3, +// CHECK-NEXT: name: "tfl.pseudo_input2", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 4, +// CHECK-NEXT: name: "tfl.pseudo_input3", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: name: "Const", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: }, +// CHECK-NEXT: is_variable: true +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 6, +// CHECK-NEXT: name: "tfl.svdf", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: } ], +// CHECK-NEXT: inputs: [ 0, 1, 2, 3 ], +// CHECK-NEXT: outputs: [ 5 ], +// CHECK-NEXT: operators: [ { +// CHECK-NEXT: inputs: [ 0, 1, 2, 3, 4 ], +// CHECK-NEXT: outputs: [ 5 ], +// CHECK-NEXT: builtin_options_type: SVDFOptions, +// CHECK-NEXT: builtin_options: { +// CHECK-NEXT: rank: 2, +// CHECK-NEXT: fused_activation_function: RELU +// CHECK-NEXT: } +// CHECK-NEXT: } ], +// CHECK-NEXT: name: "main" +// CHECK-NEXT: } ], +// CHECK-NEXT: description: "MLIR Converted.", +// CHECK-NEXT: buffers: [ { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-NEXT: data: [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: } ] +// CHECK-NEXT: } +// CHECK-EMPTY: + +^bb0(%arg0: tensor<4 x f32>, %arg1: tensor<4 x f32>, %arg2: tensor<4 x f32>, %arg3: tensor<4 x f32>): + %0 = "tfl.pseudo_input" (%arg0) : (tensor<4 x f32>) -> tensor<4 x f32> + %1 = "tfl.pseudo_input" (%arg1) : (tensor<4 x f32>) -> tensor<4 x f32> + %2 = "tfl.pseudo_input" (%arg2) : (tensor<4 x f32>) -> tensor<4 x f32> + %3 = "tfl.pseudo_input" (%arg3) : (tensor<4 x f32>) -> tensor<4 x f32> + %4 = "tfl.pseudo_const" () {value = dense<0.0> : tensor<4xf32>} : () -> tensor<4xf32> loc("Const") + %5 = "tfl.svdf"(%0, %1, %2, %3, %4) {fused_activation_function = "RELU", rank = 2 : i32} : (tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + return %5 : tensor<4xf32> +} \ No newline at end of file diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/unidirectional_sequence_lstm.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/unidirectional_sequence_lstm.mlir new file mode 100644 index 00000000000..e2ffb24a6b3 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/unidirectional_sequence_lstm.mlir @@ -0,0 +1,282 @@ +// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_to_string - | FileCheck %s + +func @main(tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 
x f32>, tensor<4 x f32>, tensor<4 x f32>) -> tensor<4 x f32> { +// CHECK: { +// CHECK-NEXT: version: 3, +// CHECK-NEXT: operator_codes: [ { +// CHECK-NEXT: builtin_code: UNIDIRECTIONAL_SEQUENCE_LSTM +// CHECK-NEXT: } ], +// CHECK-NEXT: subgraphs: [ { +// CHECK-NEXT: tensors: [ { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 1, +// CHECK-NEXT: name: "tfl.pseudo_input", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 2, +// CHECK-NEXT: name: "tfl.pseudo_input1", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 3, +// CHECK-NEXT: name: "tfl.pseudo_input2", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 4, +// CHECK-NEXT: name: "tfl.pseudo_input3", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 5, +// CHECK-NEXT: name: "tfl.pseudo_input4", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 6, +// CHECK-NEXT: name: "tfl.pseudo_input5", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 7, +// CHECK-NEXT: name: "tfl.pseudo_input6", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 8, +// CHECK-NEXT: name: "tfl.pseudo_input7", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 9, +// CHECK-NEXT: name: "tfl.pseudo_input8", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 10, +// CHECK-NEXT: name: "tfl.pseudo_input9", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 11, +// CHECK-NEXT: name: "tfl.pseudo_input10", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 12, +// CHECK-NEXT: name: "tfl.pseudo_input11", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 13, +// CHECK-NEXT: name: "tfl.pseudo_input12", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 14, +// CHECK-NEXT: name: "tfl.pseudo_input13", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 15, +// CHECK-NEXT: name: "tfl.pseudo_input14", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 16, +// CHECK-NEXT: name: "tfl.pseudo_input15", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 17, +// CHECK-NEXT: name: "tfl.pseudo_input16", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 18, +// 
CHECK-NEXT: name: "tfl.pseudo_input17", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: name: "Const", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: }, +// CHECK-NEXT: is_variable: true +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: name: "Const1", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: }, +// CHECK-NEXT: is_variable: true +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 21, +// CHECK-NEXT: name: "tfl.pseudo_input18", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 22, +// CHECK-NEXT: name: "tfl.pseudo_input19", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 23, +// CHECK-NEXT: name: "tfl.pseudo_input20", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 24, +// CHECK-NEXT: name: "tfl.pseudo_input21", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 25, +// CHECK-NEXT: name: "tfl.unidirectional_sequence_lstm", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: } ], +// CHECK-NEXT: inputs: [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 20, 21, 22, 23 ], +// CHECK-NEXT: outputs: [ 24 ], +// CHECK-NEXT: operators: [ { +// CHECK-NEXT: inputs: [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23 ], +// CHECK-NEXT: outputs: [ 24 ], +// CHECK-NEXT: builtin_options_type: UnidirectionalSequenceLSTMOptions, +// CHECK-NEXT: builtin_options: { +// CHECK-NEXT: time_major: true +// CHECK-NEXT: } +// CHECK-NEXT: } ], +// CHECK-NEXT: name: "main" +// CHECK-NEXT: } ], +// CHECK-NEXT: description: "MLIR Converted.", +// CHECK-NEXT: buffers: [ { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-NEXT: data: [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] +// CHECK-NEXT: }, { +// CHECK-NEXT: data: [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: } ] +// CHECK-NEXT: } +// CHECK-EMPTY: + +^bb0(%arg0: tensor<4 x f32>, %arg1: tensor<4 x f32>, %arg2: tensor<4 x f32>, %arg3: tensor<4 x f32>, %arg4: tensor<4 x f32>, %arg5: tensor<4 x f32>, %arg6: tensor<4 x f32>, %arg7: tensor<4 x f32>, %arg8: tensor<4 x f32>, %arg9: tensor<4 x f32>, %arg10: tensor<4 x f32>, %arg11: tensor<4 x f32>, %arg12: tensor<4 x f32>, %arg13: 
tensor<4 x f32>, %arg14: tensor<4 x f32>, %arg15: tensor<4 x f32>, %arg16: tensor<4 x f32>, %arg17: tensor<4 x f32>, %arg20: tensor<4 x f32>, %arg21: tensor<4 x f32>, %arg22: tensor<4 x f32>, %arg23: tensor<4 x f32>): + %0 = "tfl.pseudo_input" (%arg0) : (tensor<4 x f32>) -> tensor<4 x f32> + %1 = "tfl.pseudo_input" (%arg1) : (tensor<4 x f32>) -> tensor<4 x f32> + %2 = "tfl.pseudo_input" (%arg2) : (tensor<4 x f32>) -> tensor<4 x f32> + %3 = "tfl.pseudo_input" (%arg3) : (tensor<4 x f32>) -> tensor<4 x f32> + %4 = "tfl.pseudo_input" (%arg4) : (tensor<4 x f32>) -> tensor<4 x f32> + %5 = "tfl.pseudo_input" (%arg5) : (tensor<4 x f32>) -> tensor<4 x f32> + %6 = "tfl.pseudo_input" (%arg6) : (tensor<4 x f32>) -> tensor<4 x f32> + %7 = "tfl.pseudo_input" (%arg7) : (tensor<4 x f32>) -> tensor<4 x f32> + %8 = "tfl.pseudo_input" (%arg8) : (tensor<4 x f32>) -> tensor<4 x f32> + %9 = "tfl.pseudo_input" (%arg9) : (tensor<4 x f32>) -> tensor<4 x f32> + %10 = "tfl.pseudo_input" (%arg10) : (tensor<4 x f32>) -> tensor<4 x f32> + %11 = "tfl.pseudo_input" (%arg11) : (tensor<4 x f32>) -> tensor<4 x f32> + %12 = "tfl.pseudo_input" (%arg12) : (tensor<4 x f32>) -> tensor<4 x f32> + %13 = "tfl.pseudo_input" (%arg13) : (tensor<4 x f32>) -> tensor<4 x f32> + %14 = "tfl.pseudo_input" (%arg14) : (tensor<4 x f32>) -> tensor<4 x f32> + %15 = "tfl.pseudo_input" (%arg15) : (tensor<4 x f32>) -> tensor<4 x f32> + %16 = "tfl.pseudo_input" (%arg16) : (tensor<4 x f32>) -> tensor<4 x f32> + %17 = "tfl.pseudo_input" (%arg17) : (tensor<4 x f32>) -> tensor<4 x f32> + %18 = "tfl.pseudo_const" () {value = dense<0.0> : tensor<4xf32>} : () -> tensor<4xf32> loc("Const") + %19 = "tfl.pseudo_const" () {value = dense<0.0> : tensor<4xf32>} : () -> tensor<4xf32> loc("Const") + %20 = "tfl.pseudo_input" (%arg20) : (tensor<4 x f32>) -> tensor<4 x f32> + %21 = "tfl.pseudo_input" (%arg21) : (tensor<4 x f32>) -> tensor<4 x f32> + %22 = "tfl.pseudo_input" (%arg22) : (tensor<4 x f32>) -> tensor<4 x f32> + %23 = "tfl.pseudo_input" (%arg23) : (tensor<4 x f32>) -> tensor<4 x f32> + %24 = "tfl.unidirectional_sequence_lstm"(%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, %16, %17, %18, %19, %20, %21, %22, %23) {fused_activation_function = "NONE", time_major = true} : (tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + return %24 : tensor<4xf32> +} \ No newline at end of file diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/unidirectional_sequence_rnn.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/unidirectional_sequence_rnn.mlir new file mode 100644 index 00000000000..3d91f66501d --- /dev/null +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/unidirectional_sequence_rnn.mlir @@ -0,0 +1,93 @@ +// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_to_string - | FileCheck %s + +func @main(tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>) -> tensor<4 x f32> { +// CHECK: { +// CHECK-NEXT: version: 3, +// CHECK-NEXT: operator_codes: [ { +// CHECK-NEXT: builtin_code: UNIDIRECTIONAL_SEQUENCE_RNN +// CHECK-NEXT: } ], +// CHECK-NEXT: subgraphs: [ { +// CHECK-NEXT: tensors: [ { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 1, +// 
CHECK-NEXT: name: "tfl.pseudo_input", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 2, +// CHECK-NEXT: name: "tfl.pseudo_input1", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 3, +// CHECK-NEXT: name: "tfl.pseudo_input2", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 4, +// CHECK-NEXT: name: "tfl.pseudo_input3", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: name: "Const", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: }, +// CHECK-NEXT: is_variable: true +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 6, +// CHECK-NEXT: name: "tfl.unidirectional_sequence_rnn", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: } ], +// CHECK-NEXT: inputs: [ 0, 1, 2, 3 ], +// CHECK-NEXT: outputs: [ 5 ], +// CHECK-NEXT: operators: [ { +// CHECK-NEXT: inputs: [ 0, 1, 2, 3, 4 ], +// CHECK-NEXT: outputs: [ 5 ], +// CHECK-NEXT: builtin_options_type: SequenceRNNOptions, +// CHECK-NEXT: builtin_options: { +// CHECK-NEXT: time_major: true, +// CHECK-NEXT: fused_activation_function: TANH +// CHECK-NEXT: } +// CHECK-NEXT: } ], +// CHECK-NEXT: name: "main" +// CHECK-NEXT: } ], +// CHECK-NEXT: description: "MLIR Converted.", +// CHECK-NEXT: buffers: [ { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-NEXT: data: [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: } ] +// CHECK-NEXT: } +// CHECK-EMPTY: + +^bb0(%arg0: tensor<4 x f32>, %arg1: tensor<4 x f32>, %arg2: tensor<4 x f32>, %arg3: tensor<4 x f32>): + %0 = "tfl.pseudo_input" (%arg0) : (tensor<4 x f32>) -> tensor<4 x f32> + %1 = "tfl.pseudo_input" (%arg1) : (tensor<4 x f32>) -> tensor<4 x f32> + %2 = "tfl.pseudo_input" (%arg2) : (tensor<4 x f32>) -> tensor<4 x f32> + %3 = "tfl.pseudo_input" (%arg3) : (tensor<4 x f32>) -> tensor<4 x f32> + %4 = "tfl.pseudo_const" () {value = dense<0.0> : tensor<4xf32>} : () -> tensor<4xf32> loc("Const") + %5 = "tfl.unidirectional_sequence_rnn"(%0, %1, %2, %3, %4) {fused_activation_function = "TANH", time_major = true} : (tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + return %5 : tensor<4xf32> +} \ No newline at end of file diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/unknown-op.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/unknown-op.mlir index 14f8174e9bf..eb20f3759dd 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/unknown-op.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/unknown-op.mlir @@ -1,4 +1,4 @@ -// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - 2>&1 | FileCheck %s; test ${PIPESTATUS[1]} -eq 0 +// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - 2>&1 | FileCheck %s; test ${PIPESTATUS[0]} -ne 0 func @main(tensor<3x2xi32>) -> tensor<3x2xi32> { ^bb0(%arg0: tensor<3x2xi32>): diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/while_op.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/while_op.mlir index 
fd403aa72c5..bf76f4feae6 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/while_op.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/while_op.mlir @@ -3,8 +3,7 @@ // CHECK: { // CHECK-NEXT: version: 3, // CHECK-NEXT: operator_codes: [ { -// CHECK-NEXT: builtin_code: CUSTOM, -// CHECK-NEXT: custom_code: "Experimental_While" +// CHECK-NEXT: builtin_code: WHILE // CHECK-NEXT: }, { // CHECK-NEXT: builtin_code: GREATER // CHECK-NEXT: }, { @@ -49,8 +48,12 @@ // CHECK-NEXT: operators: [ { // CHECK-NEXT: inputs: [ 0, 1 ], // CHECK-NEXT: outputs: [ 2, 3 ], -// CHECK-NEXT: custom_options: [ 99, 111, 110, 100, 95, 115, 117, 98, 103, 114, 97, 112, 104, 95, 105, 110, 100, 101, 120, 0, 98, 111, 100, 121, 95, 115, 117, 98, 103, 114, 97, 112, 104, 95, 105, 110, 100, 101, 120, 0, 2, 21, 42, 2, 1, 2, 2, 1, 4, 4, 4, 36, 1 ] -// CHECK-NEXT: } ] +// CHECK-NEXT: builtin_options_type: WhileOptions, +// CHECK-NEXT: builtin_options: { +// CHECK-NEXT: cond_subgraph_index: 1, +// CHECK-NEXT: body_subgraph_index: 2 +// CHECK-NEXT: } +// CHECK-NEXT: } ], // CHECK-NEXT: name: "main" // CHECK-NEXT: }, { // CHECK-NEXT: tensors: [ { @@ -91,7 +94,7 @@ // CHECK-NEXT: opcode_index: 1, // CHECK-NEXT: inputs: [ 0, 2 ], // CHECK-NEXT: outputs: [ 3 ] -// CHECK-NEXT: } ] +// CHECK-NEXT: } ], // CHECK-NEXT: name: "cond" // CHECK-NEXT: }, { // CHECK-NEXT: tensors: [ { @@ -151,7 +154,7 @@ // CHECK-NEXT: builtin_options: { // CHECK-EMPTY: // CHECK-NEXT: } -// CHECK-NEXT: } ] +// CHECK-NEXT: } ], // CHECK-NEXT: name: "body" // CHECK-NEXT: } ], // CHECK-NEXT: description: "MLIR Converted.", @@ -192,7 +195,7 @@ func @main(%arg0: tensor, %arg1: tensor<1xf32>) -> tensor<1xf32> { // While %0 is greater than zero, element wise add %1 with itself. %2:2 = "tf.While"(%0, %1) { - cond = @cond, body = @body + cond = @cond, body = @body, is_stateless = false } : (tensor, tensor<1xf32>) -> (tensor, tensor<1xf32>) return %2#1 : tensor<1xf32> } diff --git a/tensorflow/compiler/mlir/lite/tests/ops.mlir b/tensorflow/compiler/mlir/lite/tests/ops.mlir index aaa560c0fd6..fe6dc486822 100644 --- a/tensorflow/compiler/mlir/lite/tests/ops.mlir +++ b/tensorflow/compiler/mlir/lite/tests/ops.mlir @@ -80,7 +80,6 @@ func @testGatherUnsupportedRank(%arg0 : tensor, %arg1 : tensor<1xi32>) -> t return %0 : tensor } - // ----- // CHECK-LABEL: testAbs @@ -155,6 +154,26 @@ func @testSinWithWrongInputType(tensor) -> tensor { // ----- +// test invalid Sqrt input +func @testSqrtWithWrongInputType(tensor) -> tensor { +^bb0(%arg0: tensor): + // expected-error @+1 {{tfl.sqrt' op operand #0 must be tensor of floating-point values}} + %0 = "tfl.sqrt"(%arg0): (tensor) -> tensor + return %0#0 : tensor +} + +// ----- + +// test invalid Square input +func @testSquareWithWrongInputType(tensor) -> tensor { +^bb0(%arg0: tensor): + // expected-error @+1 {{tfl.square' op operand #0 must be tensor of floating-point or QI8 type or QUI8 type values}} + %0 = "tfl.square"(%arg0): (tensor) -> tensor + return %0#0 : tensor +} + +// ----- + // CHECK-LABEL: testSqrt func @testSqrt(tensor) -> tensor { ^bb0(%arg0: tensor): @@ -171,6 +190,18 @@ func @testSquare(tensor) -> tensor { return %0 : tensor } +func @testQuantizedSquare(tensor>) -> tensor> { +^bb0(%arg0: tensor>): + %0 = "tfl.square"(%arg0): (tensor>) -> tensor> + return %0 : tensor> +} + +func @testQuantizedResizeNearestNeighbor(tensor>, tensor) -> tensor> { +^bb0(%arg0: tensor>, %arg1: tensor): + %0 = "tfl.resize_nearest_neighbor"(%arg0, %arg1) { align_corners = false } : (tensor>, tensor) -> tensor> + return 
%0 : tensor> +} + // CHECK-LABEL: testTanh func @testTanh(tensor) -> tensor { ^bb0(%arg0: tensor): @@ -179,6 +210,18 @@ func @testTanh(tensor) -> tensor { return %0 : tensor } +// CHECK-LABEL: testTanhWithQI8 +func @testTanhWithQI8(%arg0: tensor>) -> tensor> { + %0 = "tfl.tanh"(%arg0): (tensor>) -> tensor> + return %0 : tensor> +} + +// CHECK-LABEL: testTanhWithQUI8 +func @testTanhWithQUI8(%arg0: tensor>) -> tensor> { + %0 = "tfl.tanh"(%arg0): (tensor>) -> tensor> + return %0 : tensor> +} + // CHECK-LABEL: testZerosLike func @testZerosLike(tensor) -> tensor { ^bb0(%arg0: tensor): @@ -287,11 +330,9 @@ func @testFloorDivF32(%arg0: tensor<2 x f32>, %arg1: tensor<2 x i32>) -> tensor< // ----- // CHECK-LABEL: testFloorMod -func @testFloorMod(tensor, tensor) -> tensor { -^bb0(%arg0: tensor, %arg1: tensor): - // CHECK: tfl.floor_mod %arg0, %arg1 - %0 = tfl.floor_mod %arg0, %arg1 : tensor - return %0#0 : tensor +func @testFloorMod(%arg0: tensor, %arg1: tensor) -> tensor { + %0 = "tfl.floor_mod"(%arg0, %arg1) : (tensor, tensor) -> tensor + return %0 : tensor } // CHECK-LABEL: testPow @@ -310,6 +351,13 @@ func @testConv2D(tensor<256x32x32x3xf32>, tensor<3x3x3x16xf32>, tensor<16xf32>) return %0 : tensor<256x30x30x16xf32> } + +func @testConv2DNoBias(%arg0: tensor<256x32x32x3xf32>, %arg1: tensor<3x3x3x16xf32>, %arg2: none) -> tensor<256x30x30x16xf32> { + // CHECK: "tfl.conv_2d"(%arg0, %arg1, %arg2) + %0 = "tfl.conv_2d"(%arg0, %arg1, %arg2) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32, fused_activation_function = "RELU6"} : (tensor<256x32x32x3xf32>, tensor<3x3x3x16xf32>, none) -> tensor<256x30x30x16xf32> + return %0 : tensor<256x30x30x16xf32> +} + // CHECK-LABEL: testFakeQuant func @testFakeQuant(tensor, f32, f32) -> tensor { ^bb0(%arg0: tensor, %arg1: f32, %arg2: f32): @@ -489,13 +537,22 @@ func @testLogistic(tensor<1x2x3x4x5xbf16>) -> tensor<1x2x3x4x5xbf16> { // test invalid Logistic input func @testLogisticWithWrongInputType(tensor) -> tensor { ^bb0(%arg0: tensor): - // expected-error @+1 {{tfl.logistic' op operand #0 must be tensor of floating-point values}} + // expected-error @+1 {{tfl.logistic' op operand #0 must be tensor of floating-point or QI8 type or QUI8 type values}} %0 = "tfl.logistic"(%arg0): (tensor) -> tensor return %0#0 : tensor } // ----- +// CHECK-LABEL: testUnidirectionalSequenceRnn +func @testUnidirectionalSequenceRnn(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor) -> tensor { + // CHECK: "tfl.unidirectional_sequence_rnn"(%arg0, %arg1, %arg2, %arg3, %arg4) {fused_activation_function = "NONE", time_major = false} : (tensor, tensor, tensor, tensor, tensor) -> tensor + %0 = "tfl.unidirectional_sequence_rnn"(%arg0, %arg1, %arg2, %arg3, %arg4) {fused_activation_function = "NONE", time_major = false} : (tensor, tensor, tensor, tensor, tensor) -> tensor + return %0 : tensor +} + +// ----- + // CHECK-LABEL: testUnidirectionalSequenceLstm func @testUnidirectionalSequenceLstm(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor, %arg6: tensor, %arg7: tensor, %arg8: tensor, %arg9: tensor, %arg10: tensor, %arg11: tensor, %arg12: tensor, %arg13: tensor, %arg14: tensor, %arg15: tensor, %arg16: tensor, %arg17: tensor, %arg18: tensor, %arg19: tensor, %arg20: tensor, %arg21: tensor, %arg22: tensor, %arg23: tensor) -> tensor { // CHECK: "tfl.unidirectional_sequence_lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, 
%arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) {fused_activation_function = "NONE", time_major = false} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor @@ -768,6 +825,22 @@ func @pack(%arg0: tensor<2xi32>, %arg1: tensor<2xi32>) -> tensor<2x2xi32> { // ----- +func @packInputRank(%arg0: tensor<1x4xi32>, %arg1: tensor<1x4xi32>) -> tensor<1x4x2xi32> { + // CHECK: "tfl.pack"(%arg0, %arg1) {axis = 2 : i32, values_count = 2 : i32} + %0 = "tfl.pack"(%arg0, %arg1) {axis = 2 : i32, values_count = 2 : i32} : (tensor<1x4xi32>, tensor<1x4xi32>) -> tensor<1x4x2xi32> + return %0 : tensor<1x4x2xi32> +} + +// ----- + +func @packNegInputRank(%arg0: tensor<1x4xi32>, %arg1: tensor<1x4xi32>) -> tensor<2x1x4xi32> { + // CHECK: "tfl.pack"(%arg0, %arg1) {axis = -2 : i32, values_count = 2 : i32} + %0 = "tfl.pack"(%arg0, %arg1) {axis = -2 : i32, values_count = 2 : i32} : (tensor<1x4xi32>, tensor<1x4xi32>) -> tensor<2x1x4xi32> + return %0 : tensor<2x1x4xi32> +} + +// ----- + func @pack(%arg0: tensor<2xi32>, %arg1: tensor<2xi32>) -> tensor<2x2xi32> { // expected-error @+1 {{input count should match 'values_count' attribute}} %0 = "tfl.pack"(%arg0, %arg1) {axis = 0 : i32, values_count = 1 : i32} : (tensor<2xi32>, tensor<2xi32>) -> tensor<2x2xi32> @@ -776,6 +849,22 @@ func @pack(%arg0: tensor<2xi32>, %arg1: tensor<2xi32>) -> tensor<2x2xi32> { // ----- +func @pack(%arg0: tensor<1xi32>, %arg1: tensor<2xi32>) -> tensor<2x2xi32> { + // expected-error @+1 {{operands should be of the same type}} + %0 = "tfl.pack"(%arg0, %arg1) {axis = 0 : i32, values_count = 2 : i32} : (tensor<1xi32>, tensor<2xi32>) -> tensor<2x2xi32> + return %0 : tensor<2x2xi32> +} + +// ----- + +func @pack(%arg0: tensor<2xi32>, %arg1: tensor<2xi32>) -> tensor<2x2xi32> { + // expected-error @+1 {{op attribute 'axis' is out of bounds, got 3}} + %0 = "tfl.pack"(%arg0, %arg1) {axis = 3 : i32, values_count = 2 : i32} : (tensor<2xi32>, tensor<2xi32>) -> tensor<2x2xi32> + return %0 : tensor<2x2xi32> +} + +// ----- + func @unpack(%arg0: tensor<2x3xi32>) -> tensor<2xi32> { // CHECK: "tfl.unpack"(%arg0) {axis = 1 : i32, num = 3 : i32} %0:3 = "tfl.unpack"(%arg0) {axis = 1 : i32, num = 3 : i32} : (tensor<2x3xi32>) -> (tensor<2xi32>, tensor<2xi32>, tensor<2xi32>) @@ -785,6 +874,14 @@ func @unpack(%arg0: tensor<2x3xi32>) -> tensor<2xi32> { // ----- +func @unpackQuantized(%arg0: tensor<2x3x!quant.uniform>) -> tensor<2x!quant.uniform> { + %0:3 = "tfl.unpack"(%arg0) {axis = 1 : i32, num = 3 : i32} : (tensor<2x3x!quant.uniform>) -> (tensor<2x!quant.uniform>, tensor<2x!quant.uniform>, tensor<2x!quant.uniform>) + return %0#0 : tensor<2x!quant.uniform> + +} + +// ----- + func @unpack(%arg0: tensor<2x3xi32>) -> tensor<2xi32> { // expected-error @+1 {{output count should match 'num' attribute}} %0:3 = "tfl.unpack"(%arg0) {axis = 1 : i32, num = 2 : i32} : (tensor<2x3xi32>) -> (tensor<2xi32>, tensor<2xi32>, tensor<2xi32>) @@ -879,7 +976,7 @@ func @testResizeBilinear(%arg0 : tensor<1x100x100x3xf32>, %arg1 : tensor<4xi32>) // ----- func @testResizeBilinearInvalidOutputType(%arg0 : tensor<1x100x100x3xf32>, %arg1 : tensor<4xi32>) -> tensor { - // expected-error @+1 {{'tfl.resize_bilinear' op result #0 must be tensor of 32-bit float values}} + // expected-error @+1 {{'tfl.resize_bilinear' op result #0 must be tensor of 32-bit float or QI8 type or QUI8 type values}} %0 = 
"tfl.resize_bilinear"(%arg0, %arg1) {align_corners = false} : (tensor<1x100x100x3xf32>, tensor<4xi32>) -> tensor return %0 : tensor } @@ -893,6 +990,18 @@ func @testStridedSlice(%arg0: tensor<12x2x2x5xf32>, %arg1: tensor<1xi32>, %arg2: return %0 : tensor<1x2x2x5xf32> } +// CHECK-LABEL: testStridedSliceWithQI8 +func @testStridedSliceWithQI8(%arg0: tensor<12x2x2x5x!quant.uniform>, %arg1: tensor<1xi32>, %arg2: tensor<1xi32>, %arg3: tensor<1xi32>) -> tensor<1x2x2x5x!quant.uniform> { + %0 = "tfl.strided_slice"(%arg0, %arg1, %arg2, %arg3) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32} : (tensor<12x2x2x5x!quant.uniform>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x2x2x5x!quant.uniform> + return %0 : tensor<1x2x2x5x!quant.uniform> +} + +// CHECK-LABEL: testStridedSliceWithQUI8 +func @testStridedSliceWithQUI8(%arg0: tensor<12x2x2x5x!quant.uniform>, %arg1: tensor<1xi32>, %arg2: tensor<1xi32>, %arg3: tensor<1xi32>) -> tensor<1x2x2x5x!quant.uniform> { + %0 = "tfl.strided_slice"(%arg0, %arg1, %arg2, %arg3) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32} : (tensor<12x2x2x5x!quant.uniform>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x2x2x5x!quant.uniform> + return %0 : tensor<1x2x2x5x!quant.uniform> +} + // ----- func @testStridedSliceWithInvalidOutputType(%arg0: tensor<12x2x2x5xf32>, %arg1: tensor<1xi32>, %arg2: tensor<1xi32>, %arg3: tensor<1xi32>) -> tensor<1x2x2x5xi32> { @@ -917,3 +1026,401 @@ func @testOneHotWithInvalidOutputType(%arg0: tensor<3xi32>, %arg1: tensor, %0 = "tfl.one_hot"(%arg0, %arg1, %arg2, %arg3) {axis = -1 : i32} : (tensor<3xi32>, tensor, tensor, tensor) -> tensor<*xi8> return %0 : tensor<*xi8> } + +// ----- + +func @testArgMax(%arg0: tensor<3xi32>, %arg1: tensor) -> tensor { + // CHECK: "tfl.arg_max"(%arg0, %arg1) {output_type = 2 : i32} : (tensor<3xi32>, tensor) -> tensor + %0 = "tfl.arg_max"(%arg0, %arg1) {output_type = 2 : i32} : (tensor<3xi32>, tensor) -> tensor + return %0 : tensor +} + +// ----- + +func @testArgMin(%arg0: tensor<3xi32>, %arg1: tensor) -> tensor { + // CHECK: "tfl.arg_min"(%arg0, %arg1) {output_type = 2 : i32} : (tensor<3xi32>, tensor) -> tensor + %0 = "tfl.arg_min"(%arg0, %arg1) {output_type = 2 : i32} : (tensor<3xi32>, tensor) -> tensor + return %0 : tensor +} + +// ----- + +// CHECK-LABEL: testSpaceToDepth +func @testSpaceToDepthF32(%arg0: tensor<1x2x2x1xf32>) -> tensor<1x1x1x4xf32> { + // CHECK: %[[ARG:.*]]: tensor<1x2x2x1xf32> + // CHECK: "tfl.space_to_depth"(%[[ARG]]) {block_size = 2 : i32} : (tensor<1x2x2x1xf32>) -> tensor<1x1x1x4xf32> + %0 = "tfl.space_to_depth"(%arg0) {block_size = 2: i32} : (tensor<1x2x2x1xf32>) -> tensor<1x1x1x4xf32> + return %0 : tensor<1x1x1x4xf32> +} + +// ----- + +func @testSpaceToDepthInvalidOutputType(%arg0: tensor<1x2x2x1xf32>) -> tensor<1x1x1x4xi32> { + // expected-error @+1 {{'tfl.space_to_depth' op failed to verify that input and output must have same element type}} + %0 = "tfl.space_to_depth"(%arg0) {block_size = 2: i32} : (tensor<1x2x2x1xf32>) -> tensor<1x1x1x4xi32> + return %0 : tensor<1x1x1x4xi32> +} + +// ----- + +func @testRange(%arg0 : tensor, %arg1 : tensor, %arg2 : tensor) -> tensor { + %0 = "tfl.range"(%arg0, %arg1, %arg2) : (tensor, tensor, tensor) -> tensor + return %0 : tensor +} + +// ----- + +func @testRangeNonScalarTensorInput(%arg0 : tensor<1xi32>, %arg1 : tensor, %arg2 : tensor) -> tensor { + // expected-error @+1 {{op failed to verify 
that operand 0 is 0-D}} + %0 = "tfl.range"(%arg0, %arg1, %arg2) : (tensor<1xi32>, tensor, tensor) -> tensor + return %0 : tensor +} + +// ----- + +func @testRangeOutputTypeMismatch(%arg0 : tensor, %arg1 : tensor, %arg2 : tensor) -> tensor { + // expected-error @+1 {{op failed to verify that operands and output must have same element type}} + %0 = "tfl.range"(%arg0, %arg1, %arg2) : (tensor, tensor, tensor) -> tensor + return %0 : tensor +} + +// ----- + +func @transpose(%arg0 : tensor<2x2xi32>, %arg1 : tensor<2xi32>) -> tensor<2x2xi32> { + %0 = "tfl.transpose"(%arg0, %arg1) : (tensor<2x2xi32>, tensor<2xi32>) -> tensor<2x2xi32> + return %0 : tensor<2x2xi32> +} + + +// ----- + +func @transpose_perm_not_i32(%arg0 : tensor<2x2xi32>, %arg1 : tensor<2xf32>) -> tensor<2x2xi32> { + // expected-error @+1 {{op operand #1 must be tensor of 32-bit integer values}} + %0 = "tfl.transpose"(%arg0, %arg1) : (tensor<2x2xi32>, tensor<2xf32>) -> tensor<2x2xi32> + return %0 : tensor<2x2xi32> +} + + +// ----- + +func @transpose_element_type(%arg0 : tensor<2x2xf32>, %arg1 : tensor<2xi32>) -> tensor<2x2xi32> { + // expected-error @+1 {{input and output must have same element type}} + %0 = "tfl.transpose"(%arg0, %arg1) : (tensor<2x2xf32>, tensor<2xi32>) -> tensor<2x2xi32> + return %0 : tensor<2x2xi32> +} + + +// ----- + +func @transpose_1d_perm(%arg0 : tensor<2x2xi32>, %arg1 : tensor<2x2xi32>) -> tensor<2x2xi32> { + // expected-error @+1 {{op failed to verify that operand 1 is 1-D}} + %0 = "tfl.transpose"(%arg0, %arg1) : (tensor<2x2xi32>, tensor<2x2xi32>) -> tensor<2x2xi32> + return %0 : tensor<2x2xi32> +} + +// ----- + +func @anyWithI64Axis(%arg0: tensor<2x2xi1>, %arg1: tensor) -> tensor { + // expected-error @+1 {{tfl.reduce_any' op operand #1 must be tensor of 32-bit integer values}} + %0 = "tfl.reduce_any"(%arg0, %arg1) {keep_dims = false} : (tensor<2x2xi1>, tensor) -> tensor + return %0 : tensor +} + +// ----- + +func @testRoundInvalidInputType(%arg: tensor) -> tensor { + // expected-error @+1 {{'tfl.round' op operand #0 must be tensor of 32-bit float values}} + %0 = "tfl.round"(%arg) : (tensor) -> tensor + return %0 : tensor +} + +// ----- + +func @testSplitWithQuantizedTypes(%arg0 : tensor, %arg1 : tensor<10x!quant.uniform>) -> tensor<10x!quant.uniform> { + %0 = "tfl.split"(%arg0, %arg1) {num_splits = 1 : i32} : (tensor, tensor<10x!quant.uniform>) -> tensor<10x!quant.uniform> + return %0 : tensor<10x!quant.uniform> +} + +// ----- + +func @testSplitVWithQuantizedTypes(%arg0 : tensor<10x!quant.uniform>, %arg1 : tensor, %arg2 : tensor) -> tensor<10x!quant.uniform> { + %0 = "tfl.split_v"(%arg0, %arg1, %arg2) {num_splits = 1 : i32} : (tensor<10x!quant.uniform>, tensor, tensor) -> tensor<10x!quant.uniform> + return %0 : tensor<10x!quant.uniform> +} + +// ----- + +func @whereWithI32Input(%arg0: tensor<3x5xi32>) -> tensor { + // expected-error @+1 {{'tfl.where' op operand #0 must be tensor of 1-bit integer values}} + %0 = "tfl.where"(%arg0) : (tensor<3x5xi32>) -> tensor + return %0 : tensor +} + +// ----- + +func @testMinimumWithQuantizedTypes(%arg0 : tensor<10x!quant.uniform>, %arg1 : tensor<10x!quant.uniform>) -> tensor<10x!quant.uniform> { + %0 = "tfl.minimum"(%arg0, %arg1) : (tensor<10x!quant.uniform>, tensor<10x!quant.uniform>) -> tensor<10x!quant.uniform> + return %0 : tensor<10x!quant.uniform> +} + +// ----- + +func @testMaximumWithQuantizedTypes(%arg0 : tensor<10x!quant.uniform>, %arg1 : tensor<10x!quant.uniform>) -> tensor<10x!quant.uniform> { + %0 = "tfl.maximum"(%arg0, %arg1) : (tensor<10x!quant.uniform>, 
tensor<10x!quant.uniform>) -> tensor<10x!quant.uniform> + return %0 : tensor<10x!quant.uniform> +} + +// ----- + +func @testReluWithQuantizedTypes(%arg0 : tensor<10x!quant.uniform>) -> tensor<10x!quant.uniform> { + %0 = "tfl.relu"(%arg0) : (tensor<10x!quant.uniform>) -> tensor<10x!quant.uniform> + return %0 : tensor<10x!quant.uniform> +} + +// ----- + +func @testRelu6WithQuantizedTypes(%arg0 : tensor<10x!quant.uniform>) -> tensor<10x!quant.uniform> { + %0 = "tfl.relu6"(%arg0) : (tensor<10x!quant.uniform>) -> tensor<10x!quant.uniform> + return %0 : tensor<10x!quant.uniform> +} + +// ----- + +func @testEmbeddingLookup(%arg0 : tensor, %arg1 : tensor) -> tensor { + %0 = "tfl.embedding_lookup"(%arg0, %arg1) : (tensor,tensor) -> tensor + return %0 : tensor +} + +// ----- + +func @testEmbeddingLookupInvalidResultType(%arg0 : tensor, %arg1 : tensor) -> tensor { + // expected-error @+1 {{'tfl.embedding_lookup' op result #0 must be tensor of 32-bit float or 8-bit integer or TFLite uint8 type values}} + %0 = "tfl.embedding_lookup"(%arg0, %arg1) : (tensor,tensor) -> tensor + return %0 : tensor +} + +// ----- + +func @testEmbeddingLookupValueAndResultElementTypeTraitFailed(%arg0 : tensor, %arg1 : tensor) -> tensor { + // expected-error @+1 {{'tfl.embedding_lookup' op failed to verify that value and output must have same element type}} + %0 = "tfl.embedding_lookup"(%arg0, %arg1) : (tensor,tensor) -> tensor + return %0 : tensor +} + +// ----- + +func @testQuantizedLocalResponseNormalization(%arg0 : tensor<1x56x56x192x!quant.uniform>) -> tensor<1x56x56x192x!quant.uniform> { + %0 = "tfl.local_response_normalization"(%arg0) {alpha = 9.99999974E-5 : f32, beta = 5.000000e-01 : f32, bias = 2.000000e+00 : f32, radius = 5 : i32} : (tensor<1x56x56x192x!quant.uniform>) -> tensor<1x56x56x192x!quant.uniform> + return %0 : tensor<1x56x56x192x!quant.uniform> +} + +// ----- + +// CHECK-LABEL: testSvdf +func @testSvdf(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor) -> tensor { + // CHECK: "tfl.svdf"(%arg0, %arg1, %arg2, %arg3, %arg4) {fused_activation_function = "NONE", rank = 2 : i32} : (tensor, tensor, tensor, tensor, tensor) -> tensor + %0 = "tfl.svdf"(%arg0, %arg1, %arg2, %arg3, %arg4) {fused_activation_function = "NONE", rank = 2 : i32} : (tensor, tensor, tensor, tensor, tensor) -> tensor + return %0 : tensor +} + +// ----- + +func @testSvdfUnsupportedType(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor) -> tensor { + // expected-error @+1 {{'tfl.svdf' op operand #0 must be tensor of 32-bit float or 8-bit integer values}} + %0 = "tfl.svdf"(%arg0, %arg1, %arg2, %arg3, %arg4) {fused_activation_function = "NONE", rank = 2 : i32} : (tensor, tensor, tensor, tensor, tensor) -> tensor + return %0 : tensor +} +// ----- + +// CHECK-LABEL: testDepthToSpace +func @testDepthToSpaceF32(%arg0: tensor<1x1x1x4xf32>) -> tensor<1x2x2x1xf32> { + // CHECK: %[[ARG:.*]]: tensor<1x1x1x4xf32> + // CHECK: "tfl.depth_to_space"(%[[ARG]]) {block_size = 2 : i32} : (tensor<1x1x1x4xf32>) -> tensor<1x2x2x1xf32> + %0 = "tfl.depth_to_space"(%arg0) {block_size = 2: i32} : (tensor<1x1x1x4xf32>) -> tensor<1x2x2x1xf32> + return %0 : tensor<1x2x2x1xf32> +} + +// ----- + +func @testDepthToSpaceInvalidOutputType(%arg0: tensor<1x1x1x4xf32>) -> tensor<1x2x2x1xi32> { + // expected-error @+1 {{'tfl.depth_to_space' op failed to verify that input and output must have same element type}} + %0 = "tfl.depth_to_space"(%arg0) {block_size = 2: i32} : (tensor<1x1x1x4xf32>) -> tensor<1x2x2x1xi32> + return %0 : 
tensor<1x2x2x1xi32> +} + +// ----- + +func @testSlice(%arg0: tensor<2x3x5xf32>, %arg1: tensor<3xi32>, %arg2: tensor<3xi32>) -> tensor { + %0 = "tfl.slice"(%arg0, %arg1, %arg2) : (tensor<2x3x5xf32>, tensor<3xi32>, tensor<3xi32>) -> tensor + return %0 : tensor +} + +// ----- + +func @testSliceBadBeginDimension(%arg0: tensor<2x3x5xf32>, %arg1: tensor<2xi32>, %arg2: tensor<3xi32>) -> tensor { + // expected-error @+1 {{begin tensor elements size is not equal to input tensor rank}} + %0 = "tfl.slice"(%arg0, %arg1, %arg2) : (tensor<2x3x5xf32>, tensor<2xi32>, tensor<3xi32>) -> tensor + return %0 : tensor +} + +// ----- + +func @testSliceBadSizeDimension(%arg0: tensor<2x3x5xf32>, %arg1: tensor<3xi32>, %arg2: tensor<2xi32>) -> tensor { + // expected-error @+1 {{size tensor elements size is not equal to input tensor rank}} + %0 = "tfl.slice"(%arg0, %arg1, %arg2) : (tensor<2x3x5xf32>, tensor<3xi32>, tensor<2xi32>) -> tensor + return %0 : tensor +} + +// ----- + +func @testSliceBadBegin(%arg0: tensor<2x3x5xf32>, %arg1: tensor<3xi32>) -> tensor { + %cst = constant dense<[2, -1, 5]> : tensor<3xi32> + // expected-error @+1 {{begin[1] cannot be negative}} + %0 = "tfl.slice"(%arg0, %cst, %arg1) : (tensor<2x3x5xf32>, tensor<3xi32>, tensor<3xi32>) -> tensor + return %0 : tensor +} + +// ----- + +func @testSliceNegativeSize(%arg0: tensor<2x3x5xf32>, %arg1: tensor<3xi32>) -> tensor { + %cst = constant dense<[-2, -1, 5]> : tensor<3xi32> + // expected-error @+1 {{size[0] cannot be negative other than -1}} + %0 = "tfl.slice"(%arg0, %arg1, %cst) : (tensor<2x3x5xf32>, tensor<3xi32>, tensor<3xi32>) -> tensor + return %0 : tensor +} + +// ----- + +func @testSliceSizeOutOfRange(%arg0: tensor<2x3x5xf32>, %arg1: tensor<3xi32>) -> tensor { + %cst = constant dense<[2, 1, 5]> : tensor<3xi32> + %cst_1 = constant dense<[0, 1, 1]> : tensor<3xi32> + // expected-error @+1 {{begin[2] + size[2] cannot exceed dimension length: 5}} + %0 = "tfl.slice"(%arg0, %cst_1, %cst) : (tensor<2x3x5xf32>, tensor<3xi32>, tensor<3xi32>) -> tensor + return %0 : tensor +} + +// ----- + +func @testSliceBeginOutOfRange(%arg0: tensor<2x3x5xf32>, %arg1: tensor<3xi32>) -> tensor { + %cst = constant dense<[1, 1, 1]> : tensor<3xi32> + %cst_1 = constant dense<[2, 1, 3]> : tensor<3xi32> + // expected-error @+1 {{begin[0] cannot exceed dimension length: 2}} + %0 = "tfl.slice"(%arg0, %cst_1, %cst) : (tensor<2x3x5xf32>, tensor<3xi32>, tensor<3xi32>) -> tensor + return %0 : tensor +} + +// ----- + +func @testSplitOpWithBadNumSplits(%arg0 : tensor<16xf32>) -> () { + %split_dim = constant dense<0> : tensor + // expected-error @+1 {{'tfl.split' op attribute 'num_splits' failed to satisfy constraint: positive 32-bit integer attribute}} + "tfl.split"(%split_dim, %arg0) {num_splits = 0 : i32} : (tensor, tensor<16xf32>) -> () + return +} + +// ----- + +func @testSplitOpWithMismatchedNumResults(%arg0 : tensor<16xf32>) -> (tensor<8xf32>, tensor<8xf32>) { + %split_dim = constant dense<0> : tensor + // expected-error @+1 {{'tfl.split' op output count should match 'num_splits' attribute}} + %0, %1 = "tfl.split"(%split_dim, %arg0) {num_splits = 4 : i32} : (tensor, tensor<16xf32>) -> (tensor<8xf32>, tensor<8xf32>) + return %0, %1 : tensor<8xf32>, tensor<8xf32> +} + +// ----- + +func @testSplitOpWithBadSplitDimTensorType(%arg0: tensor<16x4x4xf32>) -> tensor<16x4x4xf32> { + %split_dim = constant dense<0> : tensor<2x2xi32> + // expected-error @+1 {{'tfl.split' op operand #0 must be tensor}} + %0 = "tfl.split"(%split_dim, %arg0) {num_splits = 1 : i32} : (tensor<2x2xi32>, 
tensor<16x4x4xf32>) -> tensor<16x4x4xf32> + return %0 : tensor<16x4x4xf32> +} + +// ----- + +func @testSplitOpWithBadSplitDimUnrankedTensorType(%arg0: tensor<16x4x4xf32>, %split_dim : tensor) -> tensor<16x4x4xf32> { + // expected-error @+1 {{'tfl.split' op operand #0 must be tensor}} + %0 = "tfl.split"(%split_dim, %arg0) {num_splits = 1 : i32} : (tensor, tensor<16x4x4xf32>) -> tensor<16x4x4xf32> + return %0 : tensor<16x4x4xf32> +} + +// ----- + +func @testSplitOpWithOutOfRangeSplitDim(%arg0 : tensor<16xf32>) -> (tensor<8xf32>, tensor<8xf32>) { + %split_dim = constant dense<1> : tensor + // expected-error @+1 {{'tfl.split' op 'split_dim' should be in [-rank, rank)}} + %0, %1 = "tfl.split"(%split_dim, %arg0) {num_splits = 2 : i32} : (tensor, tensor<16xf32>) -> (tensor<8xf32>, tensor<8xf32>) + return %0, %1 : tensor<8xf32>, tensor<8xf32> +} + +// ----- + +func @testSplitOpWithOutOfRangeSplitDimTFLConst(%arg0 : tensor<16xf32>) -> (tensor<8xf32>, tensor<8xf32>) { + %split_dim = "tfl.pseudo_const"() {value = dense<1> : tensor} : () -> tensor + // expected-error @+1 {{'tfl.split' op 'split_dim' should be in [-rank, rank)}} + %0, %1 = "tfl.split"(%split_dim, %arg0) {num_splits = 2 : i32} : (tensor, tensor<16xf32>) -> (tensor<8xf32>, tensor<8xf32>) + return %0, %1 : tensor<8xf32>, tensor<8xf32> +} + +// ----- + +func @testSplitOpWithOutOfRangeSplitDimNegative(%arg0 : tensor<16xf32>) -> (tensor<8xf32>, tensor<8xf32>) { + %split_dim = constant dense<-2> : tensor + // expected-error @+1 {{'tfl.split' op 'split_dim' should be in [-rank, rank)}} + %0, %1 = "tfl.split"(%split_dim, %arg0) {num_splits = 2 : i32} : (tensor, tensor<16xf32>) -> (tensor<8xf32>, tensor<8xf32>) + return %0, %1 : tensor<8xf32>, tensor<8xf32> +} + +// ----- + +func @testSplitOpWithUnevenDivision(%arg0 : tensor<16xf32>) -> (tensor<6xf32>, tensor<5xf32>, tensor<5xf32>) { + %split_dim = constant dense<0> : tensor + // expected-error @+1 {{'tfl.split' op 'num_splits' should evenly divide 'split_dim' axis}} + %0, %1, %2 = "tfl.split"(%split_dim, %arg0) {num_splits = 3 : i32} : (tensor, tensor<16xf32>) -> (tensor<6xf32>, tensor<5xf32>, tensor<5xf32>) + return %0, %1, %2 : tensor<6xf32>, tensor<5xf32>, tensor<5xf32> +} + +// ----- + +func @testSplitOpWithMismatchTensorTypeSplitDimOut0(%arg0 : tensor<16xf32>) -> (tensor<4xf32>, tensor<4xf32>) { + %split_dim = constant dense<0> : tensor + // expected-error @+1 {{'tfl.split' op output #0 should be 'tensor<8xf32>'}} + %0, %1 = "tfl.split"(%split_dim, %arg0) {num_splits = 2 : i32} : (tensor, tensor<16xf32>) -> (tensor<4xf32>, tensor<4xf32>) + return %0, %1 : tensor<4xf32>, tensor<4xf32> +} + +// ----- + +func @testSplitOpWithMismatchTensorTypeSplitDimOut1(%arg0 : tensor<16xf32>) -> (tensor<8xf32>, tensor<4xf32>) { + %split_dim = constant dense<0> : tensor + // expected-error @+1 {{'tfl.split' op output #1 should be 'tensor<8xf32>'}} + %0, %1 = "tfl.split"(%split_dim, %arg0) {num_splits = 2 : i32} : (tensor, tensor<16xf32>) -> (tensor<8xf32>, tensor<4xf32>) + return %0, %1 : tensor<8xf32>, tensor<4xf32> +} + +// ----- + +func @testSplitOpWithMismatchTensorTypeNonSplitDim(%arg0 : tensor<16x4xf32>) -> (tensor<8x2xf32>, tensor<8x2xf32>) { + %split_dim = constant dense<0> : tensor + // expected-error @+1 {{'tfl.split' op output #0 should be 'tensor<8x4xf32>'}} + %0, %1 = "tfl.split"(%split_dim, %arg0) {num_splits = 2 : i32} : (tensor, tensor<16x4xf32>) -> (tensor<8x2xf32>, tensor<8x2xf32>) + return %0, %1 : tensor<8x2xf32>, tensor<8x2xf32> +} + +// ----- + +func 
@testSplitOpWithValidTensorType(%arg0 : tensor<16x4xf32>) -> (tensor<8x4xf32>, tensor<8x4xf32>, tensor<16x2xf32>, tensor<16x2xf32>) { + %split_dim_0 = constant dense<0> : tensor + %0, %1 = "tfl.split"(%split_dim_0, %arg0) {num_splits = 2 : i32} : (tensor, tensor<16x4xf32>) -> (tensor<8x4xf32>, tensor<8x4xf32>) + %split_dim_1 = constant dense<1> : tensor + %2, %3 = "tfl.split"(%split_dim_1, %arg0) {num_splits = 2 : i32} : (tensor, tensor<16x4xf32>) -> (tensor<16x2xf32>, tensor<16x2xf32>) + return %0, %1, %2, %3 : tensor<8x4xf32>, tensor<8x4xf32>, tensor<16x2xf32>, tensor<16x2xf32> +} + +// ----- + +func @testSplitOpWithValidTensorTypeDynamic(%arg0 : tensor<16x?xf32>) -> (tensor<8x?xf32>, tensor<8x?xf32>) { + %split_dim = constant dense<0> : tensor + %0, %1 = "tfl.split"(%split_dim, %arg0) {num_splits = 2 : i32} : (tensor, tensor<16x?xf32>) -> (tensor<8x?xf32>, tensor<8x?xf32>) + return %0, %1 : tensor<8x?xf32>, tensor<8x?xf32> +} diff --git a/tensorflow/compiler/mlir/lite/tests/optimize.mlir b/tensorflow/compiler/mlir/lite/tests/optimize.mlir index e7ebace3a54..15c4898341f 100644 --- a/tensorflow/compiler/mlir/lite/tests/optimize.mlir +++ b/tensorflow/compiler/mlir/lite/tests/optimize.mlir @@ -96,6 +96,54 @@ func @intermOpUsedTwice(%arg0: tensor<256x32x32x3xf32>, %arg1: tensor<16x3x3x3xf } +// CHECK-LABEL: @fuseMulIntoFullyConnected +func @fuseMulIntoFullyConnected(%arg0: tensor<4x2xf32>) -> tensor<4x2xf32> { + %cst0 = constant dense<[[1.0, 2.0], [3.0, 4.0]]> : tensor<2x2xf32> + %cst1 = constant dense<2.0> : tensor<2xf32> + %cst2 = constant dense<[1.0, 2.0]> : tensor<2xf32> + + %0 = "tfl.fully_connected"(%arg0, %cst0, %cst1) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<4x2xf32>, tensor<2x2xf32>, tensor<2xf32>) -> tensor<4x2xf32> + %1 = "tfl.mul"(%0, %cst2) {fused_activation_function = "RELU6"} : (tensor<4x2xf32>, tensor<2xf32>) -> tensor<4x2xf32> + + return %1 : tensor<4x2xf32> + +// CHECK: %cst = constant dense<{{\[\[}}1.000000e+00, 4.000000e+00], [3.000000e+00, 8.000000e+00]]> : tensor<2x2xf32> +// CHECK: %cst_0 = constant dense<[2.000000e+00, 4.000000e+00]> : tensor<2xf32> +// CHECK: %0 = "tfl.fully_connected"(%arg0, %cst, %cst_0) {fused_activation_function = "RELU6", keep_num_dims = false, weights_format = "DEFAULT"} +// CHECK: return %0 : tensor<4x2xf32> +} + +// CHECK-LABEL: @fuseMulIntoFullyConnectedBroadcast +func @fuseMulIntoFullyConnectedBroadcast(%arg0: tensor<1x3xf32>) -> tensor<1x2xf32> { + %cst0 = constant dense<[[1.0, 2.0, 3.0], [1.0, 2.0, 3.0]]> : tensor<2x3xf32> + %cst1 = constant dense<2.0> : tensor<2xf32> + %cst2 = constant dense<[1.0, 2.0]> : tensor<2xf32> + %0 = "tfl.fully_connected"(%arg0, %cst0, %cst1) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<1x3xf32>, tensor<2x3xf32>, tensor<2xf32>) -> tensor<1x2xf32> + // %cst2 isn't broadcast-compatible to %cst0, but tf.Mul is able to fold them. 
+ %1 = "tfl.mul"(%0, %cst2) {fused_activation_function = "RELU6"} : (tensor<1x2xf32>, tensor<2xf32>) -> tensor<1x2xf32> + return %1 : tensor<1x2xf32> + +// CHECK: %cst = constant dense<{{\[\[}}1.000000e+00, 2.000000e+00, 3.000000e+00], [2.000000e+00, 4.000000e+00, 6.000000e+00]]> : tensor<2x3xf32> +// CHECK: %cst_0 = constant dense<[2.000000e+00, 4.000000e+00]> : tensor<2xf32> +// CHECK: %0 = "tfl.fully_connected"(%arg0, %cst, %cst_0) {fused_activation_function = "RELU6", keep_num_dims = false, weights_format = "DEFAULT"} +// CHECK: return %0 : tensor<1x2xf32> +} + +// CHECK-LABEL: @fuseMulIntoFullyConnectedNoBias +func @fuseMulIntoFullyConnectedNoBias(%arg0: tensor<4x2xf32>, %arg1: none) -> tensor<4x2xf32> { + %cst0 = constant dense<[[1.0, 2.0], [3.0, 4.0]]> : tensor<2x2xf32> + %cst2 = constant dense<[1.0, 2.0]> : tensor<2xf32> + + %0 = "tfl.fully_connected"(%arg0, %cst0, %arg1) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<4x2xf32>, tensor<2x2xf32>, none) -> tensor<4x2xf32> + %1 = "tfl.mul"(%0, %cst2) {fused_activation_function = "RELU6"} : (tensor<4x2xf32>, tensor<2xf32>) -> tensor<4x2xf32> + + return %1 : tensor<4x2xf32> + +// CHECK: %cst = constant dense<{{\[\[}}1.000000e+00, 4.000000e+00], [3.000000e+00, 8.000000e+00]]> : tensor<2x2xf32> +// CHECK: %0 = "tfl.fully_connected"(%arg0, %cst, %arg1) {fused_activation_function = "RELU6", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<4x2xf32>, tensor<2x2xf32>, none) -> tensor<4x2xf32> +// CHECK: return %0 : tensor<4x2xf32> +} + // CHECK-LABEL: @fuseMulIntoDepthwiseConv2d func @fuseMulIntoDepthwiseConv2d(%arg0: tensor<1x112x112x2xf32>) -> tensor<1x112x112x2xf32> { %cst0 = constant dense<[[[[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], [[7.0, 8.0], [9.0, 10.0], [11.0, 12.0]], [[13.0, 14.0], [15.0, 16.0], [17.0, 18.0]]]]> : tensor<1x3x3x2xf32> @@ -130,11 +178,11 @@ func @notFuseMulIntoDepthwiseConv2d(%arg0: tensor<1x112x112x2xf32>) -> tensor<1x // CHECK: return %1 } -// CHECK-LABEL: @FuseFullyConnectedAdd -func @FuseFullyConnectedAdd(%arg0: tensor<40x37xf32>, %arg1: tensor<40x37xf32>) -> tensor<40x40xf32> { +// CHECK-LABEL: @FuseFullyConnectedAddUnit +func @FuseFullyConnectedAddUnit(%arg0: tensor<40x37xf32>, %arg1: tensor<40x37xf32>) -> tensor<40x40xf32> { %cst = constant unit - %0 = "tfl.pseudo_input"(%arg0) : (tensor<40x37xf32>) -> tensor<40x37xf32> loc("Input") - %1 = "tfl.pseudo_input"(%arg1) : (tensor<40x37xf32>) -> tensor<40x37xf32> loc("Input") + %0 = "tfl.pseudo_input"(%arg0) : (tensor<40x37xf32>) -> tensor<40x37xf32> + %1 = "tfl.pseudo_input"(%arg1) : (tensor<40x37xf32>) -> tensor<40x37xf32> %cst2 = constant dense<2.0> : tensor<40x40xf32> %2 = "tfl.fully_connected" (%0, %1, %cst) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<40x37xf32>, tensor<40x37xf32>, none) -> (tensor<40x40xf32>) @@ -146,6 +194,37 @@ func @FuseFullyConnectedAdd(%arg0: tensor<40x37xf32>, %arg1: tensor<40x37xf32>) // CHECK: %0 = "tfl.pseudo_input"(%arg0) : (tensor<40x37xf32>) -> tensor<40x37xf32> // CHECK: %1 = "tfl.pseudo_input"(%arg1) : (tensor<40x37xf32>) -> tensor<40x37xf32> // CHECK: %2 = "tfl.fully_connected"(%0, %1, %cst) + // CHECK: return %2 +} + +// CHECK-LABEL: @FuseFullyConnectedAddConst +func @FuseFullyConnectedAddConst(%arg0: tensor<40x37xf32>, %arg1: tensor<40x37xf32>) -> tensor<40x40xf32> { + %cst = constant dense<3.0> : tensor<40x40xf32> + %0 = "tfl.pseudo_input"(%arg0) : (tensor<40x37xf32>) -> tensor<40x37xf32> loc("Input") + %1 = 
"tfl.pseudo_input"(%arg1) : (tensor<40x37xf32>) -> tensor<40x37xf32> loc("Input") + %cst2 = constant dense<2.0> : tensor<40x40xf32> + + %2 = "tfl.fully_connected" (%0, %1, %cst) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<40x37xf32>, tensor<40x37xf32>, tensor<40x40xf32>) -> (tensor<40x40xf32>) + %3 = "tfl.add"(%2, %cst2) {fused_activation_function = "NONE"} : (tensor<40x40xf32>, tensor<40x40xf32>) -> tensor<40x40xf32> + + return %3 : tensor<40x40xf32> + + // CHECK: %[[cst:.*]] = constant dense<5.000000e+00> : tensor<40x40xf32> + // CHECK: %[[cst_0:.*]] = "tfl.pseudo_input"(%arg0) : (tensor<40x37xf32>) -> tensor<40x37xf32> + // CHECK: %[[cst_1:.*1]] = "tfl.pseudo_input"(%arg1) : (tensor<40x37xf32>) -> tensor<40x37xf32> + // CHECK: %[[fc:.*]] = "tfl.fully_connected"(%[[cst_0]], %[[cst_1]], %[[cst]]) + // CHECK: return %[[fc]] +} + +// CHECK-LABEL: @FuseFullyConnectedRelu +func @FuseFullyConnectedRelu(%arg0: tensor<1x256xf32>, %arg1: tensor<128x256xf32>, %arg2: tensor<128xf32>) -> tensor<1x128xf32> { + %0 = "tfl.fully_connected" (%arg0, %arg1, %arg2) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<1x256xf32>, tensor<128x256xf32>, tensor<128xf32>) -> tensor<1x128xf32> + %1 = "tfl.relu"(%0) : (tensor<1x128xf32>) -> tensor<1x128xf32> + return %1 : tensor<1x128xf32> + + // CHECK: %[[RES:[0-9].*]] = "tfl.fully_connected" + // CHECK-SAME: fused_activation_function = "RELU" + // CHECK: return %[[RES]] } // CHECK-LABEL: @NoPadStridedSliceNonNewAxisMask @@ -176,3 +255,53 @@ func @PadStridedSliceNewAxisMask(%arg0: tensor<2x3xf32>) -> tensor<1x2x3x1xf32> // CHECK: %1 = "tfl.reshape"(%0) : (tensor<2x3xf32>) -> tensor<1x2x3x1xf32> // CHECK: %2 = "tfl.strided_slice"(%1, %cst, %cst, %cst_0) {begin_mask = 15 : i32, ellipsis_mask = 0 : i32, end_mask = 15 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32} : (tensor<1x2x3x1xf32>, tensor<4xi32>, tensor<4xi32>, tensor<4xi32>) -> tensor<1x2x3x1xf32> } + +// CHECK-LABEL: @L2NormalizePattern +func @L2NormalizePattern(%arg0: tensor<2xf32>) -> tensor<2xf32> { + %cst = constant dense<[0]> : tensor<1xi32> + %0 = "tfl.square"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> + %1 = "tfl.sum"(%0, %cst) {keep_dims = false} : (tensor<2xf32>, tensor<1xi32>) -> tensor + %2 = "tfl.rsqrt"(%1) : (tensor) -> tensor + %3 = "tfl.mul"(%arg0, %2) {fused_activation_function = "NONE"} : (tensor<2xf32>, tensor) -> tensor<2xf32> + return %3: tensor<2xf32> + // CHECK: %[[RES:[0-9].*]] = "tfl.l2_normalization"([[INPUT:%.*]]) {fused_activation_function = "NONE"} : (tensor<2xf32>) -> tensor<2xf32> + // CHECK: return %[[RES]] +} + +// CHECK-LABEL: @L2NormalizePattern1 +func @L2NormalizePattern1(%arg0: tensor<2xf32>) -> tensor<2xf32> { + %cst = constant dense<[0]> : tensor<1xi32> + %0 = "tfl.square"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> + %1 = "tfl.sum"(%0, %cst) {keep_dims = false} : (tensor<2xf32>, tensor<1xi32>) -> tensor + %2 = "tfl.sqrt"(%1) : (tensor) -> tensor + %3 = "tfl.div"(%arg0, %2) {fused_activation_function = "NONE"} : (tensor<2xf32>, tensor) -> tensor<2xf32> + return %3: tensor<2xf32> + // CHECK: %[[RES:[0-9].*]] = "tfl.l2_normalization"([[INPUT:%.*]]) {fused_activation_function = "NONE"} : (tensor<2xf32>) -> tensor<2xf32> + // CHECK: return %[[RES]] +} + +// CHECK-LABEL: @InvalidL2NormalizePattern +// Div and square ops must take the same argument to be eligible. 
+func @InvalidL2NormalizePattern(%arg0: tensor<2xf32>, %arg1: tensor<2xf32>) -> tensor<2xf32> { + %cst = constant dense<[0]> : tensor<1xi32> + %0 = "tfl.square"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> + %1 = "tfl.sum"(%0, %cst) {keep_dims = false} : (tensor<2xf32>, tensor<1xi32>) -> tensor + %2 = "tfl.sqrt"(%1) : (tensor) -> tensor + %3 = "tfl.div"(%arg1, %2) {fused_activation_function = "NONE"} : (tensor<2xf32>, tensor) -> tensor<2xf32> + return %3: tensor<2xf32> + // CHECK: %3 = "tfl.div"([[INPUT:%.*]], %2) {fused_activation_function = "NONE"} : (tensor<2xf32>, tensor) -> tensor<2xf32> + // CHECK: return %3 +} + +// CHECK-LABEL: @InvalidL2NormalizePatternMorethan1Dimension +// Input has higher rank, it should be limited to 1D only. +func @InvalidL2NormalizePatternMorethan1Dimension(%arg0: tensor<2x2xf32>) -> tensor<2x2xf32> { + %cst = constant dense<[0]> : tensor<1xi32> + %0 = "tfl.square"(%arg0) : (tensor<2x2xf32>) -> tensor<2x2xf32> + %1 = "tfl.sum"(%0, %cst) {keep_dims = false} : (tensor<2x2xf32>, tensor<1xi32>) -> tensor + %2 = "tfl.sqrt"(%1) : (tensor) -> tensor + %3 = "tfl.div"(%arg0, %2) {fused_activation_function = "NONE"} : (tensor<2x2xf32>, tensor) -> tensor<2x2xf32> + return %3: tensor<2x2xf32> + // CHECK: %3 = "tfl.div"([[INPUT:%.*]], %2) {fused_activation_function = "NONE"} : (tensor<2x2xf32>, tensor) -> tensor<2x2xf32> + // CHECK: return %3 +} diff --git a/tensorflow/compiler/mlir/lite/tests/prepare-composite-functions-tf.mlir b/tensorflow/compiler/mlir/lite/tests/prepare-composite-functions-tf.mlir new file mode 100644 index 00000000000..cabbc4d9da5 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/tests/prepare-composite-functions-tf.mlir @@ -0,0 +1,12 @@ +// RUN: tf-opt -tfl-prepare-composite-funcs-tf %s | FileCheck %s --dump-input-on-failure + +func @foo(%arg0: tensor, %arg1: tensor) -> tensor attributes {tf._implements = "embedding_matmul", tf._reference = "mlir"} { + %0 = "tf.Fill" (%arg1, %arg0) : (tensor, tensor) -> tensor + %1 = "tf.MatMul" (%0, %arg0) : (tensor, tensor) -> tensor + return %1 : tensor +} + +// CHECK: func @foo([[VAL_0:%.*]]: tensor, [[VAL_1:%.*]]: tensor) -> tensor +// CHECK: attributes {tf._implements = "fused_tfl_embedding_lookup", tf._reference = "mlir"} +// CHECK: [[VAL_2:%.*]] = "tfl.embedding_lookup"([[VAL_1]], [[VAL_0]]) : (tensor, tensor) -> tensor +// CHECK: return [[VAL_2]] : tensor \ No newline at end of file diff --git a/tensorflow/compiler/mlir/lite/tests/prepare-quantize.mlir b/tensorflow/compiler/mlir/lite/tests/prepare-quantize.mlir index a3e7c01ca91..bf695e130d0 100644 --- a/tensorflow/compiler/mlir/lite/tests/prepare-quantize.mlir +++ b/tensorflow/compiler/mlir/lite/tests/prepare-quantize.mlir @@ -35,6 +35,27 @@ func @QuantizeConv2D(tensor<1x224x224x3x!quant.uniform // CHECK: return %6 } +// CHECK-LABEL: QuantizeFullyConnected +func @QuantizeFullyConnected(tensor<1x224x224x3x!quant.uniform>) -> tensor<1x112x112x32x!quant.uniform> { +^bb0(%arg0: tensor<1x224x224x3x!quant.uniform>): + %cst = constant dense<-1.23697901> : tensor<32xf32> + %2 = "tfl.dequantize"(%arg0) : (tensor<1x224x224x3x!quant.uniform>) -> tensor<1x224x224x3xf32> + %3 = "tfl.pseudo_qconst"() {qtype = tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216:151>>, value = dense<-76> : tensor<32x3x3x3xi8>} : () -> tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216:151>> + %4 = "tfl.dequantize"(%3) : (tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216:151>>) -> tensor<32x3x3x3xf32> + %5 = "tfl.fully_connected"(%2, %4, %cst) {fused_activation_function = 
"NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<1x224x224x3xf32>, tensor<32x3x3x3xf32>, tensor<32xf32>) -> tensor<1x112x112x32xf32> + %6 = "tfl.quantize"(%5) {qtype = tensor<1x112x112x32x!quant.uniform>} : (tensor<1x112x112x32xf32>) -> tensor<1x112x112x32x!quant.uniform> + return %6 : tensor<1x112x112x32x!quant.uniform> + +// CHECK: %cst = constant dense<-1.23697901> : tensor<32xf32> +// CHECK: %0 = "tfl.quantize"(%cst) {qtype = tensor<32x!quant.uniform>} +// CHECK: %1 = "tfl.dequantize"(%0) : (tensor<32x!quant.uniform>) +// CHECK: %2 = "tfl.dequantize"(%arg0) +// CHECK: %3 = "tfl.pseudo_qconst"() +// CHECK: %4 = "tfl.dequantize"(%3) +// CHECK: %5 = "tfl.fully_connected"(%2, %4, %1) +// CHECK: %6 = "tfl.quantize"(%5) +// CHECK: return %6 +} // CHECK-LABEL: QuantizeDepthwiseConv2D func @QuantizeDepthwiseConv2D(tensor<1x224x224x3x!quant.uniform>) -> tensor<1x112x112x32x!quant.uniform> { @@ -74,6 +95,80 @@ func @QuantizeAveragePool2D(tensor<1x6x6x16x!quant.uniform } +// CHECK-LABEL: QuantizeMaximum +func @QuantizeMaximum(tensor<1x6x6x16x!quant.uniform>, tensor<1x6x6x16x!quant.uniform>) -> tensor<1x6x6x16xf32> { +^bb0(%arg0: tensor<1x6x6x16x!quant.uniform>, %arg1: tensor<1x6x6x16x!quant.uniform>): + %0 = "tfl.dequantize"(%arg0) : (tensor<1x6x6x16x!quant.uniform>) -> tensor<1x6x6x16xf32> + %1 = "tfl.dequantize"(%arg1) : (tensor<1x6x6x16x!quant.uniform>) -> tensor<1x6x6x16xf32> + %2 = "tfl.maximum"(%0, %1) : (tensor<1x6x6x16xf32>, tensor<1x6x6x16xf32>) -> tensor<1x6x6x16xf32> + return %2 : tensor<1x6x6x16xf32> + +// CHECK: %0 = "tfl.dequantize"(%arg0) +// CHECK: %1 = "tfl.dequantize"(%arg1) +// CHECK: %2 = "tfl.maximum"(%0, %1) +// CHECK: %3 = "tfl.quantize"(%2) +// CHECK: %4 = "tfl.dequantize"(%3) +// CHECK: return %4 : tensor<1x6x6x16xf32> +} + +// CHECK-LABEL: QuantizeMinimum +func @QuantizeMinimum(tensor<1x6x6x16x!quant.uniform>, tensor<1x6x6x16x!quant.uniform>) -> tensor<1x6x6x16xf32> { +^bb0(%arg0: tensor<1x6x6x16x!quant.uniform>, %arg1: tensor<1x6x6x16x!quant.uniform>): + %0 = "tfl.dequantize"(%arg0) : (tensor<1x6x6x16x!quant.uniform>) -> tensor<1x6x6x16xf32> + %1 = "tfl.dequantize"(%arg1) : (tensor<1x6x6x16x!quant.uniform>) -> tensor<1x6x6x16xf32> + %2 = "tfl.minimum"(%0, %1) : (tensor<1x6x6x16xf32>, tensor<1x6x6x16xf32>) -> tensor<1x6x6x16xf32> + return %2 : tensor<1x6x6x16xf32> + +// CHECK: %0 = "tfl.dequantize"(%arg0) +// CHECK: %1 = "tfl.dequantize"(%arg1) +// CHECK: %2 = "tfl.minimum"(%0, %1) +// CHECK: %3 = "tfl.quantize"(%2) +// CHECK: %4 = "tfl.dequantize"(%3) +// CHECK: return %4 : tensor<1x6x6x16xf32> +} + +// CHECK-LABEL: QuantizeSlice +func @QuantizeSlice(tensor<2x3x5x!quant.uniform>, tensor<3xi32>, tensor<3xi32>) -> tensor { +^bb0(%arg0: tensor<2x3x5x!quant.uniform>, %arg1: tensor<3xi32>, %arg2: tensor<3xi32>): + %0 = "tfl.dequantize"(%arg0) : (tensor<2x3x5x!quant.uniform>) -> tensor<2x3x5xf32> + %1 = "tfl.slice"(%0, %arg1, %arg2) : (tensor<2x3x5xf32>, tensor<3xi32>, tensor<3xi32>) -> tensor + return %1 : tensor + +// CHECK: %0 = "tfl.dequantize"(%arg0) +// CHECK: %1 = "tfl.slice"(%0, %arg1, %arg2) +// CHECK: %2 = "tfl.quantize"(%1) +// CHECK: %3 = "tfl.dequantize"(%2) +// CHECK: return %3 : tensor +} + +// CHECK-LABEL: QuantizeStridedSlice +func @QuantizeStridedSlice(tensor<12x2x2x5x!quant.uniform>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x2x2x5xf32> { +^bb0(%arg0: tensor<12x2x2x5x!quant.uniform>, %arg1: tensor<1xi32>, %arg2: tensor<1xi32>, %arg3: tensor<1xi32>): + %0 = "tfl.dequantize"(%arg0) : (tensor<12x2x2x5x!quant.uniform>) -> 
tensor<12x2x2x5xf32> + %1 = "tfl.strided_slice"(%0, %arg1, %arg2, %arg3) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32} : (tensor<12x2x2x5xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x2x2x5xf32> + return %1 : tensor<1x2x2x5xf32> + +// CHECK: %0 = "tfl.dequantize"(%arg0) +// CHECK: %1 = "tfl.strided_slice"(%0, %arg1, %arg2, %arg3) +// CHECK: %2 = "tfl.quantize"(%1) {qtype = tensor<1x2x2x5x!quant.uniform>} +// CHECK: %3 = "tfl.dequantize"(%2) +// CHECK: return %3 : tensor<1x2x2x5xf32> +} + +// CHECK-LABEL: QuantizePad +func @QuantizePad(tensor<2x1x3x!quant.uniform>, tensor<3x2xi32>) -> tensor { +^bb0(%arg0: tensor<2x1x3x!quant.uniform>, %arg1: tensor<3x2xi32>): + %0 = "tfl.dequantize"(%arg0) : (tensor<2x1x3x!quant.uniform>) -> tensor<2x1x3xf32> + %1 = "tfl.pad"(%0, %arg1) : (tensor<2x1x3xf32>, tensor<3x2xi32>) -> tensor + return %1 : tensor + +// CHECK: %0 = "tfl.dequantize"(%arg0) +// CHECK: %1 = "tfl.pad"(%0, %arg1) +// CHECK: %2 = "tfl.quantize"(%1) +// CHECK: %3 = "tfl.dequantize"(%2) +// CHECK: return %3 : tensor +} + // CHECK-LABEL: QuantizeReshape2D func @QuantizeReshape2D(tensor<1x6x6x16x!quant.uniform>) -> tensor<1x36x16xf32> { ^bb0(%arg0: tensor<1x6x6x16x!quant.uniform>): @@ -102,6 +197,31 @@ func @QuantizeSoftmax(tensor<1x6x6x16x!quant.uniform>) // CHECK: return %3 : tensor<1x6x6x16xf32> } +// CHECK-LABEL: QuantizeLogistic +func @QuantizeLogistic(tensor<1x6x6x16x!quant.uniform>) -> tensor<1x6x6x16xf32> { +^bb0(%arg0: tensor<1x6x6x16x!quant.uniform>): + %0 = "tfl.dequantize"(%arg0) : (tensor<1x6x6x16x!quant.uniform>) -> tensor<1x6x6x16xf32> + %1 = "tfl.logistic"(%0) : (tensor<1x6x6x16xf32>) -> tensor<1x6x6x16xf32> + return %1 : tensor<1x6x6x16xf32> + +// CHECK: %0 = "tfl.dequantize"(%arg0) +// CHECK: %1 = "tfl.logistic"(%0) : (tensor<1x6x6x16xf32>) -> tensor<1x6x6x16xf32> +// CHECK: %2 = "tfl.quantize"(%1) {qtype = tensor<1x6x6x16x!quant.uniform>} +// CHECK: %3 = "tfl.dequantize"(%2) : (tensor<1x6x6x16x!quant.uniform>) -> tensor<1x6x6x16xf32> +// CHECK: return %3 : tensor<1x6x6x16xf32> +} + +// CHECK-LABEL: NotQuantizeConcatConstantOperand +func @NotQuantizeConcatConstantOperand(%arg0: tensor<2xf32>) -> tensor<2x2xf32> { + %0 = constant dense<1.0> : tensor<2xf32> + %1 = "tfl.concatenation"(%arg0, %0) {axis = 0 : i32, fused_activation_function = "NONE"} : (tensor<2xf32>, tensor<2xf32>) -> tensor<2x2xf32> + return %1 : tensor<2x2xf32> + +// CHECK-NEXT: %[[cst:.*]] = constant dense<1.000000e+00> : tensor<2xf32> +// CHECK-NEXT: %[[cc:.*]] = "tfl.concatenation"(%arg0, %[[cst]]) +// CHECK-NEXT: return %[[cc]] +} + // CHECK-LABEL: QuantizeConcatOperand0ToAll func @QuantizeConcatOperand0ToAll(tensor<2x!quant.uniform>, tensor<2xf32>) -> tensor<2x2xf32> { ^bb0(%arg0: tensor<2x!quant.uniform>, %arg1: tensor<2xf32>): @@ -248,7 +368,35 @@ func @QuantizeConstant() -> tensor<2x3xf32> { return %cst : tensor<2x3xf32> // CHECK: %cst = constant dense{{.*}}tensor<2x3xf32> -// CHECK: %0 = "tfl.quantize"(%cst) {qtype = tensor<2x3x!quant.uniform>} +// CHECK: %0 = "tfl.quantize"(%cst) {qtype = tensor<2x3x!quant.uniform:f32, 0.023622047244094488:128>>} // CHECK: %1 = "tfl.dequantize"(%0) // CHECK: return %1 : tensor<2x3xf32> -} \ No newline at end of file +} + +// CHECK-LABEL: QuantizeSharedBiases +func @QuantizeSharedBiases( + %arg0: tensor<1x224x224x3x!quant.uniform>, + %arg1: tensor<32x3x3x3x!quant.uniform:f32, 1.0>>, + %arg2: tensor<32x3x3x3x!quant.uniform:f32, 2.0>>) -> (tensor<1x56x56x32x!quant.uniform>) { + 
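+  // The bias constant below is shared by two convolutions whose weights have
+  // different scales (1.0 and 2.0), so the prepare-quantize pass is expected
+  // to duplicate it, letting each use pick up its own quantization parameters;
+  // the two separate constant/quantize/dequantize chains in the CHECK lines
+  // verify this.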
%cst = constant dense<1.0> : tensor<32xf32> + %1 = "tfl.dequantize"(%arg0) : (tensor<1x224x224x3x!quant.uniform>) -> tensor<1x224x224x3xf32> + %2 = "tfl.dequantize"(%arg1) : (tensor<32x3x3x3x!quant.uniform:f32, 1.0>>) -> tensor<32x3x3x3xf32> + %conv1 = "tfl.conv_2d"(%1, %2, %cst) {dilation_h_factor = 2 : i32, dilation_w_factor = 3 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 4 : i32, stride_w = 5 : i32} : (tensor<1x224x224x3xf32>, tensor<32x3x3x3xf32>, tensor<32xf32>) -> tensor<1x112x112x32xf32> + %3 = "tfl.quantize"(%conv1) {qtype = tensor<1x112x112x32xf32>} : (tensor<1x112x112x32xf32>) -> tensor<1x112x112x32x!quant.uniform> + + %4 = "tfl.dequantize"(%3) : (tensor<1x112x112x32x!quant.uniform>) -> tensor<1x112x112x32xf32> + %5 = "tfl.dequantize"(%arg2) : (tensor<32x3x3x3x!quant.uniform:f32, 2.0>>) -> tensor<32x3x3x3xf32> + %conv2 = "tfl.conv_2d"(%4, %5, %cst) {dilation_h_factor = 2 : i32, dilation_w_factor = 3 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 4 : i32, stride_w = 5 : i32} : (tensor<1x112x112x32xf32>, tensor<32x3x3x3xf32>, tensor<32xf32>) -> tensor<1x56x56x32xf32> + %6 = "tfl.quantize"(%conv2) {qtype = tensor<1x56x56x32x!quant.uniform>} : (tensor<1x56x56x32xf32>) -> tensor<1x56x56x32x!quant.uniform> + + return %6 : tensor<1x56x56x32x!quant.uniform> + +// CHECK: %[[cst:.*]] = constant dense<1.000000e+00> : tensor<32xf32> +// CHECK: %[[q:.*]] = "tfl.quantize"(%[[cst]]) +// CHECK: %[[dq:.*]] = "tfl.dequantize"(%[[q]]) +// CHECK: %[[cst_0:.*]] = constant dense<1.000000e+00> : tensor<32xf32> +// CHECK: %[[q_0:.*]] = "tfl.quantize"(%[[cst_0]]) +// CHECK: %[[dq_0:.*]] = "tfl.dequantize"(%[[q_0]]) +// CHECK: %{{.*}} = "tfl.conv_2d"(%{{.*}}, %{{.*}}, %[[dq]]) +// CHECK: %{{.*}} = "tfl.conv_2d"(%{{.*}}, %{{.*}}, %[[dq_0]]) +} diff --git a/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir b/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir index 0edb4f40cdc..ad11764851c 100644 --- a/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir +++ b/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir @@ -63,102 +63,239 @@ func @fusedBatchNorm(tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8 return %2, %2#1 : tensor<8x8x8x8xf32>, tensor<8xf32> // CHECK-LABEL: fusedBatchNorm -// CHECK:%cst = constant dense<1.000000e-03> : tensor +// CHECK: %[[CONSTANT:.*]] = constant dense<1.000000e-03> // variance + epsilon -// CHECK: %0 = "tf.Add"(%arg4, %cst) : (tensor<8xf32>, tensor) -> tensor<8xf32> +// CHECK: %[[ADD1:.*]] = "tf.Add"(%[[ARG4:.*]], %[[CONSTANT]]) // rsqrt(variance + epsilon) -// CHECK: %1 = "tf.Rsqrt"(%0) : (tensor<8xf32>) -> tensor<8xf32> +// CHECK: %[[RSQRT:.*]] = "tf.Rsqrt"(%[[ADD1]]) // scale * rsqrt(variance + epsilon) -// CHECK: %2 = "tf.Mul"(%arg1, %1) : (tensor<8xf32>, tensor<8xf32>) -> tensor<8xf32> +// CHECK: %[[MUL1:.*]] = "tf.Mul"(%[[ARG1:.*]], %[[RSQRT]]) // x * scale * rsqrt(variance + epsilon) -// CHECK: %3 = "tf.Mul"(%arg0, %2) : (tensor<8x8x8x8xf32>, tensor<8xf32>) -> tensor<8x8x8x8xf32> +// CHECK: %[[MUL2:.*]] = "tf.Mul"(%[[ARG0:.*]], %[[MUL1]]) // mean * scale * rsqrt(variance + epsilon) -// CHECK: %4 = "tf.Mul"(%arg3, %2) : (tensor<8xf32>, tensor<8xf32>) -> tensor<8xf32> +// CHECK: %[[MUL3:.*]] = "tf.Mul"(%[[ARG3:.*]], %[[MUL1]]) // offset - mean * scale * rsqrt(variance + epsilon) -// CHECK: %5 = "tf.Sub"(%arg2, %4) : (tensor<8xf32>, tensor<8xf32>) -> tensor<8xf32> +// CHECK: %[[SUB:.*]] = "tf.Sub"(%[[ARG2:.*]], %[[MUL3]]) // x * scale * rsqrt(variance + epsilon) + // offset - mean * scale * rsqrt(variance + 
epsilon) -// CHECK: %6 = "tf.Add"(%3, %5) : (tensor<8x8x8x8xf32>, tensor<8xf32>) -> tensor<8x8x8x8xf32> +// CHECK: %[[ADD2:.*]] = "tf.Add"(%[[MUL2]], %[[SUB]]) -// CHECK: %7:5 = "tf.FusedBatchNorm"(%6, %arg1, %arg2, %arg3, %arg4) -// CHECK: %8:5 = "tf.FusedBatchNorm"(%7#0, %arg1, %arg2, %arg3, %arg4) +// CHECK: %[[BATCHNORM1:.*]]:5 = "tf.FusedBatchNorm"(%[[ADD2]], %[[ARG1]], %[[ARG2]], %[[ARG3]], %[[ARG4]]) +// CHECK: {{.*}} = "tf.FusedBatchNorm"(%[[BATCHNORM1]]#0, %[[ARG1]], %[[ARG2]], %[[ARG3]], %[[ARG4]]) } -func @fakeQuantNotFollowedByQuant(tensor<8x8x8x8xf32>) -> (tensor<8x8x8x8xf32>) { -^bb0(%arg0: tensor<8x8x8x8xf32>): - %arg1 = constant dense<-0.1> : tensor - %arg2 = constant dense<0.2> : tensor - %0 = "tf.FakeQuantWithMinMaxVars"(%arg0, %arg1, %arg2) {num_bits = 3, narrow_range = false} : (tensor<8x8x8x8xf32>, tensor, tensor) -> tensor<8x8x8x8xf32> - return %0 : tensor<8x8x8x8xf32> +func @fusedBatchNormV3(tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>) { +^bb0(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor<8xf32>, %arg2: tensor<8xf32>, %arg3: tensor<8xf32>, %arg4: tensor<8xf32>): + // OK + %0:6 = "tf.FusedBatchNormV3"(%arg0, %arg1, %arg2, %arg3, %arg4) {T = "tfdtype$DT_FLOAT", U = "tfdtype$DT_FLOAT", data_format = "NHWC", epsilon = 0.001 : f32, is_training = false} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) + // Unsupported training + %1:6 = "tf.FusedBatchNormV3"( %0#0, %arg1, %arg2, %arg3, %arg4) {T = "tfdtype$DT_FLOAT", U = "tfdtype$DT_FLOAT", data_format = "NHWC", epsilon = 0.001 : f32, is_training = true} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) + // Use other output + %2:6 = "tf.FusedBatchNormV3"( %1#0, %arg1, %arg2, %arg3, %arg4) {T = "tfdtype$DT_FLOAT", U = "tfdtype$DT_FLOAT", data_format = "NHWC", epsilon = 0.001 : f32, is_training = false} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) + + return %2, %2#1 : tensor<8x8x8x8xf32>, tensor<8xf32> + +// CHECK-LABEL: fusedBatchNormV3 +// CHECK: %[[CONSTANT:.*]] = constant dense<1.000000e-03> +// variance + epsilon +// CHECK: %[[ADD1:.*]] = "tf.Add"(%[[ARG4:.*]], %[[CONSTANT]]) +// rsqrt(variance + epsilon) +// CHECK: %[[RSQRT:.*]] = "tf.Rsqrt"(%[[ADD1]]) +// scale * rsqrt(variance + epsilon) +// CHECK: %[[MUL1:.*]] = "tf.Mul"(%[[ARG1:.*]], %[[RSQRT]]) +// x * scale * rsqrt(variance + epsilon) +// CHECK: %[[MUL2:.*]] = "tf.Mul"(%[[ARG0:.*]], %[[MUL1]]) +// mean * scale * rsqrt(variance + epsilon) +// CHECK: %[[MUL3:.*]] = "tf.Mul"(%[[ARG3:.*]], %[[MUL1]]) +// offset - mean * scale * rsqrt(variance + epsilon) +// CHECK: %[[SUB:.*]] = "tf.Sub"(%[[ARG2:.*]], %[[MUL3]]) +// x * scale * rsqrt(variance + epsilon) + +// offset - mean * scale * rsqrt(variance + epsilon) +// CHECK: %[[ADD2:.*]] = "tf.Add"(%[[MUL2]], %[[SUB]]) + +// CHECK: %[[BATCHNORM1:.*]]:6 = "tf.FusedBatchNormV3"(%[[ADD2]], %[[ARG1]], %[[ARG2]], %[[ARG3]], %[[ARG4]]) +// CHECK: %[[BATCHNORM2:.*]]:6 = "tf.FusedBatchNormV3"(%[[BATCHNORM1]]#0, %[[ARG1]], %[[ARG2]], %[[ARG3]], %[[ARG4]]) +} + +// CHECK-LABEL: fakeQuantForActivation +func @fakeQuantForActivation(tensor<8xf32>) -> 
(tensor<8xf32>) { +^bb0(%arg0: tensor<8xf32>): + %arg1 = constant dense<0.0> : tensor + %arg2 = constant dense<255.0> : tensor + %0 = "tf.FakeQuantWithMinMaxVars"(%arg0, %arg1, %arg2) {num_bits = 3, narrow_range = false} : (tensor<8xf32>, tensor, tensor) -> tensor<8xf32> + return %0 : tensor<8xf32> + +// CHECK: %0 = "tf.FakeQuantWithMinMaxVars"(%arg0, %cst, %cst_0) +// CHECK: %1 = "tfl.quantize"(%0) {qtype = tensor<8x!quant.uniform>} +// CHECK: %2 = "tfl.dequantize"(%1) +// CHECK: return %2 +} + +// CHECK-LABEL: fakeQuantForActivationNoDuplication +func @fakeQuantForActivationNoDuplication(tensor<8xf32>) -> (tensor<8x!quant.uniform>) { +^bb0(%arg0: tensor<8xf32>): + %arg1 = constant dense<0.0> : tensor + %arg2 = constant dense<255.0> : tensor + %0 = "tf.FakeQuantWithMinMaxVars"(%arg0, %arg1, %arg2) {num_bits = 3, narrow_range = false} : (tensor<8xf32>, tensor, tensor) -> tensor<8xf32> + %1 = "tfl.quantize"(%0) {qtype = tensor<8x!quant.uniform>} : (tensor<8xf32>) -> tensor<8x!quant.uniform> + return %1 : tensor<8x!quant.uniform> -// CHECK-LABEL: fakeQuantNotFollowedByQuant // CHECK: %0 = "tf.FakeQuantWithMinMaxVars"(%arg0, %cst, %cst_0) {narrow_range = false, num_bits = 3 : i64} -// CHECK: %1 = "tfl.quantize"(%0) {qtype = tensor<8x8x8x8x!quant.uniform>} -// CHECK: %2 = "tfl.dequantize"(%1) : (tensor<8x8x8x8x!quant.uniform>) -// CHECK: return %2 : tensor<8x8x8x8xf32> +// CHECK: %1 = "tfl.quantize"(%0) {qtype = tensor<8x!quant.uniform>} +// CHECK: return %1 } -func @fakeQuantFollowedByQuant(tensor<8x8x8x8xf32>) -> (tensor<8x8x8x8xf32>) { -^bb0(%arg0: tensor<8x8x8x8xf32>): - %arg1 = constant dense<-0.1> : tensor - %arg2 = constant dense<0.2> : tensor - %0 = "tf.FakeQuantWithMinMaxVars"(%arg0, %arg1, %arg2) {num_bits = 3, narrow_range = false} : (tensor<8x8x8x8xf32>, tensor, tensor) -> tensor<8x8x8x8xf32> - %1 = "tfl.quantize"(%0) {qtype = tensor<8x8x8x8x!quant.uniform>} : (tensor<8x8x8x8xf32>) -> tensor<8x8x8x8x!quant.uniform> - %2 = "tfl.dequantize"(%1) : (tensor<8x8x8x8x!quant.uniform>) -> tensor<8x8x8x8xf32> - return %2 : tensor<8x8x8x8xf32> +// CHECK-LABEL: fakeQuantFolded +func @fakeQuantFolded() -> (tensor<8xf32>) { + %in = constant dense<0.0> : tensor<8xf32> + %min = constant dense<0.0> : tensor + %max = constant dense<255.0> : tensor + %mini = "tf.Identity"(%min) : (tensor) -> tensor + %maxi = "tf.Identity"(%max) : (tensor) -> tensor + %rst = "tf.FakeQuantWithMinMaxVars"(%in, %mini, %maxi) {num_bits = 3, narrow_range = false} : (tensor<8xf32>, tensor, tensor) -> tensor<8xf32> + return %rst : tensor<8xf32> -// CHECK-LABEL: fakeQuantFollowedByQuant -// CHECK: %0 = "tf.FakeQuantWithMinMaxVars"(%arg0, %cst, %cst_0) {narrow_range = false, num_bits = 3 : i64} -// CHECK: %1 = "tfl.quantize"(%0) {qtype = tensor<8x8x8x8x!quant.uniform>} -// CHECK: %2 = "tfl.dequantize"(%1) : (tensor<8x8x8x8x!quant.uniform>) -// CHECK: return %2 : tensor<8x8x8x8xf32> +// CHECK: %cst = constant dense<0.000000e+00> : tensor<8xf32> +// CHECK: %0 = "tfl.quantize"(%cst) {qtype = tensor<8x!quant.uniform>} +// CHECK: %1 = "tfl.dequantize"(%0) +// CHECK: return %1 : tensor<8xf32> } -func @fakeQuantVarsNotConst(tensor<8x8x8x8xf32>, tensor, tensor) -> (tensor<8x8x8x8xf32>) { -^bb0(%arg0: tensor<8x8x8x8xf32>, %arg3: tensor, %arg4: tensor): - %1 = "tf.FakeQuantWithMinMaxVars"(%arg0, %arg3, %arg4) {num_bits = 3, narrow_range = false} : (tensor<8x8x8x8xf32>, tensor, tensor) -> tensor<8x8x8x8xf32> - return %1 : tensor<8x8x8x8xf32> +// CHECK-LABEL: fakeQuantNotFolded +func @fakeQuantNotFolded(tensor<8xf32>, tensor, tensor) -> 
(tensor<8xf32>) { +^bb0(%arg0: tensor<8xf32>, %arg3: tensor, %arg4: tensor): + %1 = "tf.FakeQuantWithMinMaxVars"(%arg0, %arg3, %arg4) {num_bits = 3, narrow_range = false} : (tensor<8xf32>, tensor, tensor) -> tensor<8xf32> + return %1 : tensor<8xf32> -// CHECK-LABEL: fakeQuantVarsNotConst -// CHECK: %0 = "tf.FakeQuantWithMinMaxVars"(%arg0, %arg1, %arg2) {narrow_range = false, num_bits = 3 : i64} -// CHECK: return %0 : tensor<8x8x8x8xf32> +// CHECK: %0 = "tf.FakeQuantWithMinMaxVars"(%arg0, %arg1, %arg2) +// CHECK: return %0 : tensor<8xf32> } -func @fakeQuantFollowedByTranspose(tensor<3x3x3x16xf32>, tensor, tensor) -> (tensor<16x3x3x3xf32>) { -^bb0(%arg0: tensor<3x3x3x16xf32>, %arg1: tensor, %arg2: tensor): - %cst_0 = constant dense<[3, 0, 1, 2]> : tensor<4xi32> - %0 = "tf.FakeQuantWithMinMaxVars"(%arg0, %arg1, %arg2) {num_bits = 3, narrow_range = false} : (tensor<3x3x3x16xf32>, tensor, tensor) -> tensor<3x3x3x16xf32> - %1 = "tf.Transpose"(%0, %cst_0): (tensor<3x3x3x16xf32>, tensor<4xi32>) -> tensor<16x3x3x3xf32> - return %1 : tensor<16x3x3x3xf32> - // CHECK-LABEL: fakeQuantFollowedByTranspose -// CHECK: %cst = constant dense<[3, 0, 1, 2]> : tensor<4xi32> -// CHECK: %0 = "tf.Transpose"(%arg0, %cst) : (tensor<3x3x3x16xf32>, tensor<4xi32>) -> tensor<16x3x3x3xf32> -// CHECK: %1 = "tf.FakeQuantWithMinMaxVars"(%0, %arg1, %arg2) {narrow_range = false, num_bits = 3 : i64} -// CHECK: return %1 : tensor<16x3x3x3xf32> -} +func @fakeQuantFollowedByTranspose(tensor<1x2xf32>, tensor, tensor) -> (tensor<2x1xf32>) { +^bb0(%arg0: tensor<1x2xf32>, %arg1: tensor, %arg2: tensor): + %cst_0 = constant dense<[1, 0]> : tensor<2xi32> + %0 = "tf.FakeQuantWithMinMaxVars"(%arg0, %arg1, %arg2) {num_bits = 3, narrow_range = false} : (tensor<1x2xf32>, tensor, tensor) -> tensor<1x2xf32> + %1 = "tf.Transpose"(%0, %cst_0): (tensor<1x2xf32>, tensor<2xi32>) -> tensor<2x1xf32> + return %1 : tensor<2x1xf32> -func @fakeQuantFollowedByReshape(tensor<3x3x3x4xf32>, tensor, tensor) -> (tensor<1x3x3x12xf32>) { -^bb0(%arg0: tensor<3x3x3x4xf32>, %arg1: tensor, %arg2: tensor): - %cst_0 = constant dense<[1, 3, 3, 12]> : tensor<4xi64> - %0 = "tf.FakeQuantWithMinMaxVars"(%arg0, %arg1, %arg2) {num_bits = 3, narrow_range = false} : (tensor<3x3x3x4xf32>, tensor, tensor) -> tensor<3x3x3x4xf32> - %1 = "tf.Reshape"(%0, %cst_0) : (tensor<3x3x3x4xf32>, tensor<4xi64>) -> tensor<1x3x3x12xf32> - return %1 : tensor<1x3x3x12xf32> +// CHECK: %cst = constant +// CHECK: %0 = "tf.Transpose"(%arg0, %cst) +// CHECK: %1 = "tf.FakeQuantWithMinMaxVars"(%0, %arg1, %arg2) +// CHECK: return %1 +} // CHECK-LABEL: fakeQuantFollowedByReshape -// CHECK: %cst = constant dense<[1, 3, 3, 12]> : tensor<4xi64> -// CHECK: %0 = "tf.Reshape"(%arg0, %cst) : (tensor<3x3x3x4xf32>, tensor<4xi64>) -> tensor<1x3x3x12xf32> -// CHECK: %1 = "tf.FakeQuantWithMinMaxVars"(%0, %arg1, %arg2) {narrow_range = false, num_bits = 3 : i64} -// CHECK: return %1 : tensor<1x3x3x12xf32> +func @fakeQuantFollowedByReshape(tensor<1x2xf32>, tensor, tensor) -> (tensor<2x1xf32>) { +^bb0(%arg0: tensor<1x2xf32>, %arg1: tensor, %arg2: tensor): + %cst_0 = constant dense<[2, -1]> : tensor<2xi64> + %0 = "tf.FakeQuantWithMinMaxVars"(%arg0, %arg1, %arg2) {num_bits = 3, narrow_range = false} : (tensor<1x2xf32>, tensor, tensor) -> tensor<1x2xf32> + %1 = "tf.Reshape"(%0, %cst_0) : (tensor<1x2xf32>, tensor<2xi64>) -> tensor<2x1xf32> + return %1 : tensor<2x1xf32> + +// CHECK: %cst = constant +// CHECK: %0 = "tf.Reshape"(%arg0, %cst) +// CHECK-SAME: tensor<2x1xf32> +// CHECK: %1 = "tf.FakeQuantWithMinMaxVars"(%0, %arg1, 
%arg2) +// CHECK: return %1 } -func @identity(tensor<10xi32>) -> tensor<10xi32> { -^bb0(%arg0: tensor<10xi32>): +// CHECK-LABEL: QDQsFollowedByTranspose +func @QDQsFollowedByTranspose(tensor<1x2xf32>) -> (tensor<2x1xf32>) { +^bb0(%arg0: tensor<1x2xf32>): + %cst_0 = constant dense<[1, 0]> : tensor<2xi32> + %0 = "tfl.quantize"(%arg0){qtype = tensor<1x2x!quant.uniform>}: (tensor<1x2xf32>) -> (tensor<1x2x!quant.uniform>) + %1 = "tfl.dequantize"(%0): (tensor<1x2x!quant.uniform>) -> (tensor<1x2xf32>) + %2 = "tf.Transpose"(%1, %cst_0): (tensor<1x2xf32>, tensor<2xi32>) -> tensor<2x1xf32> + return %2 : tensor<2x1xf32> + +// CHECK: %cst = constant +// CHECK: %0 = "tf.Transpose" +// CHECK-SAME: -> tensor<2x1xf32> +// CHECK: %1 = "tfl.quantize"(%0) {qtype = tensor<2x1x!quant.uniform>} +// CHECK-SAME: -> tensor<2x1x!quant.uniform> +// CHECK: %2 = "tfl.dequantize"(%1) +// CHECK-SAME: -> tensor<2x1xf32> +// CHECK: return %2 +} + +// CHECK-LABEL: QDQFollowedByReshape +func @QDQFollowedByReshape(tensor<1x2xf32>) -> (tensor<2x1xf32>) { +^bb0(%arg0: tensor<1x2xf32>): + %cst_0 = constant dense<[2, 1]> : tensor<2xi32> + %0 = "tfl.quantize"(%arg0){qtype = tensor<1x2x!quant.uniform>}: (tensor<1x2xf32>) -> (tensor<1x2x!quant.uniform>) + %1 = "tfl.dequantize"(%0): (tensor<1x2x!quant.uniform>) -> (tensor<1x2xf32>) + %2 = "tf.Reshape"(%1, %cst_0): (tensor<1x2xf32>, tensor<2xi32>) -> tensor<2x1xf32> + return %2 : tensor<2x1xf32> + +// CHECK: %cst = constant +// CHECK: %0 = "tf.Reshape" +// CHECK-SAME: -> tensor<2x1xf32> +// CHECK: %1 = "tfl.quantize"(%0) {qtype = tensor<2x1x!quant.uniform>} +// CHECK-SAME: -> tensor<2x1x!quant.uniform> +// CHECK: %2 = "tfl.dequantize"(%1) +// CHECK-SAME: -> tensor<2x1xf32> +// CHECK: return %2 +} + +// CHECK-LABEL: QDQFollowedByRank +func @QDQFollowedByRank(%arg0: tensor<1x2xf32>) -> (tensor) { + %0 = "tfl.quantize"(%arg0){qtype = tensor<1x2x!quant.uniform>}: (tensor<1x2xf32>) -> (tensor<1x2x!quant.uniform>) + %1 = "tfl.dequantize"(%0): (tensor<1x2x!quant.uniform>) -> (tensor<1x2xf32>) + %2 = "tf.Rank"(%1): (tensor<1x2xf32>) -> tensor + return %2 : tensor + +// CHECK-NEXT: %[[R:.*]] = "tf.Rank"(%arg0) +// CHECK-NEXT: return %[[R]] : tensor +} + +// CHECK-LABEL: fakeQuantWithConv2D +func @fakeQuantWithConv2D(tensor<256x32x32x3xf32>) -> (tensor<256x30x30x16xf32>) { +^bb0(%arg: tensor<256x32x32x3xf32>) : + %in = constant dense<0.0> : tensor<3x3x3x16xf32> + %min = constant dense<0.0> : tensor + %max = constant dense<255.0> : tensor + %mini = "tf.Identity"(%min) : (tensor) -> tensor + %maxi = "tf.Identity"(%max) : (tensor) -> tensor + %fq = "tf.FakeQuantWithMinMaxVars"(%in, %mini, %maxi) {num_bits = 3, narrow_range = false} : (tensor<3x3x3x16xf32>, tensor, tensor) -> tensor<3x3x3x16xf32> + %rst = "tf.Conv2D"(%arg, %fq) {T = "tfdtype$DT_FLOAT", data_format = "NHWC", dilations = [1, 2, 3, 1], padding = "SAME", strides = [1, 4, 5, 1]} : (tensor<256x32x32x3xf32>, tensor<3x3x3x16xf32>) -> tensor<256x30x30x16xf32> + return %rst : tensor<256x30x30x16xf32> + +// CHECK: %cst = constant dense<0.000000e+00> : tensor<16xf32> +// CHECK: %cst_0 = constant dense<0.000000e+00> : tensor<16x3x3x3xf32> +// CHECK: %0 = "tfl.quantize"(%cst_0) {qtype = tensor<16x3x3x3x!quant.uniform>} +// CHECK: %1 = "tfl.dequantize"(%0) +// CHECK: %2 = "tfl.conv_2d"(%arg0, %1, %cst) +// CHECK: return %2 +} + +// CHECK-LABEL: fakeQuantWithDepthwiseConv2D +func @fakeQuantWithDepthwiseConv2D(tensor<256x32x32x3xf32>) -> (tensor<256x30x30x16xf32>) { +^bb0(%arg: tensor<256x32x32x3xf32>) : + %in = constant dense<0.0> : 
tensor<3x3x3x16xf32> + %min = constant dense<0.0> : tensor + %max = constant dense<255.0> : tensor + %mini = "tf.Identity"(%min) : (tensor) -> tensor + %maxi = "tf.Identity"(%max) : (tensor) -> tensor + %fq = "tf.FakeQuantWithMinMaxVars"(%in, %mini, %maxi) {num_bits = 3, narrow_range = false} : (tensor<3x3x3x16xf32>, tensor, tensor) -> tensor<3x3x3x16xf32> + %rst = "tf.DepthwiseConv2dNative"(%arg, %fq) {T = "tfdtype$DT_FLOAT", data_format = "NHWC", dilations = [1, 2, 3, 1], padding = "SAME", strides = [1, 4, 5, 1]} : (tensor<256x32x32x3xf32>, tensor<3x3x3x16xf32>) -> tensor<256x30x30x16xf32> + return %rst : tensor<256x30x30x16xf32> + +// CHECK: %cst = constant dense<0.000000e+00> : tensor<48xf32> +// CHECK: %cst_0 = constant dense<0.000000e+00> : tensor<1x3x3x48xf32> +// CHECK: %0 = "tfl.quantize"(%cst_0) {qtype = tensor<1x3x3x48x!quant.uniform>} +// CHECK: %1 = "tfl.dequantize"(%0) +// CHECK: %2 = "tfl.depthwise_conv_2d"(%arg0, %1, %cst) +// CHECK: return %2 +} + +func @identity(%arg0: tensor<10xi32>, %arg1: tensor<20xi32>, %arg2: tensor<30xi32>) -> (tensor<10xi32>, tensor<20xi32>, tensor<30xi32>) { %0 = "tf.Identity"(%arg0) : (tensor<10xi32>) -> tensor<10xi32> - return %0: tensor<10xi32> + %1:2 = "tf.IdentityN"(%arg1,%arg2) : (tensor<20xi32>, tensor<30xi32>) -> (tensor<20xi32>, tensor<30xi32>) + return %0, %1#0, %1#1: tensor<10xi32>, tensor<20xi32>, tensor<30xi32> // CHECK-LABEL: identity -// CHECK: return %arg0 +// CHECK: return %arg0, %arg1, %arg2 } @@ -195,3 +332,19 @@ func @matmulNoTransposeB(%arg0: tensor<1x1280xf32>, %arg1: tensor<1280x1000xf32> // CHECK: %7 = "tf.Transpose"(%arg1, %6) : (tensor<1280x1000xf32>, tensor) -> tensor<*xf32> // CHECK: %8 = "tf.MatMul"(%3, %7) {transpose_a = false, transpose_b = true} : (tensor<*xf32>, tensor<*xf32>) -> tensor<1x1000xf32> } + +func @snapshot(%arg0: tensor<3xi32>) -> tensor<3xi32> { + %0 = "tf.Snapshot"(%arg0) : (tensor<3xi32>) -> tensor<3xi32> + return %0 : tensor<3xi32> + // Should be converted to Identity and then from Identity to value + // CHECK-LABEL: snapshot + // CHECK: return %arg0 : tensor<3xi32> +} + +func @stop_gradient(%arg0: tensor<3xi32>) -> tensor<3xi32> { + %0 = "tf.StopGradient"(%arg0) : (tensor<3xi32>) -> tensor<3xi32> + return %0 : tensor<3xi32> + // Should be converted to Identity and then from Identity to value + // CHECK-LABEL: stop_gradient + // CHECK: return %arg0 : tensor<3xi32> +} diff --git a/tensorflow/compiler/mlir/lite/tests/quantize.mlir b/tensorflow/compiler/mlir/lite/tests/quantize.mlir index b3b439b2b8a..dc24b1004d7 100644 --- a/tensorflow/compiler/mlir/lite/tests/quantize.mlir +++ b/tensorflow/compiler/mlir/lite/tests/quantize.mlir @@ -82,6 +82,35 @@ func @QuantizeDepthwiseConv2D(tensor<1x224x224x3x!quant.uniform>) -> tensor<1x112x112x32x!quant.uniform> { +^bb0(%arg0: tensor<1x224x224x3x!quant.uniform>): + %cst = constant dense<-1.23697901> : tensor<32xf32> + %2 = "tfl.dequantize"(%arg0) : (tensor<1x224x224x3x!quant.uniform>) -> tensor<1x224x224x3xf32> + %3 = "tfl.pseudo_qconst"() {qtype = tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216:151>>, value = dense<-76> : tensor<32x3x3x3xi8>} : () -> tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216:151>> + %4 = "tfl.dequantize"(%3) : (tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216:151>>) -> tensor<32x3x3x3xf32> + %5 = "tfl.fully_connected"(%2, %4, %cst) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<1x224x224x3xf32>, tensor<32x3x3x3xf32>, tensor<32xf32>) -> tensor<1x112x112x32xf32> + 
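+  // Note on the expected rewrite (a sketch of the usual TFLite convention, not
+  // taken verbatim from this patch): the float bias becomes a
+  // "tfl.pseudo_qconst" with i32 storage whose scale is roughly the product of
+  // the input scale and the weight scale (0.021826678373682216 above); with
+  // the scales used in this test, -1.23697901 quantizes to the dense<-7254>
+  // value in the CHECK lines below.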
%6 = "tfl.quantize"(%5) {qtype = tensor<1x112x112x32x!quant.uniform>} : (tensor<1x112x112x32xf32>) -> tensor<1x112x112x32x!quant.uniform> + return %6 : tensor<1x112x112x32x!quant.uniform> + +// CHECK: %0 = "tfl.pseudo_qconst"() {qtype = tensor<32x!quant.uniform>, value = dense<-7254> : tensor<32xi32>} +// CHECK: %1 = "tfl.pseudo_qconst"() {qtype = tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216:151>>, value = dense<-76> : tensor<32x3x3x3xi8>} +// CHECK: %2 = "tfl.fully_connected"(%arg0, %1, %0) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} +// CHECK: return %2 +} + +// CHECK-LABEL: QuantizeNoBiasFullyConnected +func @QuantizeNoBiasFullyConnected(%arg0: tensor<3x!quant.uniform>, %arg1: tensor<3x!quant.uniform:f32, 1.0>>, %arg2: none) -> tensor<3x!quant.uniform> { + %0 = "tfl.dequantize"(%arg0) : (tensor<3x!quant.uniform>) -> tensor<3xf32> + %1 = "tfl.dequantize"(%arg1) : (tensor<3x!quant.uniform:f32, 1.0>>) -> tensor<3xf32> + %2 = "tfl.fully_connected"(%0, %1, %arg2) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<3xf32>, tensor<3xf32>, none) -> tensor<3xf32> + %3 = "tfl.quantize"(%2) {qtype = tensor<3x!quant.uniform>} : (tensor<3xf32>) -> tensor<3x!quant.uniform> + return %3 : tensor<3x!quant.uniform> + +// CHECK-NEXT: %[[fc:.*]] = "tfl.fully_connected"(%arg0, %arg1, %arg2) +// CHECK-NEXT: return %[[fc]] +} + // CHECK-LABEL: QuantizeAveragePool2D func @QuantizeAveragePool2D(tensor<1x6x6x16x!quant.uniform>) -> tensor<1x1x1x16xf32> { ^bb0(%arg0: tensor<1x6x6x16x!quant.uniform>): @@ -118,6 +147,18 @@ func @QuantizeSoftmax(tensor<1x6x6x16x!quant.uniform>) // CHECK: return %1 : tensor<1x6x6x16xf32> } +// CHECK-LABEL: QuantizeLogistic +func @QuantizeLogistic(tensor<1x6x6x16x!quant.uniform>) -> tensor<1x6x6x16xf32> { +^bb0(%arg0: tensor<1x6x6x16x!quant.uniform>): + %0 = "tfl.dequantize"(%arg0) : (tensor<1x6x6x16x!quant.uniform>) -> tensor<1x6x6x16xf32> + %1 = "tfl.logistic"(%0) : (tensor<1x6x6x16xf32>) -> tensor<1x6x6x16xf32> + return %1 : tensor<1x6x6x16xf32> + +// CHECK: %0 = "tfl.logistic"(%arg0) : (tensor<1x6x6x16x!quant.uniform>) +// CHECK: %1 = "tfl.dequantize"(%0) : (tensor<1x6x6x16x!quant.uniform>) +// CHECK: return %1 +} + // CHECK-LABEL: QuantizeAdd func @QuantizeAdd(tensor<1x56x56x24x!quant.uniform>, tensor<1x56x56x24x!quant.uniform>) -> tensor<1x56x56x24x!quant.uniform> { ^bb0(%arg0: tensor<1x56x56x24x!quant.uniform>, %arg1: tensor<1x56x56x24x!quant.uniform>): @@ -167,4 +208,16 @@ func @QuantizeMaxPool2D(tensor<1x6x6x16x!quant.uniform // CHECK: %0 = "tfl.max_pool_2d"(%arg0) {filter_height = 1 : i32, filter_width = 1 : i32, fused_activation_function = "RELU6", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x6x6x16x!quant.uniform>) -> tensor<1x1x1x16x!quant.uniform> // CHECK: %1 = "tfl.dequantize"(%0) : (tensor<1x1x1x16x!quant.uniform>) -> tensor<1x1x1x16xf32> // CHECK: return %1 : tensor<1x1x1x16xf32> +} + +// CHECK-LABEL: QuantizeSplit +func @QuantizeSplit(%arg: tensor<4x!quant.uniform>, %cst: tensor) -> (tensor<2x!quant.uniform>,tensor<2x!quant.uniform>) { + %0 = "tfl.dequantize"(%arg) : (tensor<4x!quant.uniform>) -> tensor<4xf32> + %1:2 = "tfl.split"(%cst, %0) {num_splits = 2 : i32} : (tensor, tensor<4xf32>) -> (tensor<2xf32>, tensor<2xf32>) + %2 = "tfl.quantize"(%1#0) {qtype = tensor<2x!quant.uniform>} : (tensor<2xf32>) -> tensor<2x!quant.uniform> + %3 = "tfl.quantize"(%1#1) {qtype = tensor<2x!quant.uniform>} : (tensor<2xf32>) -> tensor<2x!quant.uniform> + 
return %2, %3 : tensor<2x!quant.uniform>, tensor<2x!quant.uniform> + +// CHECK: %0:2 = "tfl.split"(%arg1, %arg0) {num_splits = 2 : i32} : (tensor, tensor<4x!quant.uniform>) +// CHECK: return %0#0, %0#1 } \ No newline at end of file diff --git a/tensorflow/compiler/mlir/lite/tests/trim-functions-tf.mlir b/tensorflow/compiler/mlir/lite/tests/trim-functions-tf.mlir new file mode 100644 index 00000000000..95844ccad1c --- /dev/null +++ b/tensorflow/compiler/mlir/lite/tests/trim-functions-tf.mlir @@ -0,0 +1,21 @@ +// RUN: tf-opt -tfl-trim-funcs-tf -tfl-trim-funcs-whitelist="bar,foobar" %s | FileCheck %s --dump-input-on-failure + +func @foo(%arg0: tensor<1x4xf32>, %arg1: tensor<1x4xf32>) -> tensor<1x4xf32> { + %0 = "tfl.pseudo_input"(%arg0) : (tensor<1x4xf32>) -> tensor<1x4xf32> + return %0 : tensor<1x4xf32> +} + +func @bar(%arg0: tensor<2x4xf32>, %arg1: tensor<2x4xf32>) -> tensor<2x4xf32> { + %0 = "tfl.pseudo_input"(%arg0) : (tensor<2x4xf32>) -> tensor<2x4xf32> + return %0 : tensor<2x4xf32> +} + +func @foobar(%arg0: tensor<1x4xf32>, %arg1: tensor<1x4xf32>) -> tensor<1x4xf32> { + %0 = "tfl.pseudo_input"(%arg0) : (tensor<1x4xf32>) -> tensor<1x4xf32> + return %0 : tensor<1x4xf32> +} + +// CHECK-DAG: func @main +// CHECK-DAG: func @foobar +// CHECK-NOT: func @foo +// CHECK-NOT: func @bar \ No newline at end of file diff --git a/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc b/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc new file mode 100644 index 00000000000..25d15614ef6 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc @@ -0,0 +1,88 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/lite/tf_tfl_passes.h" + +#include "mlir/IR/Attributes.h" // TF:local_config_mlir +#include "mlir/IR/Module.h" // TF:local_config_mlir +#include "mlir/Pass/Pass.h" // TF:local_config_mlir +#include "mlir/Transforms/Passes.h" // TF:local_config_mlir +#include "tensorflow/compiler/mlir/lite/transforms/passes.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/decode_constant.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" +#include "tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h" + +namespace mlir { +/// Create a pass to convert from the TFExecutor to the TF control dialect. 
+std::unique_ptr CreateTFExecutorToControlDialectConversion(); +} // namespace mlir + +namespace tensorflow { + +bool ShouldRunQuantizePasses(mlir::ModuleOp m) { + if (mlir::FuncOp main_fn = m.lookupSymbol("main")) { + return main_fn.getAttrOfType("tf.quantize") != + mlir::Attribute(); + } + return false; +} + +void AddTFToTFLConversionPasses(const mlir::TFL::PassConfig& pass_config, + mlir::PassManager* pass_manager) { + pass_manager->addPass(mlir::CreateTFExecutorToControlDialectConversion()); + pass_manager->addPass(mlir::TFControlFlow::CreateRaiseTFControlFlowPass()); + // Ophint extraction will happen after island extraction pass. + pass_manager->addPass(mlir::TFL::CreateExtractOphintPass()); + // Convert composite op pass will happen after ophint extraction pass. + pass_manager->addPass(mlir::TFL::CreateLegalizeOphintFuncOpPass()); + + if (pass_config.lower_tensor_list_ops) { + // Execute this pass before `CanonicalizerPass` in case some TensorList + // ops are constant folded into variant types. + // TODO(b/137125056): Move this pass after `CanonicalizerPass` after we + // handle constant ops that produce `TensorList`. + // TODO(haoliang): Add this pass by default. + pass_manager->addPass(mlir::TFL::CreateLowerStaticTensorListPass()); + } + + // TODO(jpienaar): Revise post dialect constants. + pass_manager->addPass(mlir::TF::CreateDecodeConstantPass()); + // Canonicalization includes const folding, which is utilized here to optimize + // away ops that can't get constant folded after PrepareTF pass. For example, + // tf.Conv2D is split into tf.Transpose and tfl.Conv2D. + pass_manager->addPass(mlir::createCanonicalizerPass()); + + // The below passes only make sense if Builtin TFLite ops are enabled + // for emission. + if (pass_config.emit_builtin_tflite_ops) { + // Prepare for TFLite dialect, rerun canonicalization, and then legalize to + // the TFLite dialect. + pass_manager->addPass(mlir::TFL::CreatePrepareTFPass()); + pass_manager->addPass(mlir::createCanonicalizerPass()); + pass_manager->addPass(mlir::TFL::CreateLegalizeTFPass()); + pass_manager->addPass(mlir::TFL::CreateOptimizePass()); + if (pass_config.run_quantize) { + pass_manager->addPass(mlir::TFL::CreatePrepareQuantizePass( + /*quantize_sign=*/false)); + pass_manager->addPass(mlir::TFL::CreateQuantizePass()); + pass_manager->addPass(mlir::TFL::CreatePostQuantizePass( + pass_config.emit_quant_adaptor_ops)); + } + pass_manager->addPass(mlir::createCanonicalizerPass()); + pass_manager->addPass(mlir::createCSEPass()); + } +} + +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/lite/tf_tfl_passes.h b/tensorflow/compiler/mlir/lite/tf_tfl_passes.h new file mode 100644 index 00000000000..653e4ec5245 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/tf_tfl_passes.h @@ -0,0 +1,39 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_TF_TFL_PASSES_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_TF_TFL_PASSES_H_ + +#include "mlir/IR/Module.h" // TF:local_config_mlir +#include "mlir/Pass/PassManager.h" // TF:local_config_mlir +#include "tensorflow/compiler/mlir/lite/common/tfl_pass_config.h" + +namespace tensorflow { + +// Quantization passess will run only when the user specifies a quantized type +// in the `-tf-inference-type` flag, which is converted to the function +// attribute "tf.quantize" by the importer module. +// TODO(fengliuai): switch to the cmd flag once the flags are moved to this +// file with main method. +bool ShouldRunQuantizePasses(mlir::ModuleOp m); + +// Add the TF to TFLite passes, specified in the pass_config, into a +// pass_manager. +void AddTFToTFLConversionPasses(const mlir::TFL::PassConfig& pass_config, + mlir::PassManager* pass_manager); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_TF_TFL_PASSES_H_ diff --git a/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc b/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc index 9656abb1611..33044a63271 100644 --- a/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc +++ b/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc @@ -16,7 +16,6 @@ limitations under the License. #include "llvm/Support/CommandLine.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/InitLLVM.h" -#include "llvm/Support/PrettyStackTrace.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/ToolOutputFile.h" #include "mlir/IR/Diagnostics.h" // TF:local_config_mlir @@ -24,7 +23,10 @@ limitations under the License. #include "mlir/IR/MLIRContext.h" // TF:local_config_mlir #include "mlir/IR/Module.h" // TF:local_config_mlir #include "mlir/Support/FileUtilities.h" // TF:local_config_mlir +#include "tensorflow/compiler/mlir/init_mlir.h" +#include "tensorflow/compiler/mlir/lite/common/tfl_pass_config.h" #include "tensorflow/compiler/mlir/lite/flatbuffer_translate.h" +#include "tensorflow/compiler/mlir/lite/tf_tfl_passes.h" #include "tensorflow/compiler/mlir/lite/tf_tfl_translate_cl.h" #include "tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.h" #include "tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_cl.h" @@ -37,8 +39,8 @@ using mlir::FuncOp; using mlir::MLIRContext; using mlir::ModuleOp; using stream_executor::port::StatusOr; -using tensorflow::Status; +// Debugging flag to print function mapping in the flatbuffer. // NOLINTNEXTLINE static llvm::cl::opt print_function_result_mapping( "print-function-result-mapping", @@ -99,9 +101,8 @@ static int PrintFunctionResultMapping(const std::string &result, } int main(int argc, char **argv) { - llvm::PrettyStackTraceProgram x(argc, argv); // TODO(jpienaar): Revise the command line option parsing here. - llvm::InitLLVM y(argc, argv); + tensorflow::InitMlir y(&argc, &argv); // TODO(antiagainst): We are pulling in multiple transformations as follows. // Each transformation has its own set of command-line options; options of one @@ -112,14 +113,9 @@ int main(int argc, char **argv) { // We need to disable duplicated ones to provide a cleaner command-line option // interface. That also means we need to relay the value set in one option to // all its aliases. - llvm::cl::ParseCommandLineOptions( argc, argv, "TF GraphDef to TFLite FlatBuffer converter\n"); - // TODO(ashwinm): Enable command line parsing for both sides. 
- int fake_argc = 1; - tensorflow::port::InitMain(argv[0], &fake_argc, &argv); - MLIRContext context; llvm::SourceMgr source_mgr; mlir::SourceMgrDiagnosticHandler sourceMgrHandler(source_mgr, &context); @@ -135,11 +131,22 @@ int main(int argc, char **argv) { // message. So we can just return here. if (!module.ok()) return kTrFailure; + mlir::PassManager pm; + bool run_quantize = + tensorflow::ShouldRunQuantizePasses(module.ValueOrDie().get()); + mlir::TFL::PassConfig pass_config; + pass_config.emit_builtin_tflite_ops = emit_builtin_tflite_ops; + pass_config.emit_quant_adaptor_ops = emit_quant_adaptor_ops; + pass_config.lower_tensor_list_ops = lower_tensor_list_ops; + pass_config.run_quantize = run_quantize; + + tensorflow::AddTFToTFLConversionPasses(pass_config, &pm); + std::string result; - auto status = tensorflow::ConvertTFControlFlowToTFLOrFlatbuffer( + auto status = tensorflow::ConvertTFExecutorToTFLOrFlatbuffer( module.ValueOrDie().get(), output_mlir, emit_builtin_tflite_ops, emit_select_tf_ops, emit_custom_ops, emit_quant_adaptor_ops, - lower_tensor_list_ops, &result); + lower_tensor_list_ops, &result, &pm); if (!status.ok()) return kTrFailure; auto output = mlir::openOutputFile(output_file_name); diff --git a/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc b/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc index bc2f36beb4d..fed9f1739ad 100644 --- a/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc +++ b/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc @@ -31,6 +31,11 @@ limitations under the License. #include "tensorflow/core/framework/op_def.pb.h" #include "tensorflow/core/lib/core/errors.h" +namespace mlir { +/// Create a pass to convert from the TFExecutor to the TF control dialect. +std::unique_ptr CreateTFExecutorToControlDialectConversion(); +} // namespace mlir + namespace tensorflow { using mlir::MLIRContext; @@ -79,79 +84,23 @@ StatusOr LoadFromGraphdefOrMlirSource( return tensorflow::GraphdefToSplattedMlirTranslateFunction( input_filename, debug_info_file, input_arrays, input_dtypes, input_shapes, output_arrays, inference_type, min_values, max_values, - prune_unused_nodes, context); + prune_unused_nodes, /*convert_legacy_fed_inputs=*/true, + /*graph_as_function=*/false, context); } return tensorflow::GraphdefToMlirTranslateFunction( input_filename, debug_info_file, input_arrays, input_dtypes, input_shapes, output_arrays, inference_type, min_values, max_values, prune_unused_nodes, - context); + /*convert_legacy_fed_inputs=*/true, /*graph_as_function=*/false, context); } -bool ShouldRunQuantizePasses(mlir::ModuleOp m) { - if (mlir::FuncOp main_fn = m.lookupSymbol("main")) { - return main_fn.getAttrOfType("tf.quantize") != - mlir::Attribute(); - } - return false; -} - -void AddTFToTFLConversionPasses(bool emit_builtin_tflite_ops, bool run_quantize, - bool emit_quant_adaptor_ops, - bool lower_tensor_list_ops, - mlir::PassManager *pass_manager) { - pass_manager->addPass(mlir::TFControlFlow::CreateRaiseTFControlFlowPass()); - - if (lower_tensor_list_ops) { - // Execute this pass before `CanonicalizerPass` in case some TensorList - // ops are constant folded into variant types. - // TODO(b/137125056): Move this pass after `CanonicalizerPass` after we - // handle constant ops that produce `TensorList`. - // TODO(haoliang): Add this pass by default. - pass_manager->addPass(mlir::TFL::CreateLowerStaticTensorListPass()); - } - - // TODO(jpienaar): Revise post dialect constants. 
- pass_manager->addPass(mlir::TF::CreateDecodeConstantPass()); - // Canonicalization includes const folding, which is utilized here to optimize - // away ops that can't get constant folded after PrepareTF pass. For example, - // tf.Conv2D is split into tf.Transpose and tfl.Conv2D. - pass_manager->addPass(mlir::createCanonicalizerPass()); - - // The below passes only make sense if Builtin TFLite ops are enabled - // for emission. - if (emit_builtin_tflite_ops) { - // Prepare for TFLite dialect, rerun canonicalization, and then legalize to - // the TFLite dialect. - pass_manager->addPass(mlir::TFL::CreatePrepareTFPass()); - pass_manager->addPass(mlir::createCanonicalizerPass()); - pass_manager->addPass(mlir::TFL::CreateLegalizeTFPass()); - pass_manager->addPass(mlir::TFL::CreateOptimizePass()); - if (run_quantize) { - pass_manager->addPass(mlir::TFL::CreatePrepareQuantizePass( - /*quantize_sign=*/false)); - pass_manager->addPass(mlir::TFL::CreateQuantizePass()); - pass_manager->addPass( - mlir::TFL::CreatePostQuantizePass(emit_quant_adaptor_ops)); - } - pass_manager->addPass(mlir::createCanonicalizerPass()); - pass_manager->addPass(mlir::createCSEPass()); - } -} - -Status ConvertTFControlFlowToTFLOrFlatbuffer( +Status ConvertTFExecutorToTFLOrFlatbuffer( mlir::ModuleOp module, bool export_to_mlir, bool emit_builtin_tflite_ops, bool emit_select_tf_ops, bool emit_custom_ops, bool emit_quant_adaptor_ops, - bool lower_tensor_list_ops, std::string *result) { + bool lower_tensor_list_ops, std::string *result, + mlir::PassManager *pass_manager) { mlir::StatusScopedDiagnosticHandler statusHandler(module.getContext(), /*propagate=*/true); - mlir::PassManager pm; - bool run_quantize = ShouldRunQuantizePasses(module); - - AddTFToTFLConversionPasses(emit_builtin_tflite_ops, run_quantize, - emit_quant_adaptor_ops, lower_tensor_list_ops, - &pm); - - if (failed(pm.run(module))) { + if (failed(pass_manager->run(module))) { return statusHandler.ConsumeStatus(); } diff --git a/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.h b/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.h index 68ab674872f..2979e4617b0 100644 --- a/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.h +++ b/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.h @@ -41,37 +41,16 @@ LoadFromGraphdefOrMlirSource( bool prune_unused_nodes, llvm::SourceMgr* source_mgr, mlir::MLIRContext* context); -// Quantization passess will run only when the user specifies a quantized type -// in the `-tf-inference-type` flag, which is converted to the function -// attribute "tf.quantize" by the importer module. -// TODO(fengliuai): switch to the cmd flag once the flags are moved to this -// file with main method. -bool ShouldRunQuantizePasses(mlir::ModuleOp m); - -// Add the MLIR passes that convert TF control flow dialect to TF Lite dialect -// to a MLIR `pass_manager`. These passes first raise the control flow in the TF -// control flow dialect, decode the constant tensors, and then legalize the -// module to TF Lite dialect with some optimizations afterwards. -// If `emit_builtin_tflite_ops` is true, TF Lite legalization passes will be -// added, which produces TF Lite ops. If `run_quantize` is true, quantization -// passes will be added. If `emit_quant_adaptor_ops` is true, Quantize and -// Dequantize ops are added to the inputs and outputs of the quantized model. -// If `lower_tensor_list_ops` is true, tensorlist ops will be lowered to basic -// TF ops before legalization to TF Lite dialect. 
-void AddTFToTFLConversionPasses(bool emit_builtin_tflite_ops, bool run_quantize, - bool emit_quant_adaptor_ops, - bool lower_tensor_list_ops, - mlir::PassManager* pass_manager); - -// Taking a MLIR module in TF control flow dialect and a set of parameters, +// Taking a MLIR module in TF executor dialect and a set of parameters, // applies a set of passes to convert the module to TF Lite dialect and // serializes the result to a string. Depending on an attribute in the module // main function, Quantization is applied. If `export_to_mlir` is true, the // result is exported in MLIR text format, otherwise exported in flat buffer. -Status ConvertTFControlFlowToTFLOrFlatbuffer( +Status ConvertTFExecutorToTFLOrFlatbuffer( mlir::ModuleOp module, bool export_to_mlir, bool emit_builtin_tflite_ops, bool emit_select_tf_ops, bool emit_custom_ops, bool emit_quant_adaptor_ops, - bool lower_tensor_list_ops, std::string* result); + bool lower_tensor_list_ops, std::string* result, + mlir::PassManager* pass_manager); } // namespace tensorflow #endif // TENSORFLOW_COMPILER_MLIR_LITE_TF_TO_TFL_FLATBUFFER_H_ diff --git a/tensorflow/compiler/mlir/lite/transforms/extract_ophint.cc b/tensorflow/compiler/mlir/lite/transforms/extract_ophint.cc new file mode 100644 index 00000000000..b6a898e6cda --- /dev/null +++ b/tensorflow/compiler/mlir/lite/transforms/extract_ophint.cc @@ -0,0 +1,595 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include +#include + +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/Support/Casting.h" +#include "mlir/Analysis/LoopAnalysis.h" // TF:local_config_mlir +#include "mlir/Dialect/StandardOps/Ops.h" // TF:local_config_mlir +#include "mlir/IR/Attributes.h" // TF:local_config_mlir +#include "mlir/IR/Block.h" // TF:local_config_mlir +#include "mlir/IR/Builders.h" // TF:local_config_mlir +#include "mlir/IR/Function.h" // TF:local_config_mlir +#include "mlir/IR/MLIRContext.h" // TF:local_config_mlir +#include "mlir/IR/Module.h" // TF:local_config_mlir +#include "mlir/IR/Operation.h" // TF:local_config_mlir +#include "mlir/IR/OperationSupport.h" // TF:local_config_mlir +#include "mlir/IR/PatternMatch.h" // TF:local_config_mlir +#include "mlir/IR/StandardTypes.h" // TF:local_config_mlir +#include "mlir/IR/SymbolTable.h" // TF:local_config_mlir +#include "mlir/IR/Types.h" // TF:local_config_mlir +#include "mlir/IR/Value.h" // TF:local_config_mlir +#include "mlir/Pass/Pass.h" // TF:local_config_mlir +#include "mlir/Pass/PassRegistry.h" // TF:local_config_mlir +#include "mlir/Support/Functional.h" // TF:local_config_mlir +#include "mlir/Support/LLVM.h" // TF:local_config_mlir +#include "mlir/Support/LogicalResult.h" // TF:local_config_mlir +#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/lite/transforms/passes.h" +#include "tensorflow/compiler/mlir/lite/utils/attribute_utils.h" +#include "tensorflow/compiler/mlir/lite/utils/validators.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" +#include "tensorflow/core/platform/logging.h" + +namespace mlir { +namespace TFL { +namespace { + +constexpr char kTfLiteFunctionName[] = "_tflite_function_name"; +constexpr char kTfLiteFunctionUUID[] = "_tflite_function_uuid"; +constexpr char kTfLiteFunctionInputIndex[] = "_tflite_function_input_index"; +constexpr char kTfLiteFunctionOutputIndex[] = "_tflite_function_output_index"; +constexpr char kTfLiteFunctionSortIndex[] = "_tflite_function_sort_index"; +constexpr char kTfLiteFunctionAggregate[] = "_tflite_function_aggregate"; + +constexpr char kStrategyNone[] = "None"; +constexpr char kStrategyStack[] = "stack"; +constexpr char kStrategyFirst[] = "first"; +constexpr char kStrategyLast[] = "last"; + +// A Ophinted op typically looks like below" +// +// InputOp1 InputOp2 InputOp3 +// / \ | | +// val1 val2 val3 val4 +// | | | | +// identOp1 identOp2 identOp3 identOp4 +// \ | | / +// \ | | / +// .... a bunch of operations (needs to be fused) ... +// / \ +// / \ +// identOp1 (output) identOp2 (output) +// | | +// Other ops Other ops +// +// +// In this pass, we are trying to convert them into the following format: +// +// || +// || +// \ / +// +// InputOp1 InputOp2 InputOp3 +// / \ | / +// val1 val2 val3 val4 +// \ | | / +// PackOp | / +// \ | | / +// \ | | / +// Call funcOp (fusedOp - name like 'UnidirectionalSequenceRNN') +// (The funcOp will be inserted at the bottom of the module, also +// . note every funcOp will be unique.) +// | +// UnpackOp +// / \ +// / \ +// Other ops Other ops +struct OphintCompositeOp { + // OphintCompositeOp is a conceptually "composite op" which will be converted + // to a "fused op" later. 
+ // + // As a "composite op", it has "inputs" and "outputs", and all the inputs + // and outputs are annotated by special-annotated identity ops. + // + // All inputs and outputs need to be processed based on different strategies, + // See all the different strategies under + // tensorflow/lite/python/op_hint.py + // + // For example, "stack" strategy means we need to pack the inputs together + // or unpack the outputs. + public: + OphintCompositeOp(StringRef uuid, StringRef function_name) + : uuid(uuid), function_name(function_name) {} + + void AddInput(int index, Operation* op, StringRef aggregation, + int sort_index) { + auto it = inputs.find(index); + if (it == inputs.end()) { + AggregatedOperand operand; + operand.aggregation = aggregation; + it = inputs.insert({index, operand}).first; + } + // TODO(renjieliu): check aggregation strategy stays the same. + // Also needs to make sure if aggregation strategy is "None" we should not + // have more than one op. + it->second.ops[sort_index] = op; + } + + void AddOutput(int index, Operation* op, llvm::StringRef aggregation, + int sort_index) { + auto it = outputs.find(index); + if (it == outputs.end()) { + AggregatedOperand operand; + operand.aggregation = aggregation; + it = outputs.insert({index, operand}).first; + } + // TODO(renjieliu): check aggregation strategy stays the same. + // Also needs to make sure if aggregation strategy is "None" we should not + // have more than one op. + it->second.ops[sort_index] = op; + } + + std::vector GetAllInputOps() { + std::vector all_input_ops; + for (const auto& kv : inputs) { + if (kv.second.aggregation == kStrategyFirst) { + all_input_ops.push_back(kv.second.ops.at(0)); + continue; + } + for (const auto& operandKv : kv.second.ops) { + all_input_ops.push_back(operandKv.second); + } + } + return all_input_ops; + } + + std::vector GetAllOutputOps() { + std::vector all_output_ops; + for (const auto& kv : outputs) { + for (const auto& operand_kv : kv.second.ops) { + all_output_ops.push_back(operand_kv.second); + } + } + return all_output_ops; + } + + // This function will process the aggregated inputs based on different + // strategies like "first", "last", "stack". + std::map GetAggregatedInputs(OpBuilder* builder) { + std::map aggregated_inputs; + for (const auto& kv : inputs) { + Value* op_input = nullptr; + const AggregatedOperand& operand = kv.second; + // Dealiong with "stack" strategy: + // This breaks into two parts: + // 1) If the ops only has one element, we only add a reshape op to expand + // the dim. + // 2) If the ops contain more than one element, we need to append a + // pack_op after the input ops. + if (operand.aggregation == kStrategyStack) { + if (operand.ops.size() == 1) { + // If ops size is 1, it will be simply expanding dimensions at dim 0. 
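A concrete illustration of the single-element "stack" case handled just below (shapes are hypothetical):

    // A lone annotated input of type tensor<4x8xf32> is not packed; it is
    // reshaped to tensor<1x4x8xf32>, so consumers see the same leading
    // aggregation dimension they would get from packing N inputs.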
+          Operation* current_identity_op = operand.ops.begin()->second;
+          Value* input = current_identity_op->getOperand(0);
+          RankedTensorType input_type =
+              input->getType().cast<RankedTensorType>();
+          // The Reshape will be {1, (original_shape)}.
+          SmallVector<int64_t, 4> reshape_op_shape;
+          reshape_op_shape.push_back(1);
+          for (const auto& dim : input_type.getShape()) {
+            reshape_op_shape.push_back(dim);
+          }
+          auto reshape_output_type = builder->getTensorType(
+              reshape_op_shape, input_type.getElementType());
+          Operation* first_use = current_identity_op->getNextNode();
+          builder->setInsertionPoint(first_use);
+          Operation* reshape = builder->create<TFL::ReshapeOp>(
+              first_use->getLoc(), reshape_output_type, input);
+          op_input = reshape->getResult(0);
+
+        } else {
+          // Insert a pack op to pack all the inputs together.
+          std::vector<Value*> pack_input_operands;
+          std::vector<Value*> packed_input_consumers;
+          for (int i = 0, e = operand.ops.size(); i < e; ++i) {
+            pack_input_operands.push_back(operand.ops.at(i)->getOperand(0));
+            packed_input_consumers.push_back(operand.ops.at(i)->getResult(0));
+          }
+          // Find the first op that consumes the last value of the aggregated
+          // inputs.
+          Operation* first_use = *(packed_input_consumers.back()->user_begin());
+          // The packed shape will be {N, (original_shape)}.
+          SmallVector<int64_t, 4> pack_shape;
+          pack_shape.push_back(pack_input_operands.size());
+          RankedTensorType type = operand.ops.at(0)
+                                      ->getResult(0)
+                                      ->getType()
+                                      .cast<RankedTensorType>();
+          for (const auto& dim : type.getShape()) {
+            pack_shape.push_back(dim);
+          }
+          auto pack_input_type =
+              builder->getTensorType(pack_shape, type.getElementType());
+          builder->setInsertionPoint(first_use);
+          Operation* pack_op = builder->create<TFL::PackOp>(
+              first_use->getLoc(), pack_input_type, pack_input_operands,
+              builder->getI32IntegerAttr(pack_input_operands.size()),
+              builder->getI32IntegerAttr(0));
+          op_input = pack_op->getResult(0);
+        }
+      } else if (operand.aggregation == kStrategyLast) {
+        // This handles the "last" strategy: simply take the last input.
+        op_input = operand.ops.at(operand.ops.size() - 1)->getOperand(0);
+      } else {
+        // This handles the "first" strategy and the default: simply take the
+        // first input.
+        op_input = operand.ops.at(0)->getOperand(0);
+      }
+      aggregated_inputs[kv.first] = op_input;
+    }
+    return aggregated_inputs;
+  }
+
+  // For now, we just return the location of the first output; the fused op
+  // will be inserted there.
+  Operation* GetFirstOutputOp() { return outputs.begin()->second.ops.at(0); }
+
+  // Since we have different aggregation strategies, e.g., "first", "last",
+  // "stack", we do not actually aggregate the outputs for the funcOp here.
+  // This function simply computes the aggregated RankedTensorType (shape &
+  // element type) for every output.
+  std::map<int, Type> GetAggregatedOuputTypes(OpBuilder* builder) {
+    std::map<int, Type> aggregated_output_types;
+    for (const auto& kv : outputs) {
+      const AggregatedOperand& operand = kv.second;
+      if (operand.aggregation == kStrategyStack) {
+        const int output_number = operand.ops.size();
+        Value* first_output = operand.ops.at(0)->getOperand(0);
+        RankedTensorType first_output_type =
+            first_output->getType().cast<RankedTensorType>();
+        // The aggregated output shape will be {N, original_shape}.
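A worked example of the aggregated types computed below (shapes are hypothetical):

    // Three "stack"-aggregated outputs of type tensor<4x8xf32> aggregate to
    // tensor<3x4x8xf32>; under "last" or "first" the aggregated type is just
    // the type of the chosen output, tensor<4x8xf32>.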
+        SmallVector<int64_t, 4> shape;
+        shape.push_back(output_number);
+        for (const auto& dim : first_output_type.getShape()) {
+          shape.push_back(dim);
+        }
+        aggregated_output_types[kv.first] =
+            builder->getTensorType(shape, first_output_type.getElementType());
+      } else if (operand.aggregation == kStrategyLast) {
+        Value* last_output =
+            operand.ops.at(operand.ops.size() - 1)->getOperand(0);
+        aggregated_output_types[kv.first] = last_output->getType();
+      } else {
+        Value* first_output = operand.ops.at(0)->getOperand(0);
+        aggregated_output_types[kv.first] = first_output->getType();
+      }
+    }
+    return aggregated_output_types;
+  }
+
+  void AggregateAndRewireOutputs(OpBuilder* builder, Operation* fused_op) {
+    // TODO(renjieliu): Consider getting rid of the ophinted identity nodes
+    // here as well, or just rely on the general path to get rid of the
+    // identity nodes.
+    int output_index = 0;
+    for (const auto& kv : outputs) {
+      const AggregatedOperand& operand = kv.second;
+      // This handles the "stack" strategy. It pushes an unpack_op before all
+      // the outputs and makes all the outputs point to the unpack_op.
+      if (operand.aggregation == kStrategyStack) {
+        // TODO(renjieliu): Revisit whether we need to handle the
+        // operand.ops().size() == 1 case. Insert an unpack op to unpack the
+        // outputs.
+        const int output_number = operand.ops.size();
+        // Find the first output.
+        Operation* first_output = operand.ops.at(0);
+        Location insert_loc = first_output->getLoc();
+        SmallVector<Type, 4> unpack_output_types(
+            output_number, first_output->getOperand(0)->getType());
+
+        builder->setInsertionPoint(first_output);
+        Operation* unpack_op = builder->create<TFL::UnpackOp>(
+            insert_loc, unpack_output_types, fused_op->getResult(output_index),
+            builder->getI32IntegerAttr(output_number),
+            builder->getI32IntegerAttr(0));
+        // For every unpack output, make sure it points to the right value.
+        for (int i = 0; i < output_number; ++i) {
+          Operation* to_be_replaced_op = operand.ops.at(i);
+          to_be_replaced_op->replaceUsesOfWith(to_be_replaced_op->getOperand(0),
+                                               unpack_op->getResult(i));
+        }
+      } else if (operand.aggregation == kStrategyLast) {
+        // This handles the "last" strategy: simply take the last output.
+        Operation* op = operand.ops.at(operand.ops.size() - 1);
+        op->replaceUsesOfWith(op->getOperand(0), fused_op->getResult(kv.first));
+      } else {
+        // This handles the "first" strategy and the default: simply take the
+        // first output.
+        Operation* op = operand.ops.at(0);
+        op->replaceUsesOfWith(op->getOperand(0), fused_op->getResult(kv.first));
+      }
+
+      output_index++;
+    }
+  }
+
+  LogicalResult VerifyOphint() const {
+    if (inputs.empty() || outputs.empty()) return failure();
+    return success();
+  }
+
+  StringRef uuid;
+  StringRef function_name;
+
+ private:
+  // The AggregatedOperand is used to hold one "aggregated operand".
+ // For example, this can be + // { + // aggregation = "stack", + // {0: ident_op1, 1: ident_op2, 2: ident_op3} + // } + struct AggregatedOperand { + StringRef aggregation; + std::map ops; + }; + + std::map inputs; + std::map outputs; +}; + +Operation* BuildFusedFuncOp(StringRef func_name, StringRef fused_func_type, + Operation* insert_before_op, + const std::map& inputs, + const std::map& output_types, + OpBuilder* builder, ModuleOp* module_op) { + SmallVector input_types; + SmallVector input_values; + for (const auto& kv : inputs) { + Value* input = kv.second; + input_types.push_back(input->getType()); + input_values.push_back(input); + } + + SmallVector func_output_types; + for (const auto& kv : output_types) { + func_output_types.push_back(kv.second); + } + + FunctionType function_type = + builder->getFunctionType(/*inputs=*/input_types, + /*results=*/func_output_types); + + SmallVector attrs; + attrs.push_back(builder->getNamedAttr( + kTfLiteFunctionName, builder->getStringAttr(fused_func_type))); + FuncOp func_op = FuncOp::create(insert_before_op->getLoc(), func_name, + function_type, llvm::makeArrayRef(attrs)); + module_op->push_back(func_op); + builder->setInsertionPoint(insert_before_op); + return builder->create(insert_before_op->getLoc(), func_op, + input_values); +} + +llvm::StringMap FindAllOphintNodes(Block* bb) { + llvm::StringMap ophint_composite_ops; + for (auto& op : *bb) { + auto nameAttr = op.getAttrOfType(kTfLiteFunctionName); + if (!nameAttr) continue; + StringRef function_name = nameAttr.getValue(); + auto uuidAttr = op.getAttrOfType(kTfLiteFunctionUUID); + if (!uuidAttr) continue; + StringRef uuid = uuidAttr.getValue(); + auto it = ophint_composite_ops.find(uuid); + if (it == ophint_composite_ops.end()) { + OphintCompositeOp ophint_composite_op(uuid, function_name); + it = ophint_composite_ops.insert({uuid, ophint_composite_op}).first; + } + + // The default aggregation strategy is "NONE". + StringRef aggregation = kStrategyNone; + auto aggregationAttr = + op.getAttrOfType(kTfLiteFunctionAggregate); + if (aggregationAttr != nullptr) aggregation = aggregationAttr.getValue(); + + // The default sort index is 0. + int sortIndex = 0; + auto sortIndexAttr = + op.getAttrOfType(kTfLiteFunctionSortIndex); + if (sortIndexAttr != nullptr) sortIndex = sortIndexAttr.getInt(); + + auto inputIndexAttr = + op.getAttrOfType(kTfLiteFunctionInputIndex); + if (inputIndexAttr != nullptr) { + it->second.AddInput(inputIndexAttr.getInt(), &op, aggregation, sortIndex); + } else { + auto outputIndexAttr = + op.getAttrOfType(kTfLiteFunctionOutputIndex); + it->second.AddOutput(outputIndexAttr.getInt(), &op, aggregation, + sortIndex); + } + } + + return ophint_composite_ops; +} + +llvm::DenseSet BfsForReachableOps(ArrayRef input_ops) { + llvm::DenseSet reachable_ops; + std::queue ops_queue; + for (auto& input_op : input_ops) { + for (Value* value : input_op->getOperands()) { + Operation* op = value->getDefiningOp(); + if (op != nullptr) ops_queue.push(op); + } + } + + while (!ops_queue.empty()) { + Operation* current_op = ops_queue.front(); + ops_queue.pop(); + reachable_ops.insert(current_op); + for (Value* value : current_op->getOperands()) { + Operation* upstream_op = value->getDefiningOp(); + // Not visited, put it into the queue. 
+ if (upstream_op != nullptr && + !llvm::is_contained(reachable_ops, upstream_op)) { + ops_queue.emplace(upstream_op); + } + } + } + + return reachable_ops; +} + +// Convert ophint to stub will remove all ops within the ophint region and +// place a new fused op right before the first op. +LogicalResult ConvertOphintToStub(StringRef stub_name, + OphintCompositeOp ophint_composite_op, + OpBuilder* builder, ModuleOp* module_op) { + // Step 1, find all ops reachable by inputs. + const llvm::DenseSet& reachable_by_inputs = + BfsForReachableOps(ophint_composite_op.GetAllInputOps()); + + // Step 2, find all ops reachable by outputs. + const llvm::DenseSet& reachable_by_outputs = + BfsForReachableOps(ophint_composite_op.GetAllOutputOps()); + + // Step 3, deal with inputs aggregation strategies. + const std::map& aggregated_inputs = + ophint_composite_op.GetAggregatedInputs(builder); + + // Step 4, get aggregated output types. + const std::map& aggregated_output_types = + ophint_composite_op.GetAggregatedOuputTypes(builder); + + // Step 5, create & place the fused op and rewire the inputs. + // Here we use a funcOp to represent the fused op. This "funcOp" will be + // coonverted to other ops (like UnidirectionalSequenceRNNOp) in the + // legalization phase. + Operation* inserted_before_op = ophint_composite_op.GetFirstOutputOp(); + Operation* fused_op = BuildFusedFuncOp( + stub_name, ophint_composite_op.function_name, inserted_before_op, + aggregated_inputs, aggregated_output_types, builder, module_op); + + for (const auto& kv : aggregated_inputs) { + Operation* op = kv.second->getDefiningOp(); + if (op == nullptr) return failure(); + op->moveBefore(fused_op); + } + + // Step 6, deal with outputs aggregation strategies and rewire the outputs. + ophint_composite_op.AggregateAndRewireOutputs(builder, fused_op); + + // Step 7, remove all the removable ops where + // (reachable_by_outputs - reachable_by_inputs) as removable and the rest + // ops are not removable. + auto removeRemovableOps = [&](Operation* op) { + if (!llvm::is_contained(reachable_by_inputs, op) && + llvm::is_contained(reachable_by_outputs, op)) { + op->dropAllDefinedValueUses(); + op->dropAllReferences(); + op->erase(); + } + }; + + builder->getBlock()->walk(removeRemovableOps); + return success(); +} + +struct ExtractOphintPass : public ModulePass { + void runOnModule() override; + void Verify(); + + private: + int ophint_composite_ops_count = 0; +}; + +// TODO(renjieliu): Current ophint extraction does not support inputs/outputs +// cross functions, we need to do that. +void ExtractOphintPass::runOnModule() { + ModuleOp module = getModule(); + for (auto function : module.getOps()) { + // Process block by block. + for (auto& bb : function.getBody()) { + // Find ophints. + const llvm::StringMap& ophint_composite_ops = + FindAllOphintNodes(&bb); + if (ophint_composite_ops.empty()) continue; + + // Verify: Make sure all ophint_composite_ops are valid. + for (const auto& kv : ophint_composite_ops) { + if (failed(kv.getValue().VerifyOphint())) { + module.emitError() + << "Found malformed ophint regions: missing inputs or outputs."; + return signalPassFailure(); + } + } + + ophint_composite_ops_count = ophint_composite_ops.size(); + + // Convert. 
+ OpBuilder builder(&bb); + for (const auto& kv : ophint_composite_ops) { + if (failed(ConvertOphintToStub(kv.getKey(), kv.getValue(), &builder, + &module))) { + module.emitError() + << "Convert ophint failed, malformed inputs or outputs."; + return signalPassFailure(); + } + } + } + } +} + +void ExtractOphintPass::Verify() { + ModuleOp module = getModule(); + int ophint_func_op_count = 0; + for (FuncOp func : getModule().getOps()) { + for (const NamedAttribute attr : func.getAttrs()) { + if (attr.first == kTfLiteFunctionName) { + ophint_func_op_count++; + if (func.getNumArguments() == 0) { + module.emitError() << "Ophint function has no inputs."; + return signalPassFailure(); + } + if (func.getType().getNumResults() == 0) { + module.emitError() << "Ophint function has no outputs."; + return signalPassFailure(); + } + } + } + } + if (ophint_func_op_count != ophint_composite_ops_count) { + module.emitError() + << "Ophint converted functions do not match ophint regions founded."; + return signalPassFailure(); + } +} + +} // namespace + +/// Creates an instance of the TensorFlow Lite dialect ExtractOphintPass +/// pass. +std::unique_ptr CreateExtractOphintPass() { + return std::make_unique(); +} + +static PassRegistration pass( + "tfl-extract-ophint", "Extract Ophint for TfLite dialect."); + +} // namespace TFL +} // namespace mlir diff --git a/tensorflow/compiler/mlir/lite/transforms/legalize_ophint_func_op.cc b/tensorflow/compiler/mlir/lite/transforms/legalize_ophint_func_op.cc new file mode 100644 index 00000000000..2ea5dba3e17 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/transforms/legalize_ophint_func_op.cc @@ -0,0 +1,209 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "llvm/ADT/StringMap.h" +#include "mlir/Dialect/StandardOps/Ops.h" // TF:local_config_mlir +#include "mlir/IR/Attributes.h" // TF:local_config_mlir +#include "mlir/IR/Block.h" // TF:local_config_mlir +#include "mlir/IR/Builders.h" // TF:local_config_mlir +#include "mlir/IR/Function.h" // TF:local_config_mlir +#include "mlir/IR/MLIRContext.h" // TF:local_config_mlir +#include "mlir/IR/Module.h" // TF:local_config_mlir +#include "mlir/IR/Operation.h" // TF:local_config_mlir +#include "mlir/IR/OperationSupport.h" // TF:local_config_mlir +#include "mlir/IR/StandardTypes.h" // TF:local_config_mlir +#include "mlir/IR/SymbolTable.h" // TF:local_config_mlir +#include "mlir/IR/Types.h" // TF:local_config_mlir +#include "mlir/IR/Value.h" // TF:local_config_mlir +#include "mlir/Pass/Pass.h" // TF:local_config_mlir +#include "mlir/Pass/PassRegistry.h" // TF:local_config_mlir +#include "mlir/Support/LLVM.h" // TF:local_config_mlir +#include "mlir/Support/LogicalResult.h" // TF:local_config_mlir +#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" + +namespace mlir { +namespace TFL { +namespace { + +constexpr char kTfLiteFunctionName[] = "_tflite_function_name"; +constexpr char kUnidirectionalSequenceRnn[] = "UnidirectionalSequenceRnn"; + +// This pass is used for converting to TFLite composite op like +// UnidirectionalSequenceRNN, UnidirectionalSequenceLSTM or SVDF Op. Currently, +// this pass is only for ophint converted function op only. See below diagram: +// +// InputOp1 InputOp2 ... +// \ / +// \ / +// call funcOp (say UnidirectionalSequenceRNN) +// | +// | +// OutputOp1 +// +// funcOp() { '_tflite_function_name' = 'UnidirectionalSequenceRNN'} +// +// || +// || +// \ / +// +// InputOp1 InputOp2 ... +// \ / +// \ / +// tfl.UnidirectionalSequenceRNN +// | +// | +// OutputOp1 +struct LegalizeOphintFuncOpPass : public ModulePass { + void runOnModule() override; +}; + +llvm::StringMap FindCompositeFuncOps(ModuleOp module) { + llvm::StringMap composite_func_ops; + for (FuncOp func : module.getOps()) { + if (func.getAttr(kTfLiteFunctionName)) + composite_func_ops[func.getName()] = func; + } + return composite_func_ops; +} + +LogicalResult BuildUnidirectionalSequenceRnnOp(FuncOp composite_func_op, + CallOp* call_op, + OpBuilder* builder, + Operation** fused_op) { + // UnidirectionalSequenceRnn takes exactly 5 inputs. + if (composite_func_op.getNumArguments() != 5) return failure(); + if (call_op->getNumOperands() != 5) return failure(); + // UnidirectionalSequenceRnn has exactly 1 input. + if (call_op->getNumResults() != 1) return failure(); + + // Inputs is indexed at 0. + Value* input = call_op->getOperand(0); + // Input_weight is indexed at 1. + Value* weight = call_op->getOperand(1); + // Recurrent_weight is indexed at 2. + Value* recurrent_weight = call_op->getOperand(2); + // Bias is indexed at 3. + Value* bias = call_op->getOperand(3); + // Hidden_state is indexed at 4. + Value* hidden_state = call_op->getOperand(4); + + // Build Output. + auto output_type = call_op->getResult(0)->getType(); + + // Currently, ophinted RNN only supports time_major = True. + const bool time_major = true; + // Activation will always be TanH. 
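Schematically, the rewrite built below replaces the ophint call with a single fused op (textual form abbreviated; operand names are hypothetical):

    // %out = tfl.unidirectional_sequence_rnn(%input, %weight, %recurrent_weight,
    //          %bias, %hidden_state)
    //          {fused_activation_function = "TANH", time_major = true}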
+ StringAttr fused_activation_function = builder->getStringAttr("TANH"); + + builder->setInsertionPoint(call_op->getOperation()); + *fused_op = builder->create( + call_op->getLoc(), output_type, input, weight, recurrent_weight, bias, + hidden_state, builder->getBoolAttr(time_major), + fused_activation_function); + return success(); +} + +LogicalResult ConvertTfLiteFusedOpIfAvaiable(StringRef func_name, + FuncOp composite_func_op, + CallOp* call_op, + OpBuilder* builder) { + Operation* fused_op = nullptr; + if (func_name == kUnidirectionalSequenceRnn) { + // TODO(renjieliu): Validate the func op inputs. + LogicalResult build_fused_op_result = BuildUnidirectionalSequenceRnnOp( + composite_func_op, call_op, builder, &fused_op); + if (failed(build_fused_op_result)) return build_fused_op_result; + } else { // If we support more fused op, we should add the conversion here. + return failure(); + } + + call_op->replaceAllUsesWith(fused_op); + + // Delete call op. + Operation* call = call_op->getOperation(); + call->dropAllDefinedValueUses(); + call->dropAllReferences(); + call->erase(); + return success(); +} + +LogicalResult ConvertCallOps(llvm::StringMap* composite_func_ops, + ModuleOp* module) { + for (auto func : module->getOps()) { + // Ideally it will be much simpler if we can just use walk, but we also + // want to early return if encounter errors. :( + OpBuilder builder(func.getBody()); + // The call_op replacement within this loop works like an in-place + // replacement, so it should be safe to do so. + for (auto call_op : + llvm::make_early_inc_range(builder.getBlock()->getOps())) { + auto it = composite_func_ops->find(call_op.getCallee()); + if (it == composite_func_ops->end()) return failure(); + + // Replace the call op with TfLite fused op. + // Currently it's only handled case by case, but ideally it would be + // much better if we can do this automatically. + FuncOp composite_func_op = it->second; + StringRef func_name = composite_func_op.getAttr(kTfLiteFunctionName) + .cast() + .getValue(); + if (failed(ConvertTfLiteFusedOpIfAvaiable(func_name, composite_func_op, + &call_op, &builder))) + return failure(); + + composite_func_ops->erase(it); + // Delete func op. + Operation* func = composite_func_op.getOperation(); + func->erase(); + } + } + return success(); +} + +void LegalizeOphintFuncOpPass::runOnModule() { + ModuleOp module = getModule(); + // Find all composite funcs, then for every call op inside every func op + // within the module, we go ahead and replace the callop with the tflite + // corresponding op and destroy the func op. This two-phase processing is + // intended: + // + // Every func op is meant to be used exactly once. + // Instead of finding the composite func then loop through the graph and + // convert the call op immediately, we break finding & converting into two + // phases. This changes the complexity from O(op_in_module * + // function_in_module * attr_in_func) to O(op_in_module) * O(map_look_up) + + // O(function_in_module * attr_in_func). O(op_in_module) is the dominant + // factor here and map look up should be very cheap. + llvm::StringMap composite_func_ops = FindCompositeFuncOps(module); + if (composite_func_ops.empty()) return; + if (failed(ConvertCallOps(&composite_func_ops, &module))) { + module.emitError() << "Legalize ophint: ConvertCallOps failed."; + return signalPassFailure(); + } +} + +} // namespace + +/// Creates an instance of the TensorFlow Lite dialect LegalizeOphintFuncOpPass +/// pass. 
+std::unique_ptr CreateLegalizeOphintFuncOpPass() { + return std::make_unique(); +} + +static PassRegistration pass( + "tfl-legalize-ophint-func-op", "Convert composite op for TfLite dialect."); + +} // namespace TFL +} // namespace mlir diff --git a/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td b/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td index 90ff6713874..94efc7d2719 100644 --- a/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td @@ -16,7 +16,7 @@ limitations under the License. // TFLite legalization patterns include "mlir/IR/OpBase.td" -include "mlir/StandardOps/Ops.td" +include "mlir/Dialect/StandardOps/Ops.td" include "tensorflow/compiler/mlir/lite/ir/tfl_ops.td" include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td" @@ -29,7 +29,6 @@ class ExtractI32At : NativeCodeCall< "$_builder.getI32IntegerAttr($_self.cast().getValue()[" # i # "].cast().getInt())">; - // Merge the two Attributes to a ArrayAttr; def Merge2AttrsToArray : NativeCodeCall<"$_builder.getArrayAttr({$0, $1})">; @@ -80,6 +79,7 @@ def : Pat<(TF_AvgPoolOp $value, /*stride_w=*/ExtractI32At<2>:$strides, /*fused_activation_function=*/TFL_AF_None)>; +def : Pat<(TF_ArgMaxOp $input, $dim), (TFL_ArgMaxOp $input, $dim)>; def : Pat<(TF_ArgMinOp $input, $dim), (TFL_ArgMinOp $input, $dim)>; def : Pat<(TF_CeilOp $arg), (TFL_CeilOp $arg)>; @@ -134,10 +134,14 @@ def : Pat<(TF_ReverseSequenceOp $input, $seq_lengths, $seq_dim, $batch_dim), (TFL_ReverseSequenceOp $input, $seq_lengths, (convertIntAttrTo32Bit $seq_dim), (convertIntAttrTo32Bit $batch_dim))>; +def : Pat<(TF_RoundOp $arg), (TFL_RoundOp $arg)>; def : Pat<(TF_RsqrtOp $arg), (TFL_RsqrtOp $arg)>; +def : Pat<(TF_SqrtOp $arg), (TFL_SqrtOp $arg)>; +def : Pat<(TF_SquareOp $arg), (TFL_SquareOp $arg)>; // TODO(jpienaar): this is not true for all selects, TF's select supports rank 0 // condition def : Pat<(TF_SelectOp $cond, $x, $y), (TFL_SelectOp $cond, $x, $y)>; +def : Pat<(TF_SelectV2Op $cond, $x, $y), (TFL_SelectOp $cond, $x, $y)>; def : Pat<(TF_ShapeOp $arg), (TFL_ShapeOp $arg)>; def : Pat<(TF_SigmoidOp $arg), (TFL_LogisticOp $arg)>; def : Pat<(TF_SinOp F32Tensor:$arg), (TFL_SinOp $arg)>; @@ -146,6 +150,7 @@ def : Pat<(TF_SoftmaxOp $arg), (TFL_SoftmaxOp $arg, ConstF32Attr<"1.0">)>; def : Pat<(TF_SqueezeOp $arg, $squeeze_dims), (TFL_SqueezeOp $arg, $squeeze_dims)>; def : Pat<(TF_TanhOp $arg), (TFL_TanhOp $arg)>; def : Pat<(TF_TransposeOp $arg, $perm), (TFL_TransposeOp $arg, $perm)>; +def : Pat<(TF_WhereOp $arg), (TFL_WhereOp $arg)>; def : Pat<(TF_ZerosLikeOp $arg), (TFL_ZerosLikeOp $arg)>; // The following two rules can both match an tf.Placeholder.input node with @@ -250,6 +255,8 @@ def : Pat<(TF_EqualOp $arg0, $arg1), (TFL_EqualOp $arg0, $arg1)>; def : Pat<(TF_PadOp $arg0, $arg1), (TFL_PadOp $arg0, $arg1)>; +def : Pat<(TF_TileOp $arg0, $arg1), (TFL_TileOp $arg0, $arg1)>; + def : Pat<(TF_PadV2Op $arg0, $arg1, $cst), (TFL_PadV2Op $arg0, $arg1, $cst)>; def : Pat<(TF_MeanOp $arg0, $arg1, BoolAttr:$arg2), (TFL_MeanOp $arg0, $arg1, $arg2)>; @@ -265,16 +272,29 @@ def : Pat<(TF_MaxOp $arg0, $arg1, BoolAttr:$arg2), (TFL_ReduceMaxOp $arg0, $arg1 def : Pat<(TF_ProdOp $arg0, $arg1, BoolAttr:$arg2), (TFL_ReduceProdOp $arg0, $arg1, $arg2)>; +def : Pat<(TF_AnyOp $input, $reduction_indices, $keep_dims), + (TFL_ReduceAnyOp $input, $reduction_indices, $keep_dims)>; + def : Pat<(TF_CastOp $arg0, BoolAttr:$arg1), (TFL_CastOp $arg0)>; def : Pat<(TF_BatchToSpaceNDOp $input, $block_shape, 
$crops), (TFL_BatchToSpaceNdOp $input, $block_shape, $crops)>; def : Pat<(TF_SpaceToBatchNDOp $input, $block_shape, $paddings), (TFL_SpaceToBatchNdOp $input, $block_shape, $paddings)>; +def : Pat<(TF_SpaceToDepthOp $input, $block_size, IsDataFormatNHWC:$data_format), + (TFL_SpaceToDepthOp $input, (convertIntAttrTo32Bit $block_size))>; + +def : Pat<(TF_DepthToSpaceOp $input, $block_size, IsDataFormatNHWC:$data_format), + (TFL_DepthToSpaceOp $input, (convertIntAttrTo32Bit $block_size))>; + def : Pat<(TF_ResizeBilinearOp $images, $size, $align_corners, ConstBoolAttrFalse:$half_pixel_centers), (TFL_ResizeBilinearOp $images, $size, $align_corners)>; +def : Pat<(TF_ResizeNearestNeighborOp $images, $size, $align_corners, ConstBoolAttrFalse:$half_pixel_centers), (TFL_ResizeNearestNeighborOp $images, $size, $align_corners)>; def : Pat<(TF_MirrorPadOp $arg0, $arg1, $cst), (TFL_MirrorPadOp $arg0, $arg1, $cst)>; +def : Pat<(TF_SparseToDenseOp $sparse_indices, $output_shape, $sparse_values, $default_value, $validate_indices), + (TFL_SparseToDenseOp $sparse_indices, $output_shape, $sparse_values, $default_value)>; + def : Pat< (TF_StridedSliceOp $input, $begin, $end, $strides, $begin_mask, $end_mask, $ellipsis_mask, $new_axis_mask, $shrink_axis_mask), (TFL_StridedSliceOp $input, $begin, $end, $strides, @@ -283,4 +303,7 @@ def : Pat< def : Pat<(TF_UniqueOp $arg0),(TFL_UniqueOp $arg0)>; +def : Pat<(TF_FloorModOp $arg0, $arg1), (TFL_FloorModOp $arg0, $arg1)>; +def : Pat<(TF_ExpOp $arg0), (TFL_ExpOp $arg0)>; + def : Pat<(TF_LRNOp $arg0, $radius, F32Attr:$bias, F32Attr:$alpha, F32Attr:$beta), (TFL_LocalResponseNormalizationOp $arg0, (convertIntAttrTo32Bit $radius), $bias, $alpha, $beta)>; diff --git a/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc index faf80f3acb8..b20af2b4215 100644 --- a/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc +++ b/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc @@ -33,9 +33,9 @@ limitations under the License. #include "mlir/Support/Functional.h" // TF:local_config_mlir #include "mlir/Support/LLVM.h" // TF:local_config_mlir #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" #include "tensorflow/compiler/mlir/lite/utils/attribute_utils.h" -#include "tensorflow/compiler/mlir/lite/utils/quantization_utils.h" #include "tensorflow/compiler/mlir/lite/utils/validators.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" @@ -205,16 +205,18 @@ void LegalizeTF::runOnFunction() { // Add the generated patterns to the list. populateWithGenerated(ctx, &patterns); - RewriteListBuilder::build(patterns, ctx); - applyPatternsGreedily(func, std::move(patterns)); + patterns.insert(ctx); + applyPatternsGreedily(func, patterns); } } // namespace // Creates an instance of the TensorFlow Lite dialect LegalizeTF pass. 
-FunctionPassBase* CreateLegalizeTFPass() { return new LegalizeTF(); } +std::unique_ptr CreateLegalizeTFPass() { + return std::make_unique(); +} static PassRegistration pass( "tfl-legalize-tf", "Legalize from TensorFlow to TensorFlow Lite dialect"); diff --git a/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc b/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc index 44ff796b7cc..716c8216433 100644 --- a/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc +++ b/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc @@ -28,6 +28,7 @@ limitations under the License. #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "mlir/Analysis/LoopAnalysis.h" // TF:local_config_mlir +#include "mlir/Dialect/StandardOps/Ops.h" // TF:local_config_mlir #include "mlir/IR/Attributes.h" // TF:local_config_mlir #include "mlir/IR/Block.h" // TF:local_config_mlir #include "mlir/IR/MLIRContext.h" // TF:local_config_mlir @@ -35,15 +36,14 @@ limitations under the License. #include "mlir/IR/OperationSupport.h" // TF:local_config_mlir #include "mlir/IR/PatternMatch.h" // TF:local_config_mlir #include "mlir/IR/StandardTypes.h" // TF:local_config_mlir +#include "mlir/IR/TypeUtilities.h" // TF:local_config_mlir #include "mlir/IR/Types.h" // TF:local_config_mlir #include "mlir/IR/Value.h" // TF:local_config_mlir #include "mlir/Pass/Pass.h" // TF:local_config_mlir #include "mlir/Pass/PassRegistry.h" // TF:local_config_mlir -#include "mlir/StandardOps/Ops.h" // TF:local_config_mlir #include "mlir/Support/Functional.h" // TF:local_config_mlir #include "mlir/Support/LLVM.h" // TF:local_config_mlir #include "mlir/Support/LogicalResult.h" // TF:local_config_mlir -#include "mlir/Support/TypeUtilities.h" // TF:local_config_mlir #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" #include "tensorflow/compiler/mlir/lite/utils/attribute_utils.h" @@ -82,7 +82,7 @@ struct LowerStaticTensorListPass // Changes the function type of `cond_func` and `body_func`, and the result // type of the `WhileOp`. - LogicalResult UpdateWhileFunctionType(TF::WhileOp *while_op); + LogicalResult UpdateWhileFunctionType(TF::WhileOp op); }; Value *CreateI32SplatConst(Operation *op, PatternRewriter *rewriter, @@ -100,10 +100,10 @@ Value *CreateI32SplatTensor(Operation *op, PatternRewriter *rewriter, shape_tensor, scalar_val); } -struct ConvertTFTensorListSetItem : public RewritePattern { +struct ConvertTFTensorListSetItem + : public OpRewritePattern { explicit ConvertTFTensorListSetItem(MLIRContext *context) - : RewritePattern(TF::TensorListSetItemOp::getOperationName(), 1, - context) {} + : OpRewritePattern(context, 1) {} // This function rewrites the original op into a series of slice and concat op // to produce the same result. It first slices the first `$index` rows. 
Then // expands the dimension of the `$item`, followed by another slice of the @@ -116,23 +116,21 @@ struct ConvertTFTensorListSetItem : public RewritePattern { // (Slice $input, [0, 0, ...], (Concat (ExpandDims $index, expand_dim = // 0), [-1, -1, ...])), (ExpandDims $item, expand_dim = 0), (Slice // $input, [$index + 1, 0, 0, ...], [-1, -1, ...]))>; - PatternMatchResult matchAndRewrite(Operation *op, + PatternMatchResult matchAndRewrite(TF::TensorListSetItemOp op, PatternRewriter &rewriter) const override { - TF::TensorListSetItemOp tf_op = cast(op); - - auto input = tf_op.input_handle(); + auto input = op.input_handle(); auto shape_dtype = rewriter.getIntegerType(32); auto input_rank = rewriter.create( - op->getLoc(), rewriter.getTensorType({}, shape_dtype), input); - auto item = tf_op.item(); + op.getLoc(), rewriter.getTensorType({}, shape_dtype), input); + auto item = op.item(); auto item_rank = rewriter.create( - op->getLoc(), rewriter.getTensorType({}, shape_dtype), item); + op.getLoc(), rewriter.getTensorType({}, shape_dtype), item); // Prepare the start position for the first slice op, which is [0, 0, .., // 0]. auto scalar_zero = CreateI32SplatConst(op, &rewriter, {}, 0); auto position_shape = rewriter.create( - op->getLoc(), rewriter.getTensorType({1}, shape_dtype), input_rank, + op.getLoc(), rewriter.getTensorType({1}, shape_dtype), input_rank, scalar_zero); // Fill all 0s into the first position tensor. auto first_start_position = @@ -141,33 +139,33 @@ struct ConvertTFTensorListSetItem : public RewritePattern { // Prepare the start position for the second slice op, which is // [index + 1, 0, 0 .. 0]. // Calculate the first dimension, which is index + 1. - auto index = tf_op.index(); + auto index = op.index(); auto vector_type = rewriter.getTensorType({1}, shape_dtype); auto begin = rewriter.create( - op->getLoc(), rewriter.getTensorType(shape_dtype), index, + op.getLoc(), rewriter.getTensorType(shape_dtype), index, CreateI32SplatConst(op, &rewriter, {1}, 1)); // Followed by the first dimension `begin`, are `item_rank` of 0s. auto item_position_shape = rewriter.create( - op->getLoc(), rewriter.getTensorType({1}, shape_dtype), item_rank, + op.getLoc(), rewriter.getTensorType({1}, shape_dtype), item_rank, scalar_zero); auto partial_second_start_position = CreateI32SplatTensor(op, &rewriter, item_position_shape, 0); auto position_type = first_start_position->getType(); // Concatenate `begin` with the remaining 0s. auto second_start_position = rewriter.create( - op->getLoc(), position_type, scalar_zero, + op.getLoc(), position_type, scalar_zero, ArrayRef({begin, partial_second_start_position}), rewriter.getI64IntegerAttr(2)); // Create the size parameter for the first slice op, which is [index, -1, // -1, .., -1]. 
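For instance, with a hypothetical rank-3 list tensor and $index = 2, the decomposition assembled by the next few statements amounts to:

    // slice1 = Slice(input, begin = [0, 0, 0], size = [2, -1, -1])   // rows before $index
    // slice2 = Slice(input, begin = [3, 0, 0], size = [-1, -1, -1])  // rows after $index
    // result = Concat(axis = 0, slice1, ExpandDims(item, 0), slice2)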
auto size1_leading_dim = rewriter.create( - op->getLoc(), vector_type, index, scalar_zero); + op.getLoc(), vector_type, index, scalar_zero); auto partial_size1 = CreateI32SplatTensor(op, &rewriter, item_position_shape, -1); auto size1 = rewriter.create( - op->getLoc(), position_type, scalar_zero, + op.getLoc(), position_type, scalar_zero, ArrayRef({size1_leading_dim, partial_size1}), rewriter.getI64IntegerAttr(2)); @@ -179,14 +177,14 @@ struct ConvertTFTensorListSetItem : public RewritePattern { auto element_type = input->getType().cast().getElementType(); auto unranked_tensor = rewriter.getTensorType(element_type); auto slice1 = rewriter.create( - op->getLoc(), unranked_tensor, input, first_start_position, size1); + op.getLoc(), unranked_tensor, input, first_start_position, size1); auto slice2 = rewriter.create( - op->getLoc(), unranked_tensor, input, second_start_position, size2); + op.getLoc(), unranked_tensor, input, second_start_position, size2); // Expand the dimension of item so that it will have the same rank with // input. auto expanded_item = rewriter.create( - op->getLoc(), unranked_tensor, item, scalar_zero); + op.getLoc(), unranked_tensor, item, scalar_zero); // Concatenate three parts together to generate the final result. rewriter.replaceOpWithNewOp( @@ -198,52 +196,54 @@ struct ConvertTFTensorListSetItem : public RewritePattern { } }; -struct ConvertTFTensorListReserve : public RewritePattern { - explicit ConvertTFTensorListReserve(MLIRContext *context) - : RewritePattern(TF::TensorListReserveOp::getOperationName(), 1, - context) {} +// Rewrites op of the template type initializing a TensorList with a list of ops +// to generate an equivalent raw tensor. Derived classes are required to +// override GetNumElements method. +template +struct ConvertTFTensorListInitOp : public OpRewritePattern { + explicit ConvertTFTensorListInitOp(MLIRContext *context) + : OpRewritePattern(context, 1) {} + + // Create and return a 1-d tensor with exactly one element equal to the number + // of list elements to initialize the output tensor list with. + virtual Value *GetNumElements(OpT op, PatternRewriter *rewriter) const = 0; // Rewrites the original op into `tf.fill`. The result tensor shape is // [num_element, element_shape]. All the values in the result tensor will be // initialized to 0. - PatternMatchResult matchAndRewrite(Operation *op, + PatternMatchResult matchAndRewrite(OpT op, PatternRewriter &rewriter) const override { - TF::TensorListReserveOp tf_op = cast(op); - - auto element_shape = tf_op.element_shape(); + auto element_shape = op.element_shape(); auto shape_dtype = getElementTypeOrSelf(element_shape->getType()); - auto num_elements = tf_op.num_elements(); - Type element_dtype = tf_op.element_dtype(); + Type element_dtype = op.element_dtype(); int64_t result_rank = -1; // -1 means unknown result rank. Type result_type = rewriter.getTensorType(element_dtype); - if (auto element_type = tf_op.element_type().dyn_cast()) { + if (auto element_type = + op.element_type().template dyn_cast()) { result_rank = element_type.getRank() + 1; // If element type is ranked, then result type will have unknown leading // dimension and element shape for the following dimensions. // - // Note: leading dim is not inferred here even if num_elements input is a - // constant. + // Note: leading dim is not inferred here even when it is a constant. 
SmallVector result_shape = {-1}; ArrayRef shape = element_type.getShape(); result_shape.append(shape.begin(), shape.end()); result_type = rewriter.getTensorType(result_shape, element_dtype); } - // The output shape of the result tensor should be [num_elements + - // element_shape]. - auto scalar_zero = CreateI32SplatConst(op, &rewriter, {}, 0); - auto leading_dim = rewriter.create( - op->getLoc(), rewriter.getTensorType({1}, shape_dtype), num_elements, - scalar_zero); - // Create a 1-D RankedTensorType for result's shape. Number of elements in // it is equal to the rank of the result, if known. Otherwise, the number of // elements are unknown and represented with -1. In both cases, we can // specify dimension using rank of the result. Type shape_type = rewriter.getTensorType({result_rank}, shape_dtype); + + // Add number of elements as the prefix to the element shape to get shape of + // the output tensor. + auto leading_dim = GetNumElements(op, &rewriter); + auto scalar_zero = CreateI32SplatConst(op, &rewriter, {}, 0); auto list_shape = rewriter.create( - op->getLoc(), shape_type, scalar_zero, + op.getLoc(), shape_type, scalar_zero, ArrayRef({leading_dim, element_shape}), rewriter.getI64IntegerAttr(2)); @@ -251,9 +251,92 @@ struct ConvertTFTensorListReserve : public RewritePattern { // as specified by element_dtype. auto zero_type = rewriter.getTensorType({}, element_dtype); auto zero_attr = rewriter.getZeroAttr(zero_type); - auto zero = rewriter.create(op->getLoc(), zero_type, zero_attr); + auto zero = rewriter.create(op.getLoc(), zero_type, zero_attr); rewriter.replaceOpWithNewOp(op, result_type, list_shape, zero); + return Pattern::matchSuccess(); + } +}; + +struct ConvertTFTensorListReserve + : public ConvertTFTensorListInitOp { + explicit ConvertTFTensorListReserve(MLIRContext *context) + : ConvertTFTensorListInitOp(context) {} + + Value *GetNumElements(TF::TensorListReserveOp op, + PatternRewriter *rewriter) const override { + auto scalar_zero = CreateI32SplatConst(op, rewriter, {}, 0); + auto shape_dtype = getElementTypeOrSelf(op.element_shape()->getType()); + return rewriter->create( + op.getLoc(), rewriter->getTensorType({1}, shape_dtype), + op.num_elements(), scalar_zero); + } +}; + +// TODO(hinsu): Replace with declarative patterns once the RewriterGen infra +// supports patterns involving variadic operand ops. +// +// Note that we ignore the second operand `max_num_elements` as we don't have +// any restrictions on the number of elements we can support. So this may +// have a different behavior compared to TensorFlow in case of errors. +struct ConvertTFEmptyTensorList + : public ConvertTFTensorListInitOp { + explicit ConvertTFEmptyTensorList(MLIRContext *context) + : ConvertTFTensorListInitOp(context) {} + + Value *GetNumElements(TF::EmptyTensorListOp op, + PatternRewriter *rewriter) const override { + return CreateI32SplatConst(op, rewriter, {1}, 0); + } +}; + +struct ConvertTFTensorListPushBack : public RewritePattern { + explicit ConvertTFTensorListPushBack(MLIRContext *context) + : RewritePattern(TF::TensorListPushBackOp::getOperationName(), 1, + context) {} + + PatternMatchResult matchAndRewrite(Operation *op, + PatternRewriter &rewriter) const override { + TF::TensorListPushBackOp push_back_op = cast(op); + Value *item = push_back_op.tensor(); + Type dtype = getElementTypeOrSelf(*item); + + // Returns a new type by prepending the specified dimension to the shape of + // the given type if it is a ranked type. 
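A few examples of what the helper defined next produces (types are hypothetical):

    // with_leading_dim(1, tensor<3x4xf32>)   -> tensor<1x3x4xf32>  (expanded item)
    // with_leading_dim(-1, tensor<3x4xf32>)  -> tensor<?x3x4xf32>  (unknown list length)
    // with_leading_dim(d, <unranked type>)   -> unranked tensor of the item's element type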
+ auto with_leading_dim = [&](int64_t dim, Type type) -> Type { + if (RankedTensorType ty = type.dyn_cast()) { + llvm::SmallVector shape = {dim}; + shape.append(ty.getShape().begin(), ty.getShape().end()); + return rewriter.getTensorType(shape, dtype); + } + + return rewriter.getTensorType(dtype); + }; + + // Expand the shape of the item so that it will have rank same as the input + // tensor and it is compatible for the Concat Op. + Type expanded_item_type = with_leading_dim(1, item->getType()); + auto scalar_zero = CreateI32SplatConst(op, &rewriter, {}, 0); + auto expanded_item = rewriter.create( + op->getLoc(), expanded_item_type, item, scalar_zero); + + // If the variant type in the output handle has item shape available, use it + // to derive the output shape by setting unknown leading dimension. + // Otherwise, result type will be of unranked type. + Type handle_type = push_back_op.output_handle()->getType(); + TF::VariantType handle_dtype = + getElementTypeOrSelf(handle_type).cast(); + Type result_type = rewriter.getTensorType(dtype); + if (!handle_dtype.getSubtypes().empty()) { + result_type = with_leading_dim(-1, handle_dtype.getSubtypes()[0]); + } + + // Concatenate tensor stored in the input handle with the expanded item to + // get a tensor equivalent to the TensorList generated by this op. + rewriter.replaceOpWithNewOp( + op, result_type, scalar_zero, + ArrayRef({push_back_op.input_handle(), expanded_item}), + rewriter.getI64IntegerAttr(2)); return matchSuccess(); } }; @@ -267,17 +350,17 @@ namespace { } // namespace TFL LogicalResult LowerStaticTensorListPass::UpdateWhileFunctionType( - TF::WhileOp *while_op) { + TF::WhileOp op) { SmallVector unranked_argument_types; - for (const auto &operand : while_op->getOperands()) { + for (const auto &operand : op.getOperands()) { unranked_argument_types.push_back( UnrankedTensorType::get(getElementTypeOrSelf(operand->getType()))); } auto *context = &getContext(); auto module = getModule(); - FuncOp cond_func = module.lookupSymbol(while_op->getCond()); - FuncOp body_func = module.lookupSymbol(while_op->getBody()); + FuncOp cond_func = module.lookupSymbol(op.cond()); + FuncOp body_func = module.lookupSymbol(op.body()); if (cond_func) { // Change `cond_func`'s argument types to `unranked_argument_types`. @@ -313,9 +396,9 @@ LogicalResult LowerStaticTensorListPass::UpdateWhileFunctionType( } } - for (int i = 0; i < while_op->getNumOperands(); ++i) { - auto operand = while_op->getOperand(i); - auto result = while_op->getResult(i); + for (int i = 0; i < op.getNumOperands(); ++i) { + auto operand = op.getOperand(i); + auto result = op.getResult(i); if (getElementTypeOrSelf(result->getType()).isa()) { // If we notice the result type is a DT_VARIANT, we change the // corresponding result type to unranked tensor type. 
@@ -357,7 +440,11 @@ LogicalResult LowerStaticTensorListPass::RewriteFunction( } auto c = ConvertTFTensorListReserve(context); rewriter->setInsertionPoint(op); - c.matchAndRewrite(op, *rewriter); + c.matchAndRewrite(tf_op, *rewriter); + } else if (auto tf_op = llvm::dyn_cast(op)) { + auto c = ConvertTFEmptyTensorList(context); + rewriter->setInsertionPoint(op); + c.matchAndRewrite(tf_op, *rewriter); } else if (auto tf_op = llvm::dyn_cast(op)) { auto c = TFL::ConvertTFTensorListGetItem(context); rewriter->setInsertionPoint(op); @@ -365,14 +452,18 @@ LogicalResult LowerStaticTensorListPass::RewriteFunction( } else if (auto tf_op = llvm::dyn_cast(op)) { auto c = ConvertTFTensorListSetItem(context); rewriter->setInsertionPoint(op); - c.matchAndRewrite(op, *rewriter); + c.matchAndRewrite(tf_op, *rewriter); } else if (auto tf_op = llvm::dyn_cast(op)) { auto c = TFL::ConvertTFTensorListStack(context); rewriter->setInsertionPoint(op); c.matchAndRewrite(op, *rewriter); + } else if (auto tf_op = llvm::dyn_cast(op)) { + auto c = ConvertTFTensorListPushBack(context); + rewriter->setInsertionPoint(op); + c.matchAndRewrite(op, *rewriter); } else if (auto tf_op = llvm::dyn_cast(op)) { if (op->getAttr("T")) op->removeAttr(Identifier::get("T", context)); - UpdateWhileFunctionType(&tf_op); + UpdateWhileFunctionType(tf_op); } else if (auto tf_op = llvm::dyn_cast(op)) { if (op->getAttr("T")) op->removeAttr(Identifier::get("T", context)); tf_op.getResult()->setType(tf_op.getOperand()->getType()); @@ -408,8 +499,8 @@ void LowerStaticTensorListPass::runOnModule() { /// Creates an instance of the TensorFlow Lite dialect LowerStaticTensorList /// pass. -ModulePassBase *TFL::CreateLowerStaticTensorListPass() { - return new LowerStaticTensorListPass(); +std::unique_ptr TFL::CreateLowerStaticTensorListPass() { + return std::make_unique(); } static PassRegistration pass( diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize.cc b/tensorflow/compiler/mlir/lite/transforms/optimize.cc index 8e3d9690486..33d85b633d5 100644 --- a/tensorflow/compiler/mlir/lite/transforms/optimize.cc +++ b/tensorflow/compiler/mlir/lite/transforms/optimize.cc @@ -21,14 +21,22 @@ limitations under the License. #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" +#include "llvm/Support/Casting.h" +#include "mlir/Dialect/StandardOps/Ops.h" // TF:local_config_mlir #include "mlir/IR/Attributes.h" // TF:local_config_mlir +#include "mlir/IR/Matchers.h" // TF:local_config_mlir #include "mlir/IR/PatternMatch.h" // TF:local_config_mlir +#include "mlir/IR/StandardTypes.h" // TF:local_config_mlir #include "mlir/Pass/Pass.h" // TF:local_config_mlir #include "mlir/Support/Functional.h" // TF:local_config_mlir +#include "mlir/Support/LLVM.h" // TF:local_config_mlir #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" #include "tensorflow/compiler/mlir/lite/utils/validators.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" namespace mlir { namespace TFL { @@ -37,85 +45,166 @@ namespace TFL { // The actual Optimize Pass. namespace { +using ::llvm::cast; + // Optimize TFLite operations in functions. struct Optimize : public FunctionPass { void runOnFunction() override; }; +// Returns whether the given type `a` is broadcast-compatible with `b`. 
+bool IsBroadcastableElementsAttrAndType(Type a, Type b) { + return OpTrait::util::getBroadcastedType(a, b) != Type(); +} + // Returns whether the given `a` and `b` ElementsAttr have broadcast-compatible // types. bool IsBroadcastableElementsAttrs(Attribute a, Attribute b) { - return OpTrait::util::getBroadcastedType(a.getType(), b.getType()) != Type(); + return IsBroadcastableElementsAttrAndType(a.getType(), b.getType()); } #include "tensorflow/compiler/mlir/lite/transforms/generated_optimize.inc" -// Fuse Add with FullyConnected. -// Note that this assumes that the bias in the fullyConnected -// is always None. + +// Fuse Add with proceeding FullyConnected. // TODO(b/136285429): Move to tablegen when variadic is supported -// and add support for bias with noneType type. -struct FuseFullyConnectedAndAdd : public RewritePattern { - explicit FuseFullyConnectedAndAdd(MLIRContext *context) - : RewritePattern(TFL::AddOp::getOperationName(), - {"tfl.fully_connected", "tfl.add", "std.constant"}, 4, - context) {} +struct FuseFullyConnectedAndAdd : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; - PatternMatchResult matchAndRewrite(Operation *add_op, + PatternMatchResult matchAndRewrite(TFL::AddOp add_op, PatternRewriter &rewriter) const override { + // Add. + DenseElementsAttr added_value; + Value *constant_val = add_op.rhs(); + if (!matchPattern(constant_val, m_Constant(&added_value))) + return matchFailure(); + // Fully Connected. - Operation *fully_connected = add_op->getOperand(0)->getDefiningOp(); - if (!fully_connected || !isa(fully_connected)) + auto fc_op = + dyn_cast_or_null(add_op.lhs()->getDefiningOp()); + if (!fc_op) return matchFailure(); + + Value *filter = fc_op.filter(); + Value *bias = fc_op.bias(); + ElementsAttr bias_value; + const bool is_none_bias = bias->getType().isa(); + if (!is_none_bias && !matchPattern(bias, m_Constant(&bias_value))) return matchFailure(); - TFL::FullyConnectedOp fully_connected_op = - llvm::cast(fully_connected); - Value *input = fully_connected_op.input(); - Value *filter = fully_connected_op.filter(); - - // Make sure the bias is None. - // TODO(karimnosseir): Support non None case. - Operation *bias_op = fully_connected_op.bias()->getDefiningOp(); - if (!bias_op || !isa(bias_op)) return matchFailure(); - if (!fully_connected_op.bias()->getType().isa()) - return matchFailure(); - - auto activation_func = fully_connected_op.getAttrOfType( - "fused_activation_function"); - if (!activation_func) return matchFailure(); - if (activation_func.cast().getValue() != "NONE") - return matchFailure(); - - auto weight_format = - fully_connected_op.getAttrOfType("weights_format"); - if (!weight_format) return matchFailure(); - - auto keep_num_dims = - fully_connected_op.getAttrOfType("keep_num_dims"); - if (!keep_num_dims) return matchFailure(); - - auto constant_op = add_op->getOperand(1)->getDefiningOp(); - if (!constant_op) return matchFailure(); - if (!isa(constant_op)) return matchFailure(); - - auto add_value = constant_op->getAttrOfType("value"); - if (!add_value) return matchFailure(); - if (!((add_value.cast().getType().getElementType().isF32()))) - return matchFailure(); - - auto fused_activation_func = - add_op->getAttrOfType("fused_activation_function"); - if (!fused_activation_func) return matchFailure(); + if (fc_op.fused_activation_function() != "NONE") return matchFailure(); // Rewrite - // TODO(karimnosseir): Check what constraints needed to apply. - // TODO(b/136171362): Check for single output consumer. 
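In effect, the rewrite below folds the addition into the fully-connected bias (illustrative IR; value names are hypothetical):

    // %fc  = tfl.fully_connected(%in, %filter, %bias) {fused_activation_function = "NONE"}
    // %out = tfl.add(%fc, %cst)
    //   becomes
    // %new_bias = tfl.add(%bias, %cst) {fused_activation_function = "NONE"}
    //             (or just %cst when %bias is none)
    // %out = tfl.fully_connected(%in, %filter, %new_bias)
    // The Add's own fused_activation_function carries over to the new
    // fully_connected op.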
+ Location loc = fc_op.getLoc(); + // If bias isn't None, it needs to be added as well. + if (is_none_bias) { + bias = constant_val; + } else { + auto none_af = rewriter.getStringAttr("NONE"); + bias = rewriter.create(loc, bias, constant_val, none_af).output(); + } rewriter.replaceOpWithNewOp( - add_op, add_op->getResult(0)->getType(), - /*input=*/input, + add_op, add_op.getType(), + /*input=*/fc_op.input(), /*filter=*/filter, - /*bias=*/add_op->getOperand(1), - /*fused_activation_function=*/fused_activation_func, - /*weights_format=*/weight_format, - /*keep_num_dims=*/keep_num_dims); + /*bias=*/bias, + /*fused_activation_function=*/ + rewriter.getStringAttr(add_op.fused_activation_function()), + /*weights_format=*/rewriter.getStringAttr(fc_op.weights_format()), + /*keep_num_dims=*/rewriter.getBoolAttr(fc_op.keep_num_dims())); + + return matchSuccess(); + } +}; + +// TODO(b/136285429): Move to tablegen when variadic is supported. +struct FuseFullyConnectedAndRelu : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + PatternMatchResult matchAndRewrite(TFL::ReluOp relu_op, + PatternRewriter &rewriter) const override { + Operation *input = relu_op.getOperand()->getDefiningOp(); + if (!isa_and_nonnull(input)) return matchFailure(); + auto fully_connected_op = cast(input); + if (fully_connected_op.fused_activation_function() != "NONE") + return matchFailure(); + + auto new_activation_func = rewriter.getStringAttr("RELU"); + auto new_weights_format = + rewriter.getStringAttr(fully_connected_op.weights_format()); + auto new_keep_num_dims = + rewriter.getBoolAttr(fully_connected_op.keep_num_dims()); + rewriter.replaceOpWithNewOp( + relu_op, relu_op.getType(), fully_connected_op.input(), + fully_connected_op.filter(), fully_connected_op.bias(), + new_activation_func, new_weights_format, new_keep_num_dims); + + return matchSuccess(); + } +}; + +// Fuse Mul with proceeding FullyConnected. +// TODO(b/136285429): Move to tablegen when variadic is supported +struct FuseFullyConnectedAndMul : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + PatternMatchResult matchAndRewrite(TFL::MulOp mul_op, + PatternRewriter &rewriter) const override { + // Mul. + DenseElementsAttr cst; + Value *constant_val = mul_op.rhs(); + if (!matchPattern(constant_val, m_Constant(&cst))) return matchFailure(); + + // Fully Connected. + auto fc_op = + dyn_cast_or_null(mul_op.lhs()->getDefiningOp()); + if (!fc_op) return matchFailure(); + Value *filter = fc_op.filter(); + Value *bias = fc_op.bias(); + ElementsAttr cst_tmp; + if (!matchPattern(filter, m_Constant(&cst_tmp))) return matchFailure(); + if (!bias->getType().isa() && + !matchPattern(bias, m_Constant(&cst_tmp))) + return matchFailure(); + if (fc_op.fused_activation_function().equals("None")) return matchFailure(); + + // Broadcast the constant operand of Mul if it isn't compatible to the + // filter input. We only support broadcasting the operand along the depth + // dimension, when the operand's depth is 1. 
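// Illustrative shapes (assumed): a filter of type tensor<16x4xf32> and a mul
// constant of type tensor<16xf32> are not broadcast-compatible, so the constant
// is reshaped to tensor<16x1xf32>, which broadcasts against the filter along
// the depth (last) dimension; any other shape mismatch still fails the match.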
+ Value *new_const_val = constant_val; + if (!IsBroadcastableElementsAttrAndType(cst.getType(), filter->getType())) { + auto original_shape = cst.getType().getShape(); + llvm::SmallVector normalized_shape(original_shape.begin(), + original_shape.end()); + normalized_shape.push_back(1); + auto new_cst = cst.reshape(rewriter.getTensorType( + normalized_shape, cst.getType().getElementType())); + Type new_type = new_cst.getType(); + if (!IsBroadcastableElementsAttrAndType(new_type, filter->getType())) { + return matchFailure(); + } + auto new_op = + rewriter.create(mul_op.getLoc(), new_type, new_cst); + new_const_val = new_op.getResult(); + } + + // Rewrite. Since the folder of TFL::MulOp couldn't broadcast the operands, + // TF::MulOp is used to fold the constant. + // TODO(b/139192933): switch to the TFL constant folding + Location loc = fc_op.getLoc(); + auto new_filter = + rewriter.create(loc, filter, new_const_val).z(); + // If bias isn't None, it needs to be multiplied as well. + if (!bias->getType().isa()) { + bias = rewriter.create(loc, bias, constant_val).z(); + } + + rewriter.replaceOpWithNewOp( + mul_op, mul_op.getType(), + /*input=*/fc_op.input(), + /*filter=*/new_filter, + /*bias=*/bias, + /*fused_activation_function=*/ + rewriter.getStringAttr(mul_op.fused_activation_function()), + /*weights_format=*/rewriter.getStringAttr(fc_op.weights_format()), + /*keep_num_dims=*/rewriter.getBoolAttr(fc_op.keep_num_dims())); return matchSuccess(); } @@ -154,12 +243,12 @@ struct PadStridedSliceDims : public RewritePattern { // Insert a new reshape op. Value *original_input = strided_slice.input(); - const RankedTensorType &original_input_type = - original_input->getType().template cast(); + RankedTensorType original_input_type = + original_input->getType().cast(); const ArrayRef &original_input_shape = original_input_type.getShape(); - const RankedTensorType &begin_type = - strided_slice.begin()->getType().template cast(); + RankedTensorType begin_type = + strided_slice.begin()->getType().cast(); const int dim_size = begin_type.getShape()[0]; SmallVector new_shape; int mask = 1; @@ -204,19 +293,22 @@ struct PadStridedSliceDims : public RewritePattern { void Optimize::runOnFunction() { OwningRewritePatternList patterns; + auto *ctx = &getContext(); auto func = getFunction(); + // Add the generated patterns to the list. - TFL::populateWithGenerated(&getContext(), &patterns); - patterns.push_back( - llvm::make_unique(&getContext())); - patterns.push_back(llvm::make_unique(&getContext())); - applyPatternsGreedily(func, std::move(patterns)); + TFL::populateWithGenerated(ctx, &patterns); + patterns.insert(ctx); + applyPatternsGreedily(func, patterns); } } // namespace // Creates an instance of the TensorFlow Lite dialect Optimize pass. -FunctionPassBase *CreateOptimizePass() { return new Optimize(); } +std::unique_ptr CreateOptimizePass() { + return std::make_unique(); +} static PassRegistration pass( "tfl-optimize", "Optimize within the TensorFlow Lite dialect"); diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td b/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td index 6d7e3aa24db..51610832db6 100644 --- a/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td @@ -16,7 +16,7 @@ limitations under the License. // This is the optimization pattern definition file for TensorFlow Lite. 
include "mlir/IR/OpBase.td" -include "mlir/StandardOps/Ops.td" +include "mlir/Dialect/StandardOps/Ops.td" include "tensorflow/compiler/mlir/lite/ir/tfl_ops.td" def F32ElementsAttr : ElementsAttrBase< @@ -110,3 +110,40 @@ def : Pat<(TFL_MulOp (TFL_DepthwiseConv2DOp $input, // with the same scale. We want to remove the redundancy. // TODO(fengliuai): move this to the sanity check of pre-quantize pass. def : Pat<(TFL_QuantizeOp (TFL_DequantizeOp $in), $qt), (replaceWithValue $in)>; + +// Constraint that makes sure both operands are the same operands. +def EqualOperands : Constraint>; + +// Checks if the operand has rank == n +class OperandHasRank : Constraint< + CPred<"$0->getType().cast().getRank() == " # n>>; + +// This pattern constructs L2NormalizationOp from +// Mul->Rsqrt->Sum->Square +// Currently L2Normalization doesn't support activation function +// in TFLite. +def : Pat<(TFL_MulOp $operand1, + (TFL_RsqrtOp + (TFL_SumOp + (TFL_SquareOp $square_operand), + (ConstantOp I32ElementsAttr:$constant), + $keep_dims)), + TFL_AF_None), + (TFL_L2NormalizationOp $operand1, TFL_AF_None), + [(EqualOperands $operand1, $square_operand), + (OperandHasRank<1> $operand1)]>; + +// This pattern constructs L2NormalizationOp from +// Div->sqrt->Sum->Square +// Currently L2Normalization doesn't support activation function +// in TFLite. +def : Pat<(TFL_DivOp $operand1, + (TFL_SqrtOp + (TFL_SumOp + (TFL_SquareOp $square_operand), + (ConstantOp I32ElementsAttr:$constant), + $keep_dims)), + TFL_AF_None), + (TFL_L2NormalizationOp $operand1, TFL_AF_None), + [(EqualOperands $operand1, $square_operand), + (OperandHasRank<1> $operand1)]>; diff --git a/tensorflow/compiler/mlir/lite/transforms/passes.h b/tensorflow/compiler/mlir/lite/transforms/passes.h index 561c0de815f..fb01ba0e9c8 100644 --- a/tensorflow/compiler/mlir/lite/transforms/passes.h +++ b/tensorflow/compiler/mlir/lite/transforms/passes.h @@ -16,6 +16,10 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_PASSES_H_ #define TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_PASSES_H_ +#include + +#include "llvm/ADT/ArrayRef.h" + namespace mlir { class FunctionPassBase; class ModulePassBase; @@ -23,29 +27,47 @@ class ModulePassBase; namespace TFL { // Creates an instance of the TensorFlow Lite dialect LegalizeTF pass. -FunctionPassBase *CreateLegalizeTFPass(); +std::unique_ptr CreateLegalizeTFPass(); // Creates an instance of the TensorFlow Lite dialect Optimize pass. -FunctionPassBase *CreateOptimizePass(); +std::unique_ptr CreateOptimizePass(); // Creates an instance of the TensorFlow Lite dialect PrepareTF pass. -FunctionPassBase *CreatePrepareTFPass(); +std::unique_ptr CreatePrepareTFPass(); // Creates an instance of the TensorFlow Lite dialect LowerStaticTensorList // pass. -ModulePassBase *CreateLowerStaticTensorListPass(); +std::unique_ptr CreateLowerStaticTensorListPass(); // Creates an instance of the TensorFlow Lite dialect Quantize pass. -FunctionPassBase *CreateQuantizePass(); +std::unique_ptr CreateQuantizePass(); // Creates an instance of the TensorFlow Lite dialect PrepareQuantize pass. // When `quantize_sign` is true, constant tensors will use int8 quantization // scheme. // TODO(fengliuai): make the bit width configurable. -FunctionPassBase *CreatePrepareQuantizePass(bool quantize_sign); +std::unique_ptr CreatePrepareQuantizePass(bool quantize_sign); // Creates a instance of the TensorFlow Lite dialect PostQuantize pass. 
-FunctionPassBase *CreatePostQuantizePass(bool emit_quant_adaptor_ops); +std::unique_ptr CreatePostQuantizePass( + bool emit_quant_adaptor_ops); + +// Creates an instance of the TensorFlow Lite dialect TrimFunctions +// pass. +std::unique_ptr CreateTrimFunctionsPass( + llvm::ArrayRef trim_funcs_whitelist); + +// Creates an instance of the TensorFlow Lite dialect PrepareCompositeFunctions +// pass. +std::unique_ptr CreatePrepareCompositeFunctionsPass(); + +// Creates a instance of the TensorFlow Lite dialect ExtractOphint pass. +std::unique_ptr CreateExtractOphintPass(); + +// Creates a instance of the TensorFlow Lite dialect LegalizeOphintFuncOpPass +// pass. The composite op is created from the ophint extraction pass. +std::unique_ptr CreateLegalizeOphintFuncOpPass(); + } // namespace TFL } // namespace mlir diff --git a/tensorflow/compiler/mlir/lite/transforms/post_quantize.cc b/tensorflow/compiler/mlir/lite/transforms/post_quantize.cc index 94c19d27adc..17e715960d9 100644 --- a/tensorflow/compiler/mlir/lite/transforms/post_quantize.cc +++ b/tensorflow/compiler/mlir/lite/transforms/post_quantize.cc @@ -18,8 +18,8 @@ limitations under the License. #include "mlir/IR/MLIRContext.h" // TF:local_config_mlir #include "mlir/Pass/Pass.h" // TF:local_config_mlir #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" -#include "tensorflow/compiler/mlir/lite/utils/quantization_utils.h" //===----------------------------------------------------------------------===// // The post-quantize Pass. @@ -125,8 +125,9 @@ void PostQuantizePass::runOnFunction() { } // namespace // Creates an instance of the TensorFlow Lite dialect PostQuantize pass. -FunctionPassBase* CreatePostQuantizePass(bool emit_quant_adaptor_ops) { - return new PostQuantizePass(emit_quant_adaptor_ops); +std::unique_ptr CreatePostQuantizePass( + bool emit_quant_adaptor_ops) { + return std::make_unique(emit_quant_adaptor_ops); } static PassRegistration pass( diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_composite_functions_tf.cc b/tensorflow/compiler/mlir/lite/transforms/prepare_composite_functions_tf.cc new file mode 100644 index 00000000000..58e58c05c4d --- /dev/null +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_composite_functions_tf.cc @@ -0,0 +1,124 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/None.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/Casting.h" +#include "mlir/Dialect/StandardOps/Ops.h" // TF:local_config_mlir +#include "mlir/IR/Attributes.h" // TF:local_config_mlir +#include "mlir/IR/Builders.h" // TF:local_config_mlir +#include "mlir/IR/Function.h" // TF:local_config_mlir +#include "mlir/IR/Identifier.h" // TF:local_config_mlir +#include "mlir/IR/Location.h" // TF:local_config_mlir +#include "mlir/IR/MLIRContext.h" // TF:local_config_mlir +#include "mlir/IR/Module.h" // TF:local_config_mlir +#include "mlir/IR/Operation.h" // TF:local_config_mlir +#include "mlir/IR/StandardTypes.h" // TF:local_config_mlir +#include "mlir/IR/SymbolTable.h" // TF:local_config_mlir +#include "mlir/Pass/Pass.h" // TF:local_config_mlir +#include "mlir/Support/LogicalResult.h" // TF:local_config_mlir +#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/lite/transforms/passes.h" + +namespace mlir { +namespace TFL { +namespace { + +// Abstracts the conversion of the embedded lookup composite function. +class ConvertEmbeddedLookupFunc { + public: + explicit ConvertEmbeddedLookupFunc(FuncOp func) : func_(func) {} + + void RewriteFunc() { + func_.eraseBody(); + func_.addEntryBlock(); + func_.setAttr( + "tf._implements", + StringAttr::get("fused_tfl_embedding_lookup", func_.getContext())); + Value* lookup = func_.getArgument(1); + Value* value = func_.getArgument(0); + auto output_type = func_.getType().getResult(0); + + OpBuilder builder(func_.getBody()); + auto op = builder.create( + func_.getLoc(), output_type, lookup, value); + + builder.create(func_.getLoc(), op.getResult()); + } + + LogicalResult VerifySignature() { + if (func_.getNumArguments() != 2) { + return func_.emitError() + << "Invalid number of arguments in the embedding " + "matmal composite function"; + } + if (func_.getType().getNumResults() != 1) { + return func_.emitError() << "Invalid number of results in the embedding " + "matmal composite function"; + } + return success(); + } + + private: + FuncOp func_; +}; + +// This pass uses mechanisms listed in RFC: +// https://github.com/tensorflow/community/pull/113 +// It prepares composite functions that are attributed to indicate +// a specific interface (LSTM, SVDF, Embedding lookup etc.) by replacing the +// body with the corresponding fused TFLite op. The replacement need not always +// be a fused op, though that is the primary use case. +class PrepareCompositeFunctionsPass + : public FunctionPass { + public: + explicit PrepareCompositeFunctionsPass() {} + + private: + void runOnFunction() override; +}; + +void PrepareCompositeFunctionsPass::runOnFunction() { + // TODO(ashwinm): Explore if we can generalize this pass by simply taking + // a map and doing the transform. This should be + // revisited after we add LSTM composite op to this pass. + auto func = getFunction(); + auto attr = func.getAttrOfType("tf._implements"); + if (!attr || attr.getValue() != "embedding_matmul") return; + // Convert the composite embedding_matmul function body to a + // TFLite fused embedding_lookup op. 
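// Rough before/after sketch (signature and shapes assumed for illustration):
//   func @embedding(%value: tensor<1000x64xf32>, %lookup: tensor<?xi32>)
//       -> tensor<?x64xf32> attributes {tf._implements = "embedding_matmul"}
// has its body replaced by a single fused lookup, roughly
//   %0 = "tfl.embedding_lookup"(%lookup, %value)
//       : (tensor<?xi32>, tensor<1000x64xf32>) -> tensor<?x64xf32>
//   return %0
// and the attribute is rewritten to "fused_tfl_embedding_lookup".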
+ ConvertEmbeddedLookupFunc convert_embedded_lookup(func); + if (failed(convert_embedded_lookup.VerifySignature())) { + return signalPassFailure(); + } + convert_embedded_lookup.RewriteFunc(); +} +} // namespace + +std::unique_ptr CreatePrepareCompositeFunctionsPass() { + return std::unique_ptr(); +} + +static PassRegistration pass( + "tfl-prepare-composite-funcs-tf", + "Prepares composite functions in Tensorflow dialect of MLIR "); + +} // namespace TFL +} // namespace mlir diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_patterns.td b/tensorflow/compiler/mlir/lite/transforms/prepare_patterns.td index 62c3de86e72..e3dabb7a48d 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_patterns.td @@ -18,7 +18,7 @@ include "tensorflow/compiler/mlir/lite/ir/tfl_ops.td" def FalseBoolAttr : AttrConstraint>; -// Converts tf.FusedBatchNorm into a sequence of more primitive arithmetic +// Converts tf.FusedBatchNorm & tf.FusedBatchNormV3 into a sequence of more primitive arithmetic // operations. Specifically, performs the following calculation: // // (x - mean) * scale / sqrt(variance + epsilon) + offset @@ -29,9 +29,9 @@ def FalseBoolAttr : AttrConstraint>; // is then to compute // (x * multiplier) + (offset - mean * multiplier). def : Pattern< - (TF_FusedBatchNormOp $x, $scale, $offset, $mean, $variance, - F32Attr:$epsilon, $data_format, - FalseBoolAttr:$is_training), + (TF_FusedBatchNormOp:$root + $x, $scale, $offset, $mean, $variance, + F32Attr:$epsilon, $data_format, FalseBoolAttr:$is_training), [(TF_AddOp (TF_MulOp $x, @@ -41,21 +41,40 @@ def : Pattern< (TF_AddOp $variance, (TF_ConstOp $epsilon))))), (TF_SubOp $offset, (TF_MulOp $mean, $multiplier))), - /*batch_mean=*/(verifyUnusedValue), - /*batch_variance=*/(verifyUnusedValue), - /*reserve_space_1=*/(verifyUnusedValue), - /*reserve_space_2=*/(verifyUnusedValue) - ]>; + // We already guaranteed that the last four results has no use so it does + // not matter what value we provide here for replacement. + /*batch_mean=*/(replaceWithValue $x), + /*batch_variance=*/(replaceWithValue $x), + /*reserve_space_1=*/(replaceWithValue $x), + /*reserve_space_2=*/(replaceWithValue $x)], + [(HasNoUseOf:$root__1), (HasNoUseOf:$root__2), + (HasNoUseOf:$root__3), (HasNoUseOf:$root__4)]>; -// TODO(jpienaar): Move to opbase something more general. -def TFi32ElementsAttr : Attr">, - "scalar int attribute"> { - let storageType = [{ DenseIntElementAttr }]; - let constBuilderCall = "$_builder.getDenseElementsAttr(" - "$_builder.getTensorType({}, $_builder.getIntegerType(32)), " - "{$_builder.getI32IntegerAttr($0)})"; -} -class TFi32 : ConstantAttr(v)>; +def : Pattern< + (TF_FusedBatchNormV3Op:$root + $x, $scale, $offset, $mean, $variance, + F32Attr:$epsilon, $data_format, FalseBoolAttr:$is_training), + [(TF_AddOp + (TF_MulOp + $x, + (TF_MulOp:$multiplier + $scale, + (TF_RsqrtOp + (TF_AddOp $variance, + (TF_ConstOp $epsilon))))), + (TF_SubOp $offset, (TF_MulOp $mean, $multiplier))), + // We already guaranteed that the last five results have no use so it does + // not matter what value we provide here for replacement. 
+ /*batch_mean=*/(replaceWithValue $x), + /*batch_variance=*/(replaceWithValue $x), + /*reserve_space_1=*/(replaceWithValue $x), + /*reserve_space_2=*/(replaceWithValue $x), + /*reserve_space_3=*/(replaceWithValue $x)], + [(HasNoUseOf:$root__1), (HasNoUseOf:$root__2), + (HasNoUseOf:$root__3), (HasNoUseOf:$root__4), + (HasNoUseOf:$root__5)]>; + +class TFi32 : ConstantAttr(v)>; // Matmul without transpose on b to matmul with explicit transpose op and // transposed b. @@ -75,10 +94,14 @@ def : Pat<(TF_MatMulOp $a, $b, ConstBoolAttrTrue, $bt), /*delta=*/(ConstantOp TFi32<-1>)), (ConstantOp TFi32<1>))), $b, ConstBoolAttrFalse, $bt)>; +def : Pat<(TF_SnapshotOp $arg), (TF_IdentityOp $arg)>; +def : Pat<(TF_StopGradientOp $arg), (TF_IdentityOp $arg)>; + //===----------------------------------------------------------------------===// // Op removal patterns. //===----------------------------------------------------------------------===// def : Pat<(TF_IdentityOp $arg), (replaceWithValue $arg)>; +def : Pat<(TF_IdentityNOp $arg), (replaceWithValue $arg)>; //===----------------------------------------------------------------------===// // Op quantization pass-through patterns. @@ -98,3 +121,27 @@ def : Pat<(TF_ReshapeOp $shape), (TF_FakeQuantWithMinMaxVarsOp (TF_ReshapeOp $input, $shape), $min, $max, $num_bits, $narrow_range)>; + +// Casts result type of $1 to a quantized type by using the quantization +// parameters from the type in $0. +def UpdateShape : NativeCodeCall< + "CastQuantizedTypeAttrFromExpressedType($_builder, $0, $1->getType())">; + +// When the op is passing-through, the output types of the quantized ops need +// to be updated as well. Since the quantize op manages its own type by the +// "qtype" attribute, we should update the type shape in this attribute. +def : Pat<(TF_TransposeOp:$old_value + (TFL_DequantizeOp (TFL_QuantizeOp $input, $qtype)), $perm), + (TFL_DequantizeOp (TFL_QuantizeOp (TF_TransposeOp $input, $perm), + (UpdateShape $qtype, $old_value)))>; + +def : Pat<(TF_ReshapeOp:$old_value + (TFL_DequantizeOp (TFL_QuantizeOp $input, $qtype)), $shape), + (TFL_DequantizeOp + (TFL_QuantizeOp (TF_ReshapeOp $input, $shape), + (UpdateShape $qtype, $old_value)))>; + +// The Rank op produces result which is independent with the quantization +// parameters of the input, so we can remove the quantization ops. +def : Pat<(TF_RankOp (TFL_DequantizeOp (TFL_QuantizeOp $input, $qtype))), + (TF_RankOp $input)>; diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc b/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc index c91cdb3df45..9ad26e4d782 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc @@ -15,10 +15,12 @@ limitations under the License. // This transformation pass applies quantization propagation on TFLite dialect. +#include "absl/memory/memory.h" #include "mlir/IR/MLIRContext.h" // TF:local_config_mlir #include "mlir/Pass/Pass.h" // TF:local_config_mlir +#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" -#include "tensorflow/compiler/mlir/lite/utils/quantization_utils.h" //===----------------------------------------------------------------------===// // The prepare-quantize Pass. @@ -27,6 +29,7 @@ namespace mlir { namespace TFL { namespace { + // Applies prepare quantization on the model in TFL dialect. 
This pass runs // before the quantization pass and propagate the quantization parameters // across ops. This step is necessary for post-training quantization and also @@ -47,15 +50,19 @@ class PrepareQuantizePass : public FunctionPass { bool quantize_sign_; }; +#include "tensorflow/compiler/mlir/lite/utils/generated_op_quant_spec_getters.inc" + void PrepareQuantizePass::runOnFunction() { - ApplyQuantizationParamsPropagation(getFunction(), quantize_sign_); + ApplyQuantizationParamsPropagation(getFunction(), quantize_sign_, + GetOpQuantSpec); } } // namespace // Creates an instance of the TensorFlow Lite dialect PrepareQuantize pass. -FunctionPassBase *CreatePrepareQuantizePass(bool quantize_sign) { - return new PrepareQuantizePass(quantize_sign); +std::unique_ptr CreatePrepareQuantizePass( + bool quantize_sign) { + return std::make_unique(quantize_sign); } static PassRegistration pass( diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc b/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc index 6f2e9e6ea1e..7c7983ae254 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc @@ -48,9 +48,9 @@ limitations under the License. #include "mlir/Support/LLVM.h" // TF:local_config_mlir #include "mlir/Support/LogicalResult.h" // TF:local_config_mlir #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" #include "tensorflow/compiler/mlir/lite/utils/attribute_utils.h" -#include "tensorflow/compiler/mlir/lite/utils/quantization_utils.h" #include "tensorflow/compiler/mlir/lite/utils/validators.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" @@ -71,54 +71,79 @@ struct PrepareTFPass : public FunctionPass { }; // TODO(fengliuai): move this rule to PreparePatterns.td -// Inserts a "tfl.quantize" and "tfl.dequantize" op pair after the +// Inserts a "tfl.quantize" and "tfl.dequantize" op pair (QDQs) after the // "tf.FakeQuantWithMinMaxVarsOp" to be constant folded. Since the constant // folding logic will use a "std.constant" op to replace the // "tf.FakeQuantWithMinMaxVarsOp", the "tfl.quantize" op is used to preserve // the quantization parameters as a TypeAttr and "tfl.dequantize" op used to -// convert the output type to the next op. +// convert the output type to the next op. 
Here are the transformations: +// +// input min cst max cst input min cst max cst +// \ | | \ | | +// \ (tf.Identity) (tf.Identity) => \ (tf.Identity) (tf.Identity) +// \ | | \ | | +// tf.FakeQuantWithMinMaxVars tf.FakeQuantWithMinMaxVars +// | | +// tf.quantize +// | +// tf.dequantize +// | +// If the input is a constant, the result pattern will eventually converted to + +// quant-emulated input +// | +// tf.quantize +// | +// tf.dequantize +// | struct InsertTFLQuantOpsAfterTFFakeQuantOp : public RewritePattern { InsertTFLQuantOpsAfterTFFakeQuantOp(MLIRContext *context) - : RewritePattern(TF::FakeQuantWithMinMaxVarsOp::getOperationName(), 1, + : RewritePattern(TF::FakeQuantWithMinMaxVarsOp::getOperationName(), 3, context) {} - struct MatchedState : public PatternState { - FloatAttr min; - FloatAttr max; - APInt num_bits; - bool narrow_range; - }; - - PatternMatchResult match(Operation *op) const override { + PatternMatchResult matchAndRewrite(Operation *op, + PatternRewriter &rewriter) const override { auto tf_op = cast(op); + // We don't want to insert quantize/dequantize if the quantize op exists. auto res = tf_op.outputs(); if (!res->hasOneUse() || isa(*res->user_begin())) return matchFailure(); - auto state = absl::make_unique(); - ElementsAttr min_value, max_value; - if (!matchPattern(tf_op.min(), m_Constant(&min_value))) - return matchFailure(); - if (!matchPattern(tf_op.max(), m_Constant(&max_value))) - return matchFailure(); - state->min = ExtractSingleElementAsFloat(min_value); - state->max = ExtractSingleElementAsFloat(max_value); - if (!state->min || !state->max) return matchFailure(); - state->num_bits = tf_op.num_bits(); - state->narrow_range = tf_op.narrow_range(); - return matchSuccess(std::move(state)); - } - void rewrite(Operation *op, std::unique_ptr state, - PatternRewriter &rewriter) const override { - auto &s = *static_cast(state.get()); - Location loc = op->getLoc(); - Value *copied = OpBuilder(op).clone(*op)->getResult(0); - Type res_type = copied->getType(); - Type storage_type = rewriter.getIntegerType(s.num_bits.getSExtValue()); - TypeAttr qtype = GetQuantizedTypeAttr(rewriter, res_type, s.min, s.max, - storage_type, s.narrow_range); - Value *quantize_op = - rewriter.create(loc, qtype.getValue(), copied, qtype); - rewriter.replaceOpWithNewOp(op, res_type, quantize_op); + // Extract the min/max constant values from the operands. We also consider + // a special case that there are tf.Identity ops between the min/max + // constants and the tf.FakeQuantWithMinMaxVarsOp. + Value *min = tf_op.min(), *max = tf_op.max(); + ElementsAttr min_value, max_value; + if (auto id1 = dyn_cast_or_null(min->getDefiningOp())) + min = id1.input(); + if (auto id2 = dyn_cast_or_null(max->getDefiningOp())) + max = id2.input(); + if (!matchPattern(min, m_Constant(&min_value))) return matchFailure(); + if (!matchPattern(max, m_Constant(&max_value))) return matchFailure(); + FloatAttr min_attr = ExtractSingleElementAsFloat(min_value); + FloatAttr max_attr = ExtractSingleElementAsFloat(max_value); + if (!min_attr || !max_attr) return matchFailure(); + + // Use the min/max from the operands and the num_bits and narrow_range + // attribute to create the quantization parameter for the new quantize op. 
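// Worked example (values assumed; exact rounding and signedness depend on
// GetQuantizedTypeAttr): with min = -1.0, max = 1.0, num_bits = 8 and
// narrow_range = false, the scale is (max - min) / 255 ~= 0.007843 and the
// zero point lands near the middle of the storage range, giving a quantized
// element type along the lines of !quant.uniform<u8:f32, 0.007843:128>.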
+ rewriter.setInsertionPoint(op->getBlock(), ++Block::iterator(op)); + Type num_bits = rewriter.getIntegerType(tf_op.num_bits().getSExtValue()); + bool narrow_range = tf_op.narrow_range(); + Type res_type = tf_op.getType(); + TypeAttr qtype = GetQuantizedTypeAttr(rewriter, res_type, min_attr, + max_attr, num_bits, narrow_range); + + // Finally, use the quantization parameter to create the quantize and + // dequantize ops, and insert them between the tf.FakeQuantWithMinMaxVarsOp + // and its users. + Value *value = tf_op.outputs(); + auto quantize = rewriter.create( + op->getLoc(), qtype.getValue(), value, qtype); + auto dequantize = rewriter.create(op->getLoc(), res_type, + quantize.output()); + value->replaceAllUsesWith(dequantize); + quantize.getOperation()->replaceUsesOfWith(dequantize, value); + + return matchSuccess(); } }; @@ -170,7 +195,7 @@ struct ConvertTFConvOp : public RewritePattern { IntegerAttr height, width; if (!TFIntListIs1XY1(op, "strides", &height, &width)) return matchFailure(); - auto state = llvm::make_unique(); + auto state = std::make_unique(); state->stride_height = height; state->stride_width = width; @@ -352,25 +377,34 @@ class ConvertTFDepthwiseConv2dNative void PrepareTFPass::runOnFunction() { OwningRewritePatternList patterns; auto func = getFunction(); + // This pattern was intented to uses TFL QDQs to preserve the quantization + // parameters from the TF Quant ops, thus this pattern should run with the + // first `applyPatternsGreedily` method, which would otherwise removes the + // TF FakeQuant ops by the constant folding. + patterns.insert(&getContext()); TFL::populateWithGenerated(&getContext(), &patterns); // TODO(karimnosseir): Split to separate pass probably after // deciding on long term plan for this optimization. // This will allow optimizing any TF_Mul->TF_Conv in the graph // and any expanded from FusedBatchNorm. We need to do this // before converting TF_Conv to TFL_Conv - applyPatternsGreedily(func, std::move(patterns)); - patterns.push_back(llvm::make_unique(&getContext())); - patterns.push_back( - llvm::make_unique(&getContext())); - patterns.push_back( - llvm::make_unique(&getContext())); - applyPatternsGreedily(func, std::move(patterns)); + applyPatternsGreedily(func, patterns); + + // Load the generated pattern again, so new quantization pass-through + // will be applied. + patterns.clear(); + TFL::populateWithGenerated(&getContext(), &patterns); + patterns.insert( + &getContext()); + applyPatternsGreedily(func, patterns); } } // namespace // Creates an instance of the TensorFlow Lite dialect PrepareTF pass. -FunctionPassBase *CreatePrepareTFPass() { return new PrepareTFPass(); } +std::unique_ptr CreatePrepareTFPass() { + return std::make_unique(); +} static PassRegistration pass( "tfl-prepare-tf", "Prepare TF for legalization to TensorFlow Lite dialect"); diff --git a/tensorflow/compiler/mlir/lite/transforms/quantize.cc b/tensorflow/compiler/mlir/lite/transforms/quantize.cc index 91bb26a976b..e4029d7f13f 100644 --- a/tensorflow/compiler/mlir/lite/transforms/quantize.cc +++ b/tensorflow/compiler/mlir/lite/transforms/quantize.cc @@ -31,8 +31,8 @@ limitations under the License. 
#include "mlir/Pass/Pass.h" // TF:local_config_mlir #include "mlir/Support/Functional.h" // TF:local_config_mlir #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" -#include "tensorflow/compiler/mlir/lite/utils/quantization_utils.h" #include "tensorflow/compiler/mlir/lite/utils/validators.h" namespace mlir { @@ -55,14 +55,16 @@ void QuantizePass::runOnFunction() { auto func = getFunction(); auto* ctx = func.getContext(); TFL::populateWithGenerated(ctx, &patterns); - mlir::RewriteListBuilder>::build(patterns, ctx); - applyPatternsGreedily(func, std::move(patterns)); + patterns.insert>(ctx); + applyPatternsGreedily(func, patterns); } } // namespace // Creates an instance of the TensorFlow Lite dialect QuantizeTFL pass. -FunctionPassBase* CreateQuantizePass() { return new QuantizePass(); } +std::unique_ptr CreateQuantizePass() { + return std::make_unique(); +} static PassRegistration pass( "tfl-quantize", "Apply quantization on models in TensorFlow Lite dialect"); diff --git a/tensorflow/compiler/mlir/lite/transforms/quantize_patterns.td b/tensorflow/compiler/mlir/lite/transforms/quantize_patterns.td index 7fcf926d89f..369b5300540 100644 --- a/tensorflow/compiler/mlir/lite/transforms/quantize_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/quantize_patterns.td @@ -16,7 +16,7 @@ limitations under the License. // This is the quantization pattern definition file for TensorFlow Lite. include "mlir/IR/OpBase.td" -include "mlir/StandardOps/Ops.td" +include "mlir/Dialect/StandardOps/Ops.td" include "tensorflow/compiler/mlir/lite/ir/tfl_ops.td" // Quantize attribute $0 by using quantization parameter from %1. diff --git a/tensorflow/compiler/mlir/lite/transforms/trim_functions_tf.cc b/tensorflow/compiler/mlir/lite/transforms/trim_functions_tf.cc new file mode 100644 index 00000000000..1cd4f42810e --- /dev/null +++ b/tensorflow/compiler/mlir/lite/transforms/trim_functions_tf.cc @@ -0,0 +1,133 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/Support/CommandLine.h" +#include "mlir/Dialect/StandardOps/Ops.h" // TF:local_config_mlir +#include "mlir/IR/Builders.h" // TF:local_config_mlir +#include "mlir/IR/Identifier.h" // TF:local_config_mlir +#include "mlir/IR/Location.h" // TF:local_config_mlir +#include "mlir/IR/MLIRContext.h" // TF:local_config_mlir +#include "mlir/IR/SymbolTable.h" // TF:local_config_mlir +#include "mlir/Pass/Pass.h" // TF:local_config_mlir +#include "tensorflow/compiler/mlir/lite/transforms/passes.h" + +// The cmd line flag to specify the whitelist of functions. Rest are trimmed +// after this pass is run. 
+// NOLINTNEXTLINE +static llvm::cl::list trim_funcs_whitelist( + "tfl-trim-funcs-whitelist", llvm::cl::value_desc("list"), + llvm::cl::desc("comma seprarated list of whitelisted functions. The first " + "function specified will be used as main."), + llvm::cl::CommaSeparated); + +namespace mlir { +namespace TFL { +namespace { + +// The pass to trim functions before we legalize to TFL +// dialect using the specified whitelist. +class TrimFunctionsPass : public mlir::ModulePass { + public: + explicit TrimFunctionsPass() : trim_funcs_whitelist_(trim_funcs_whitelist) {} + explicit TrimFunctionsPass(llvm::ArrayRef trim_funcs_whitelist) + : trim_funcs_whitelist_(trim_funcs_whitelist) {} + + private: + void runOnModule() override; + bool TrimModule(); + void Verify(); + + llvm::ArrayRef trim_funcs_whitelist_; +}; + +void TrimFunctionsPass::runOnModule() { + // trim the functions in the module using the trim_funcs_whitelist_ + // by removing functions not in the whitelist. + if (TrimModule()) { + // verify the updated module is still valid, if not signal the + // pass as failed. + Verify(); + } +} + +bool TrimFunctionsPass::TrimModule() { + // if no trim_funcs_whitelist_ is specified, this pass is a no-op. + if (trim_funcs_whitelist_.empty()) return false; + + llvm::SmallVector funcs_to_trim; + for (auto func : getModule().getOps()) { + if (llvm::is_contained(trim_funcs_whitelist_, func.getName())) { + // If no main is specified in the whitelist, use the 1st func + // in trim_funcs_whitelist as the main. + // TODO(ashwinm): Currently tflite flatbuffer export assumes there is + // always a main. This is strictly not required for TFlite. We need to + // remove that restriction once we have support to attribute the main + // tensorflow function in MLIR TF import using an entry_point attr. + if (!llvm::is_contained(trim_funcs_whitelist_, "main") && + func.getName() == trim_funcs_whitelist_[0]) { + func.setName("main"); + } + } else { + funcs_to_trim.push_back(func); + } + } + + // remove all unexported functions from the module. + for (auto func : funcs_to_trim) { + func.erase(); + } + return true; +} + +// validate that all reachable functions from the remaining functions are +// also in the whitelist. +void TrimFunctionsPass::Verify() { + // TODO(ashwinm): Instead, we should make sure that references to all + // SymbolRefAttrs of all ops are present. + SymbolTable symbol_table = SymbolTable(getModule()); + llvm::SetVector reachable_funcs; + for (auto func : getModule().getOps()) { + func.walk([&](CallOp op) { + if (!symbol_table.lookup(op.getCallee())) { + getModule().emitError() + << func.getName() << " is not in the funcs whitelist"; + return signalPassFailure(); + } + }); + } +} + +} // namespace + +// Creates an instance of the TensorFlow Lite dialect TrimFunctions +/// pass. 
+std::unique_ptr CreateTrimFunctionsPass( + llvm::ArrayRef trim_funcs_whitelist) { + return std::make_unique(trim_funcs_whitelist); +} + +static PassRegistration pass( + "tfl-trim-funcs-tf", + "Trim functions to restrict them to a specified whitelist prior to " + "legalization to TensorFlow lite dialect"); + +} // namespace TFL +} // namespace mlir diff --git a/tensorflow/compiler/mlir/lite/utils/attribute_utils.cc b/tensorflow/compiler/mlir/lite/utils/attribute_utils.cc index a1a427a0381..33da9929711 100644 --- a/tensorflow/compiler/mlir/lite/utils/attribute_utils.cc +++ b/tensorflow/compiler/mlir/lite/utils/attribute_utils.cc @@ -25,7 +25,7 @@ FloatAttr ExtractSingleElementAsFloat(ElementsAttr attr) { return {}; } SmallVector index(attr.getType().getRank(), 0); - return attr.getValue(index).cast(); + return attr.getValue(index); } FloatAttr GetSingleElementAsFloatOrSelf(Attribute attr) { @@ -42,7 +42,7 @@ IntegerAttr ExtractSingleElementAsInteger(ElementsAttr attr) { return {}; } SmallVector index(attr.getType().getRank(), 0); - return attr.getValue(index).cast(); + return attr.getValue(index); } } // namespace TFL diff --git a/tensorflow/compiler/mlir/lite/utils/attribute_utils.h b/tensorflow/compiler/mlir/lite/utils/attribute_utils.h index efa782ce4e8..263a0a8dc93 100644 --- a/tensorflow/compiler/mlir/lite/utils/attribute_utils.h +++ b/tensorflow/compiler/mlir/lite/utils/attribute_utils.h @@ -19,7 +19,7 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_LITE_UTILS_ATTRIBUTE_UTILS_H_ #define TENSORFLOW_COMPILER_MLIR_LITE_UTILS_ATTRIBUTE_UTILS_H_ -#include "mlir/StandardOps/Ops.h" // TF:local_config_mlir +#include "mlir/Dialect/StandardOps/Ops.h" // TF:local_config_mlir namespace mlir { namespace TFL { diff --git a/tensorflow/compiler/mlir/lite/utils/convert_type.cc b/tensorflow/compiler/mlir/lite/utils/convert_type.cc new file mode 100644 index 00000000000..5dcd40aab6b --- /dev/null +++ b/tensorflow/compiler/mlir/lite/utils/convert_type.cc @@ -0,0 +1,77 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/mlir/lite/utils/convert_type.h" + +#include "mlir/IR/Builders.h" // TF:local_config_mlir +#include "mlir/IR/StandardTypes.h" // TF:local_config_mlir +#include "mlir/IR/Types.h" // TF:local_config_mlir +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/lite/schema/schema_generated.h" + +namespace tflite { + +mlir::Type ConvertElementType(tflite::TensorType type, mlir::Builder builder) { + switch (type) { + case tflite::TensorType_FLOAT32: + return builder.getF32Type(); + case tflite::TensorType_FLOAT16: + return builder.getF16Type(); + case tflite::TensorType_INT32: + return builder.getIntegerType(32); + case tflite::TensorType_UINT8: + return mlir::TF::Uint8Type::get(builder.getContext()); + case tflite::TensorType_INT64: + return builder.getIntegerType(64); + case tflite::TensorType_STRING: + return mlir::TF::StringType::get(builder.getContext()); + case tflite::TensorType_BOOL: + return builder.getI1Type(); + case tflite::TensorType_INT16: + return builder.getIntegerType(16); + case tflite::TensorType_COMPLEX64: + return mlir::TF::Complex64Type::get(builder.getContext()); + case tflite::TensorType_INT8: + return builder.getIntegerType(8); + } +} + +tensorflow::DataType TflTypeToTfType(tflite::TensorType type) { + switch (type) { + case tflite::TensorType_BOOL: + return tensorflow::DT_BOOL; + case tflite::TensorType_COMPLEX64: + return tensorflow::DT_COMPLEX64; + case tflite::TensorType_FLOAT16: + return tensorflow::DT_HALF; + case tflite::TensorType_FLOAT32: + return tensorflow::DT_FLOAT; + case tflite::TensorType_INT8: + return tensorflow::DT_INT8; + case tflite::TensorType_INT16: + return tensorflow::DT_INT16; + case tflite::TensorType_INT32: + return tensorflow::DT_INT32; + case tflite::TensorType_INT64: + return tensorflow::DT_INT64; + case tflite::TensorType_STRING: + return tensorflow::DT_STRING; + case tflite::TensorType_UINT8: + return tensorflow::DT_UINT8; + } +} + +} // namespace tflite diff --git a/tensorflow/compiler/mlir/lite/utils/convert_type.h b/tensorflow/compiler/mlir/lite/utils/convert_type.h new file mode 100644 index 00000000000..ff4ccb325a8 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/utils/convert_type.h @@ -0,0 +1,36 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_UTILS_CONVERT_TYPE_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_UTILS_CONVERT_TYPE_H_ + +#include "mlir/IR/Types.h" // TF:local_config_mlir +#include "tensorflow/core/framework/types.h" +#include "tensorflow/lite/schema/schema_generated.h" + +namespace mlir { +class Builder; +} + +namespace tflite { +// Convert the scalar type of a TFlite tensor to the corresponding MLIR type. 
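// For example (from the mapping above): TensorType_FLOAT32 -> f32,
// TensorType_INT64 -> i64, TensorType_BOOL -> i1, TensorType_UINT8 -> the
// TensorFlow dialect's uint8 type, and TensorType_STRING -> the TensorFlow
// dialect's string type.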
+mlir::Type ConvertElementType(tflite::TensorType type, mlir::Builder builder); + +// Convert the scalar type of a TFLite tensor to the corresponding +// Tensorflow type +tensorflow::DataType TflTypeToTfType(tflite::TensorType type); + +} // namespace tflite +#endif // TENSORFLOW_COMPILER_MLIR_LITE_UTILS_CONVERT_TYPE_H_ diff --git a/tensorflow/compiler/mlir/lite/utils/validators.h b/tensorflow/compiler/mlir/lite/utils/validators.h index 8cd375a61f7..c68cd0e8605 100644 --- a/tensorflow/compiler/mlir/lite/utils/validators.h +++ b/tensorflow/compiler/mlir/lite/utils/validators.h @@ -19,8 +19,8 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_LITE_UTILS_VALIDATORS_H_ #define TENSORFLOW_COMPILER_MLIR_LITE_UTILS_VALIDATORS_H_ +#include "mlir/Dialect/StandardOps/Ops.h" // TF:local_config_mlir #include "mlir/IR/StandardTypes.h" // TF:local_config_mlir -#include "mlir/StandardOps/Ops.h" // TF:local_config_mlir namespace mlir { namespace TFL { diff --git a/tensorflow/compiler/mlir/op_name_mapper.cc b/tensorflow/compiler/mlir/op_name_mapper.cc new file mode 100644 index 00000000000..cd0bc0d3e02 --- /dev/null +++ b/tensorflow/compiler/mlir/op_name_mapper.cc @@ -0,0 +1,86 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/op_name_mapper.h" + +#include "llvm/ADT/APInt.h" + +namespace tensorflow { + +using llvm::StringRef; +using mlir::Operation; + +OpNameMapper::~OpNameMapper() {} + +std::string OpNameMapper::GetUniqueName(llvm::StringRef prefix) { + std::string name = prefix; + auto& val = name_to_count_[name]; + if (!val) { + ++val; + return name; + } + + llvm::SmallString<64> probe_name(prefix); + while (true) { + probe_name.resize(prefix.size()); + // TODO(jpienaar): Subtract one so that the initial suffix is 0 instead + // of 1. + // TODO(jpienaar): Switch to radix 36 and update tests. + llvm::APInt(32, val++).toString(probe_name, /*Radix=*/10, + /*Signed=*/false); + if (!name_to_count_.count(probe_name)) { + name = llvm::StringRef(probe_name); + break; + } + } + return name; +} + +const std::string& OpNameMapper::GetUniqueName(Operation* op) { + auto& name = op_to_name_[op]; + if (!name.empty()) return name; + // Update the value in the map with unique name. + name = GetUniqueName(GetName(op)); + return name; +} + +int OpNameMapper::InitOpName(mlir::Operation* op, llvm::StringRef name) { + op_to_name_[op] = name; + return name_to_count_[name]++; +} + +std::string OpLocNameMapper::GetName(Operation* op) { + if (auto name_loc = op->getLoc().dyn_cast()) + return name_loc.getName().str(); + + if (auto call_loc = op->getLoc().dyn_cast()) { + // Return name if CallSiteLoc's callee has a NameLoc (as should be the case + // if imported with DebugInfo), else use the fallback naming scheme below. 
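// Illustrative: an op whose callee location wraps a NameLoc such as
// "model/dense/BiasAdd" (name assumed) maps back to that string; any other
// location falls through to the fallback below, which uses the op type string
// (e.g. "tf.BiasAdd").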
+ if (auto name_loc = call_loc.getCallee().dyn_cast()) + return name_loc.getName().str(); + } + + // If the location is none of the expected types, then simply use name + // generated using the op type. + return op->getName().getStringRef(); +} + +std::string OpStripNameMapper::GetName(Operation* op) { + return llvm::APInt(32, count_++) + .toString(/*Radix=*/36, + /*Signed=*/false); +} + +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/op_name_mapper.h b/tensorflow/compiler/mlir/op_name_mapper.h new file mode 100644 index 00000000000..2232ce2a80f --- /dev/null +++ b/tensorflow/compiler/mlir/op_name_mapper.h @@ -0,0 +1,73 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_OP_NAME_MAPPER_H_ +#define TENSORFLOW_COMPILER_MLIR_OP_NAME_MAPPER_H_ + +#include "absl/container/flat_hash_map.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" +#include "mlir/IR/Operation.h" // TF:local_config_mlir + +namespace tensorflow { + +// Mapper from operation to name. +class OpNameMapper { + public: + // Returns unique name for the operation. + const std::string& GetUniqueName(mlir::Operation* op); + + // Returns unique name for the given prefix. + std::string GetUniqueName(llvm::StringRef prefix); + + // Initializes operation to map to name. Returns number of operations already + // named 'name' which should be 0 else GetUniqueName could return the same + // names for different ops. + // Note: its up to the caller to decide the behavior when assigning two ops + // to the same name. + int InitOpName(mlir::Operation* op, llvm::StringRef name); + + virtual ~OpNameMapper(); + + private: + // Returns name from the location of the operation. + virtual std::string GetName(mlir::Operation* op) = 0; + + // Maps from op to name. + llvm::StringMap name_to_count_; + absl::flat_hash_map op_to_name_; +}; + +// OpNameMapper that returns, for ops not initialized to a specific name, a name +// based on the location of the operation. +class OpLocNameMapper : public OpNameMapper { + private: + std::string GetName(mlir::Operation* op) override; +}; + +// OpNameMapper that returns, for ops not initialized to a specific name, a +// short name. +class OpStripNameMapper : public OpNameMapper { + private: + std::string GetName(mlir::Operation* op) override; + + // Number of ops mapped. 
+ int count_ = 0; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_OP_NAME_MAPPER_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/BUILD b/tensorflow/compiler/mlir/tensorflow/BUILD index abe8df63b20..f696eab4d44 100644 --- a/tensorflow/compiler/mlir/tensorflow/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/BUILD @@ -1,5 +1,5 @@ load("@local_config_mlir//:tblgen.bzl", "gentbl") -load("//tensorflow:tensorflow.bzl", "tf_cc_binary", "tf_cc_test", "tf_native_cc_binary") +load("//tensorflow:tensorflow.bzl", "tf_cc_test", "tf_native_cc_binary") package( default_visibility = [":friends"], @@ -10,8 +10,6 @@ package_group( name = "friends", includes = ["@local_config_mlir//:subpackages"], packages = [ - "//learning/brain/experimental/mlir/...", - "//learning/brain/google/xla/...", "//tensorflow/compiler/mlir/...", "//tensorflow/python/...", ], @@ -70,7 +68,31 @@ gentbl( td_file = "ir/tf_executor_ops.td", td_srcs = [ "@local_config_mlir//:include/mlir/IR/OpBase.td", - "@local_config_mlir//:include/mlir/StandardOps/Ops.td", + "@local_config_mlir//:include/mlir/Dialect/StandardOps/Ops.td", + ], +) + +gentbl( + name = "tensorflow_device_ops_inc_gen", + tbl_outs = [ + ( + "-gen-op-decls", + "ir/tf_device.h.inc", + ), + ( + "-gen-op-defs", + "ir/tf_device.cc.inc", + ), + ( + "-gen-op-doc", + "g3doc/tf_device.md", + ), + ], + tblgen = "@local_config_mlir//:mlir-tblgen", + td_file = "ir/tf_device_ops.td", + td_srcs = [ + "@local_config_mlir//:include/mlir/IR/OpBase.td", + "@local_config_mlir//:include/mlir/Dialect/StandardOps/Ops.td", ], ) @@ -93,30 +115,41 @@ cc_library( name = "tensorflow", srcs = [ "ir/control_flow_ops.cc", + "ir/tf_device.cc", "ir/tf_executor.cc", "ir/tf_executor.cc.inc", "ir/tf_executor.h.inc", "ir/tf_ops.cc", "ir/tf_ops.cc.inc", "ir/tf_ops.h.inc", + "transforms/cluster_formation.cc", + "transforms/cluster_outlining.cc", + "transforms/executor_island_coarsening.cc", "transforms/functional_control_flow_to_cfg.cc", "transforms/generated_canonicalize.inc", "transforms/generated_optimize.inc", + "transforms/graph_pruning.cc", "transforms/optimize.cc", "transforms/raise_control_flow.cc", + "transforms/tpu_rewrite_pass.cc", "translate/control_to_executor_dialect.cc", + "translate/executor_to_control_dialect.cc", ], hdrs = [ "ir/control_flow_ops.h", + "ir/tf_device.h", "ir/tf_executor.h", "ir/tf_ops.h", + "ir/tf_traits.h", "ir/tf_types.def", "ir/tf_types.h", "transforms/passes.h", ], + copts = ["-std=c++14"], includes = ["include"], deps = [ ":tensorflow_canonicalize_inc_gen", + ":tensorflow_device_ops_inc_gen", ":tensorflow_executor_inc_gen", ":tensorflow_ops_inc_gen", ":tensorflow_optimize_inc_gen", @@ -131,7 +164,6 @@ cc_library( "@local_config_mlir//:StandardOps", "@local_config_mlir//:Support", "@local_config_mlir//:TransformUtils", - "@local_config_mlir//:TypeUtilities", ], # TODO(jpienaar): Merge in the dialect registration. 
alwayslink = 1, @@ -141,6 +173,7 @@ cc_library( cc_library( name = "tensorflow_dialect_registration", srcs = ["ir/dialect_registration.cc"], + copts = ["-std=c++14"], deps = [ ":tensorflow", "@local_config_mlir//:IR", @@ -152,12 +185,13 @@ cc_library( name = "convert_graphdef", srcs = [ "translate/export_graphdef.cc", - "translate/import_graphdef.cc", + "translate/import_model.cc", ], hdrs = [ "translate/export_graphdef.h", - "translate/import_graphdef.h", + "translate/import_model.h", ], + copts = ["-std=c++14"], deps = [ ":convert_tensor", ":convert_type", @@ -166,6 +200,7 @@ cc_library( ":mangling_util", ":mlir_roundtrip_flags", ":tensorflow", + "//tensorflow/cc/saved_model:loader_lite", "//tensorflow/compiler/jit:shape_inference_helpers", "//tensorflow/compiler/xla:status_macros", "//tensorflow/core:core_cpu", @@ -173,6 +208,7 @@ cc_library( "//tensorflow/core:graph", "//tensorflow/core:lib", "//tensorflow/core:protos_all_proto_cc", + "//tensorflow/core/platform:types", "//tensorflow/stream_executor/lib", "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/container:flat_hash_map", @@ -182,6 +218,7 @@ cc_library( "@com_google_absl//absl/types:optional", "@llvm//:support", "@local_config_mlir//:IR", + "@local_config_mlir//:Pass", "@local_config_mlir//:StandardDialectRegistration", "@local_config_mlir//:StandardOps", "@local_config_mlir//:Support", @@ -196,6 +233,7 @@ cc_library( hdrs = [ "utils/import_utils.h", ], + copts = ["-std=c++14"], deps = [ ":error_util", "//tensorflow/core:lib", @@ -213,6 +251,7 @@ cc_library( hdrs = [ "utils/export_utils.h", ], + copts = ["-std=c++14"], deps = [ ":convert_tensor", ":convert_type", @@ -244,6 +283,7 @@ cc_library( hdrs = [ "translate/export_tf_dialect_op.h", ], + copts = ["-std=c++14"], deps = [ ":convert_type", ":export_utils", @@ -252,6 +292,8 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core:protos_all_proto_cc", "//tensorflow/stream_executor/lib", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/strings", "@llvm//:support", "@local_config_mlir//:IR", ], @@ -260,6 +302,7 @@ cc_library( cc_library( name = "translate_tf_dialect_op", srcs = ["translate/translate_tf_dialect_op.cc"], + copts = ["-std=c++14"], deps = [ ":export_tf_dialect_op", "@llvm//:support", @@ -274,6 +317,7 @@ cc_library( name = "mlir_roundtrip_pass", srcs = ["translate/mlir_roundtrip_pass.cc"], hdrs = ["translate/mlir_roundtrip_pass.h"], + copts = ["-std=c++14"], deps = [ ":convert_graphdef", ":mlir_roundtrip_flags", @@ -281,15 +325,18 @@ cc_library( "//tensorflow/core:core_cpu_lib", "//tensorflow/core:lib", "//tensorflow/core:protos_all_proto_cc", + "@local_config_mlir//:Analysis", "@local_config_mlir//:IR", "@local_config_mlir//:StandardOps", ], + alwayslink = 1, ) cc_library( name = "mlir_roundtrip_flags", srcs = ["translate/mlir_roundtrip_flags.cc"], hdrs = ["translate/mlir_roundtrip_flags.h"], + copts = ["-std=c++14"], deps = [ "//tensorflow/compiler/xla:status_macros", "//tensorflow/core:framework", @@ -307,6 +354,7 @@ cc_library( name = "convert_type", srcs = ["utils/convert_type.cc"], hdrs = ["utils/convert_type.h"], + copts = ["-std=c++14"], deps = [ ":tensorflow", ":tensorflow_dialect_registration", @@ -325,6 +373,7 @@ cc_library( name = "convert_tensor", srcs = ["utils/convert_tensor.cc"], hdrs = ["utils/convert_tensor.h"], + copts = ["-std=c++14"], deps = [ ":convert_type", ":mangling_util", @@ -344,6 +393,7 @@ cc_library( name = "mangling_util", srcs = ["utils/mangling_util.cc"], hdrs = 
["utils/mangling_util.h"], + copts = ["-std=c++14"], deps = [ "//tensorflow/core:framework", "//tensorflow/core:lib", @@ -356,6 +406,7 @@ cc_library( name = "error_util", srcs = ["utils/error_util.cc"], hdrs = ["utils/error_util.h"], + copts = ["-std=c++14"], deps = [ "//tensorflow/core:lib", "//tensorflow/stream_executor/lib", @@ -375,11 +426,11 @@ cc_library( "transforms/constant_fold.h", "transforms/decode_constant.h", ], + copts = ["-std=c++14"], deps = [ ":convert_tensor", ":eval_util", ":tensorflow", - ":tf_graph_optimization_pass", "//tensorflow/c:tf_status", "//tensorflow/c/eager:c_api", "//tensorflow/core:framework", @@ -396,6 +447,7 @@ cc_library( cc_library( name = "tf_dialect_lib", + copts = ["-std=c++14"], deps = [ ":tensorflow_dialect_registration", ":tf_dialect_passes", @@ -406,9 +458,11 @@ cc_library( cc_library( name = "tf_graph_optimization_pass", srcs = ["transforms/tf_graph_optimization_pass.cc"], + copts = ["-std=c++14"], deps = [ ":convert_graphdef", ":mlir_roundtrip_flags", + ":mlir_roundtrip_pass", "//tensorflow/compiler/tf2xla:functionalize_control_flow_pass_registration", "//tensorflow/core:core_cpu", "//tensorflow/core:framework", @@ -428,6 +482,7 @@ cc_library( name = "eval_util", srcs = ["utils/eval_util.cc"], hdrs = ["utils/eval_util.h"], + copts = ["-std=c++14"], deps = [ ":convert_tensor", ":convert_type", @@ -460,6 +515,7 @@ cc_library( hdrs = [ "translate/tf_mlir_translate.h", ], + copts = ["-std=c++14"], deps = [ ":convert_graphdef", ":error_util", @@ -486,6 +542,7 @@ cc_library( hdrs = [ "translate/tf_mlir_translate_cl.h", ], + copts = ["-std=c++14"], deps = [ "@llvm//:support", ], @@ -497,6 +554,7 @@ cc_library( srcs = [ "translate/tf_mlir_translate_registration.cc", ], + copts = ["-std=c++14"], deps = [ ":convert_graphdef", ":mlir_roundtrip_flags", diff --git a/tensorflow/compiler/mlir/tensorflow/g3doc/tf_ops.md b/tensorflow/compiler/mlir/tensorflow/g3doc/tf_ops.md deleted file mode 100755 index cedeba5dae1..00000000000 --- a/tensorflow/compiler/mlir/tensorflow/g3doc/tf_ops.md +++ /dev/null @@ -1,2761 +0,0 @@ - -# Operation definition -## tf.Abs (TF::AbsOp) -Computes the absolute value of a tensor. - -### Description: - -Given a tensor `x`, this operation returns a tensor containing the absolute -value of each element in `x`. For example, if x is an input element and y is -an output element, this operation computes \\(y = |x|\\). - -### Operands: -1. `x`: tensor of floating-point or 32/64-bit integer values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. `y`: tensor of floating-point or 32/64-bit integer values - -## tf.AddN (TF::AddNOp) -Add all input tensors element wise. - -### Description: - - -### Operands: -1. `inputs`: tensor of bfloat16 type or 16-bit float or 32-bit float or 64-bit float or 16-bit integer or 32-bit integer or 64-bit integer or 8-bit integer or complex128 type or complex64 type or TensorFlow variant type values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `N` | `IntegerAttr` | 64-bit integer attribute whose minimal value is 1 attribute | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. 
`sum`: tensor of bfloat16 type or 16-bit float or 32-bit float or 64-bit float or 16-bit integer or 32-bit integer or 64-bit integer or 8-bit integer or complex128 type or complex64 type or TensorFlow variant type values - -## tf.Add (TF::AddOp) -Returns x + y element-wise. - -### Description: - -*NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting -[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) - -### Operands: -1. `x`: tensor of number or TensorFlow string type values -1. `y`: tensor of number or TensorFlow string type values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. `z`: tensor of number or TensorFlow string type values - -## tf.AddV2 (TF::AddV2Op) -Returns x + y element-wise. - -### Description: - -*NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting -[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) - -### Operands: -1. `x`: tensor of number values -1. `y`: tensor of number values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. `z`: tensor of number values - -## tf.AvgPool (TF::AvgPoolOp) -Performs average pooling on the input. - -### Description: - -Each entry in `output` is the mean of the corresponding size `ksize` -window in `value`. - -### Operands: -1. `value`: tensor of floating-point values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `ksize` | `ArrayAttr` | 64-bit integer array attribute with at least 4 elements attribute | -| `strides` | `ArrayAttr` | 64-bit integer array attribute with at least 4 elements attribute | -| `padding` | `StringAttr` | string attribute whose value is SAME, or VALID attribute | -| `data_format` | `StringAttr` | 'NHWC' or 'NCHW' convnet data format attribute | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. `output`: tensor of floating-point values - -## tf.BatchToSpaceND (TF::BatchToSpaceNDOp) -BatchToSpace for N-D tensors of type T. - -### Description: - -This operation reshapes the "batch" dimension 0 into `M + 1` dimensions of shape -`block_shape + [batch]`, interleaves these blocks back into the grid defined by -the spatial dimensions `[1, ..., M]`, to obtain a result with the same rank as -the input. The spatial dimensions of this intermediate result are then -optionally cropped according to `crops` to produce the output. This is the -reverse of SpaceToBatch. See below for a precise description. - -### Operands: -1. `input`: tensor of tf.dtype values -1. `block_shape`: tensor of 32/64-bit integer values -1. `crops`: tensor of 32/64-bit integer values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `T` | `Attribute` | derived attribute attribute | -| `Tcrops` | `Attribute` | derived attribute attribute | -| `Tblock_shape` | `Attribute` | derived attribute attribute | - -### Results: -1. `output`: tensor of tf.dtype values - -## tf.BiasAdd (TF::BiasAddOp) -Adds `bias` to `value`. - -### Description: - -This is a special case of `tf.add` where `bias` is restricted to be 1-D. -Broadcasting is supported, so `value` may have any number of dimensions. - -### Operands: -1. `value`: tensor of number values -1. 
`bias`: tensor of number values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `data_format` | `StringAttr` | 'NHWC' or 'NCHW' convnet data format attribute | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. `output`: tensor of number values - -## tf.Bitcast (TF::BitcastOp) - -Bitcasts a tensor from one type to another without copying data. - - -### Description: - -Given a tensor `input`, this operation returns a tensor that has the same buffer -data as `input` with datatype `type`. - -If the input datatype `T` is larger than the output datatype `type` then the -shape changes from [...] to [..., sizeof(`T`)/sizeof(`type`)]. - -If `T` is smaller than `type`, the operator requires that the rightmost -dimension be equal to sizeof(`type`)/sizeof(`T`). The shape then goes from -[..., sizeof(`type`)/sizeof(`T`)] to [...]. - -tf.bitcast() and tf.cast() work differently when real dtype is casted as a complex dtype -(e.g. tf.complex64 or tf.complex128) as tf.cast() make imaginary part 0 while tf.bitcast() -gives module error. -For example, - -Example 1: -```python ->>> a = [1., 2., 3.] ->>> equality_bitcast = tf.bitcast(a,tf.complex128) -tensorflow.python.framework.errors_impl.InvalidArgumentError: Cannot bitcast from float to complex128: shape [3] [Op:Bitcast] ->>> equality_cast = tf.cast(a,tf.complex128) ->>> print(equality_cast) -tf.Tensor([1.+0.j 2.+0.j 3.+0.j], shape=(3,), dtype=complex128) -``` -Example 2: -```python ->>> tf.bitcast(tf.constant(0xffffffff, dtype=tf.uint32), tf.uint8) - -``` -Example 3: -```python ->>> x = [1., 2., 3.] ->>> y = [0., 2., 3.] ->>> equality= tf.equal(x,y) ->>> equality_cast = tf.cast(equality,tf.float32) ->>> equality_bitcast = tf.bitcast(equality_cast,tf.uint8) ->>> print(equality) -tf.Tensor([False True True], shape=(3,), dtype=bool) ->>> print(equality_cast) -tf.Tensor([0. 1. 1.], shape=(3,), dtype=float32) ->>> print(equality_bitcast) -tf.Tensor( -[[ 0 0 0 0] - [ 0 0 128 63] - [ 0 0 128 63]], shape=(3, 4), dtype=uint8) -``` - -*NOTE*: Bitcast is implemented as a low-level cast, so machines with different -endian orderings will give different results. - -### Operands: -1. `input`: tensor of number values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `T` | `Attribute` | derived attribute attribute | -| `type` | `Attribute` | derived attribute attribute | - -### Results: -1. `output`: tensor of number values - -## tf.BroadcastTo (TF::BroadcastToOp) -Broadcast an array for a compatible shape. - -### Description: - -Broadcasting is the process of making arrays to have compatible shapes -for arithmetic operations. Two shapes are compatible if for each -dimension pair they are either equal or one of them is one. When trying -to broadcast a Tensor to a shape, it starts with the trailing dimensions, -and works its way forward. - -For example, - -```python ->>> x = tf.constant([1, 2, 3]) ->>> y = tf.broadcast_to(x, [3, 3]) ->>> sess.run(y) -array([[1, 2, 3], - [1, 2, 3], - [1, 2, 3]], dtype=int32) -``` - -In the above example, the input Tensor with the shape of `[1, 3]` -is broadcasted to output Tensor with shape of `[3, 3]`. - -### Operands: -1. `input`: tensor of tf.dtype values -1. 
`shape`: tensor of 32/64-bit integer values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `T` | `Attribute` | derived attribute attribute | -| `Tidx` | `Attribute` | derived attribute attribute | - -### Results: -1. `output`: tensor of tf.dtype values - -## tf.Cast (TF::CastOp) -Cast x of type SrcT to y of DstT. - -### Description: - - -### Operands: -1. `x`: tensor of tf.dtype values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `Truncate` | `BoolAttr` | bool attribute attribute | -| `SrcT` | `Attribute` | derived attribute attribute | -| `DstT` | `Attribute` | derived attribute attribute | - -### Results: -1. `y`: tensor of tf.dtype values - -## tf.Ceil (TF::CeilOp) -Returns element-wise smallest integer not less than x. - -### Description: - - -### Operands: -1. `x`: tensor of floating-point values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. `y`: tensor of floating-point values - -## tf.Concat (TF::ConcatOp) -Concatenates tensors along one dimension. - -### Description: - - -### Operands: -1. `concat_dim`: tensor of 32-bit integer values -1. `values`: tensor of tf.dtype values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `N` | `IntegerAttr` | 64-bit integer attribute whose minimal value is 2 attribute | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. `output`: tensor of tf.dtype values - -## tf.ConcatV2 (TF::ConcatV2Op) -Concatenates tensors along one dimension. - -### Description: - - -### Operands: -1. `values`: tensor of tf.dtype values -1. `axis`: tensor of 32/64-bit integer values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `N` | `IntegerAttr` | 64-bit integer attribute whose minimal value is 2 attribute | -| `T` | `Attribute` | derived attribute attribute | -| `Tidx` | `Attribute` | derived attribute attribute | - -### Results: -1. `output`: tensor of tf.dtype values - -## tf.Conj (TF::ConjOp) -Returns the complex conjugate of a complex number. - -### Description: - -Given a tensor `input` of complex numbers, this operation returns a tensor of -complex numbers that are the complex conjugate of each element in `input`. The -complex numbers in `input` must be of the form \\(a + bj\\), where *a* is the -real part and *b* is the imaginary part. - -The complex conjugate returned by this operation is of the form \\(a - bj\\). - -For example: - -``` -# tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j] -tf.conj(input) ==> [-2.25 - 4.75j, 3.25 - 5.75j] -``` - -### Operands: -1. `input`: tensor of complex128 type or complex64 type or TensorFlow variant type values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. `output`: tensor of complex128 type or complex64 type or TensorFlow variant type values - -## tf.Const (TF::ConstOp) -Constant tensor op - -### Description: - - -### Operands: - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `value` | `ElementsAttr` | constant vector/tensor attribute attribute | -| `dtype` | `Attribute` | derived attribute attribute | - -### Results: -1. 
`output`: tensor of tf.dtype values - -## tf.Conv2D (TF::Conv2DOp) - -Computes a 2-D convolution given 4-D `input` and `filter` tensors. - - -### Description: - -Given an input tensor of shape `[batch, in_height, in_width, in_channels]` -and a filter / kernel tensor of shape -`[filter_height, filter_width, in_channels, out_channels]`, this op -performs the following: - -1. Flattens the filter to a 2-D matrix with shape - `[filter_height * filter_width * in_channels, output_channels]`. -2. Extracts image patches from the input tensor to form a *virtual* - tensor of shape `[batch, out_height, out_width, - filter_height * filter_width * in_channels]`. -3. For each patch, right-multiplies the filter matrix and the image patch - vector. - -In detail, with the default NHWC format, - - output[b, i, j, k] = - sum_{di, dj, q} input[b, strides[1] * i + di, strides[2] * j + dj, q] * - filter[di, dj, q, k] - -Must have `strides[0] = strides[3] = 1`. For the most common case of the same -horizontal and vertices strides, `strides = [1, stride, stride, 1]`. - -### Operands: -1. `input`: tensor of floating-point values -1. `filter`: tensor of floating-point values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `strides` | `ArrayAttr` | 64-bit integer array attribute attribute | -| `use_cudnn_on_gpu` | `BoolAttr` | bool attribute attribute | -| `padding` | `StringAttr` | string attribute whose value is SAME, or VALID, or EXPLICIT attribute | -| `explicit_paddings` | `ArrayAttr` | 64-bit integer array attribute attribute | -| `data_format` | `StringAttr` | 'NHWC' or 'NCHW' convnet data format attribute | -| `dilations` | `ArrayAttr` | 64-bit integer array attribute attribute | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. `output`: tensor of floating-point values - -## tf.Cos (TF::CosOp) -Computes cos of x element-wise. - -### Description: - - -### Operands: -1. `x`: tensor of floating-point or 64/128-bit complex type values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. `y`: tensor of floating-point or 64/128-bit complex type values - -## tf.DepthwiseConv2dNative (TF::DepthwiseConv2dNativeOp) - -Computes a 2-D depthwise convolution given 4-D `input` and `filter` tensors. - - -### Description: - -Given an input tensor of shape `[batch, in_height, in_width, in_channels]` -and a filter / kernel tensor of shape -`[filter_height, filter_width, in_channels, channel_multiplier]`, containing -`in_channels` convolutional filters of depth 1, `depthwise_conv2d` applies -a different filter to each input channel (expanding from 1 channel to -`channel_multiplier` channels for each), then concatenates the results -together. Thus, the output has `in_channels * channel_multiplier` channels. - -``` -for k in 0..in_channels-1 - for q in 0..channel_multiplier-1 - output[b, i, j, k * channel_multiplier + q] = - sum_{di, dj} input[b, strides[1] * i + di, strides[2] * j + dj, k] * - filter[di, dj, k, q] -``` - -Must have `strides[0] = strides[3] = 1`. For the most common case of the same -horizontal and vertices strides, `strides = [1, stride, stride, 1]`. - -### Operands: -1. `input`: tensor of floating-point values -1. 
`filter`: tensor of floating-point values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `strides` | `ArrayAttr` | 64-bit integer array attribute attribute | -| `padding` | `StringAttr` | string attribute whose value is SAME, or VALID attribute | -| `data_format` | `StringAttr` | 'NHWC' or 'NCHW' convnet data format attribute | -| `dilations` | `ArrayAttr` | 64-bit integer array attribute attribute | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. `output`: tensor of floating-point values - -## tf.Div (TF::DivOp) -Returns x / y element-wise. - -### Description: - -*NOTE*: `Div` supports broadcasting. More about broadcasting -[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) - -### Operands: -1. `x`: tensor of number values -1. `y`: tensor of number values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. `z`: tensor of number values - -## tf.Elu (TF::EluOp) - -Computes exponential linear: `exp(features) - 1` if < 0, `features` otherwise. - - -### Description: - -See [Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs) -](http://arxiv.org/abs/1511.07289) - -### Operands: -1. `features`: tensor of floating-point values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. `activations`: tensor of floating-point values - -## tf.Equal (TF::EqualOp) -Returns the truth value of (x == y) element-wise. - -### Description: - -*NOTE*: `Equal` supports broadcasting. More about broadcasting -[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) - -```python -x = tf.constant([2, 4]) -y = tf.constant(2) -tf.math.equal(x, y) ==> array([True, False]) - -x = tf.constant([2, 4]) -y = tf.constant([2, 4]) -tf.math.equal(x, y) ==> array([True, True]) -``` - -### Operands: -1. `x`: tensor of bfloat16 type or 16-bit float or 32-bit float or 64-bit float or 1-bit integer or 16-bit integer or 32-bit integer or 64-bit integer or 8-bit integer or complex128 type or complex64 type or TensorFlow string type values -1. `y`: tensor of bfloat16 type or 16-bit float or 32-bit float or 64-bit float or 1-bit integer or 16-bit integer or 32-bit integer or 64-bit integer or 8-bit integer or complex128 type or complex64 type or TensorFlow string type values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. `z`: tensor of 1-bit integer values - -## tf.ExpandDims (TF::ExpandDimsOp) -Inserts a dimension of 1 into a tensor's shape. - -### Description: - -Given a tensor `input`, this operation inserts a dimension of 1 at the -dimension index `axis` of `input`'s shape. The dimension index `axis` starts at -zero; if you specify a negative number for `axis` it is counted backward from -the end. - -This operation is useful if you want to add a batch dimension to a single -element. For example, if you have a single image of shape `[height, width, -channels]`, you can make it a batch of 1 image with `expand_dims(image, 0)`, -which will make the shape `[1, height, width, channels]`. 
- -Other examples: - -``` -# 't' is a tensor of shape [2] -shape(expand_dims(t, 0)) ==> [1, 2] -shape(expand_dims(t, 1)) ==> [2, 1] -shape(expand_dims(t, -1)) ==> [2, 1] - -# 't2' is a tensor of shape [2, 3, 5] -shape(expand_dims(t2, 0)) ==> [1, 2, 3, 5] -shape(expand_dims(t2, 2)) ==> [2, 3, 1, 5] -shape(expand_dims(t2, 3)) ==> [2, 3, 5, 1] -``` - -This operation requires that: - -`-1-input.dims() <= dim <= input.dims()` - -This operation is related to `squeeze()`, which removes dimensions of -size 1. - -### Operands: -1. `input`: tensor of tf.dtype values -1. `dim`: tensor of 32/64-bit integer values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `T` | `Attribute` | derived attribute attribute | -| `Tdim` | `Attribute` | derived attribute attribute | - -### Results: -1. `output`: tensor of tf.dtype values - -## tf.FakeQuantWithMinMaxArgs (TF::FakeQuantWithMinMaxArgsOp) - -Fake-quantize the 'inputs' tensor, type float to 'outputs' tensor of same type. - - -### Description: - -Attributes `[min; max]` define the clamping range for the `inputs` data. -`inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]` -when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and -then de-quantized and output as floats in `[min; max]` interval. -`num_bits` is the bitwidth of the quantization; between 2 and 16, inclusive. - -Before quantization, `min` and `max` values are adjusted with the following -logic. -It is suggested to have `min <= 0 <= max`. If `0` is not in the range of values, -the behavior can be unexpected: -If `0 < min < max`: `min_adj = 0` and `max_adj = max - min`. -If `min < max < 0`: `min_adj = min - max` and `max_adj = 0`. -If `min <= 0 <= max`: `scale = (max - min) / (2^num_bits - 1) `, -`min_adj = scale * round(min / scale)` and `max_adj = max + min_adj - min`. - -Quantization is called fake since the output is still in floating point. - -### Operands: -1. `inputs`: tensor of 32-bit float values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `min` | `FloatAttr` | 32-bit float attribute attribute | -| `max` | `FloatAttr` | 32-bit float attribute attribute | -| `num_bits` | `IntegerAttr` | 64-bit integer attribute attribute | -| `narrow_range` | `BoolAttr` | bool attribute attribute | - -### Results: -1. `outputs`: tensor of 32-bit float values - -## tf.FakeQuantWithMinMaxVars (TF::FakeQuantWithMinMaxVarsOp) - -Fake-quantize the 'inputs' tensor of type float via global float scalars `min` - - -### Description: - -and `max` to 'outputs' tensor of same shape as `inputs`. - -`[min; max]` define the clamping range for the `inputs` data. -`inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]` -when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and -then de-quantized and output as floats in `[min; max]` interval. -`num_bits` is the bitwidth of the quantization; between 2 and 16, inclusive. - -Before quantization, `min` and `max` values are adjusted with the following -logic. -It is suggested to have `min <= 0 <= max`. If `0` is not in the range of values, -the behavior can be unexpected: -If `0 < min < max`: `min_adj = 0` and `max_adj = max - min`. -If `min < max < 0`: `min_adj = min - max` and `max_adj = 0`. -If `min <= 0 <= max`: `scale = (max - min) / (2^num_bits - 1) `, -`min_adj = scale * round(min / scale)` and `max_adj = max + min_adj - min`. 
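For the common `min <= 0 <= max` case, the adjustment above can be written out as a short Python sketch (the sample values and `num_bits = 8` are illustrative, not taken from the op definition):

```python
# Sketch of the documented adjustment for the `min <= 0 <= max` case.
def adjust_range(range_min, range_max, num_bits=8):
    scale = (range_max - range_min) / (2 ** num_bits - 1)
    min_adj = scale * round(range_min / scale)   # snap min onto the quantization grid
    max_adj = range_max + min_adj - range_min    # shift max by the same amount
    return min_adj, max_adj

print(adjust_range(-0.9, 1.0))  # approx (-0.9016, 0.9984); the width max - min is preserved
```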
- -This operation has a gradient and thus allows for training `min` and `max` -values. - -### Operands: -1. `inputs`: tensor of 32-bit float values -1. `min`: tensor of 32-bit float values -1. `max`: tensor of 32-bit float values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `num_bits` | `IntegerAttr` | 64-bit integer attribute attribute | -| `narrow_range` | `BoolAttr` | bool attribute attribute | - -### Results: -1. `outputs`: tensor of 32-bit float values - -## tf.Fill (TF::FillOp) -Creates a tensor filled with a scalar value. - -### Description: - -This operation creates a tensor of shape `dims` and fills it with `value`. - -For example: - -``` -# Output tensor has shape [2, 3]. -fill([2, 3], 9) ==> [[9, 9, 9] - [9, 9, 9]] -``` - -`tf.fill` differs from `tf.constant` in a few ways: - -* `tf.fill` only supports scalar contents, whereas `tf.constant` supports - Tensor values. -* `tf.fill` creates an Op in the computation graph that constructs the actual - Tensor value at runtime. This is in contrast to `tf.constant` which embeds - the entire Tensor into the graph with a `Const` node. -* Because `tf.fill` evaluates at graph runtime, it supports dynamic shapes - based on other runtime Tensors, unlike `tf.constant`. - -### Operands: -1. `dims`: tensor of 32/64-bit integer values -1. `value`: tensor of tf.dtype values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `T` | `Attribute` | derived attribute attribute | -| `index_type` | `Attribute` | derived attribute attribute | - -### Results: -1. `output`: tensor of tf.dtype values - -## tf.FloorDiv (TF::FloorDivOp) -Returns x // y element-wise. - -### Description: - -*NOTE*: `FloorDiv` supports broadcasting. More about broadcasting -[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) - -### Operands: -1. `x`: tensor of number values -1. `y`: tensor of number values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. `z`: tensor of number values - -## tf.Floor (TF::FloorOp) -Returns element-wise largest integer not greater than x. - -### Description: - - -### Operands: -1. `x`: tensor of floating-point values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. `y`: tensor of floating-point values - -## tf.FusedBatchNorm (TF::FusedBatchNormOp) -Batch normalization. - -### Description: - -Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW". -The size of 1D Tensors matches the dimension C of the 4D Tensors. - -### Operands: -1. `x`: tensor of 32-bit float values -1. `scale`: tensor of 32-bit float values -1. `offset`: tensor of 32-bit float values -1. `mean`: tensor of 32-bit float values -1. `variance`: tensor of 32-bit float values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `epsilon` | `FloatAttr` | 32-bit float attribute attribute | -| `data_format` | `StringAttr` | 'NHWC' or 'NCHW' convnet data format attribute | -| `is_training` | `BoolAttr` | bool attribute attribute | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. `y`: tensor of 32-bit float values -1. `batch_mean`: tensor of 32-bit float values -1. `batch_variance`: tensor of 32-bit float values -1. 
`reserve_space_1`: tensor of 32-bit float values -1. `reserve_space_2`: tensor of 32-bit float values - -## tf.Gather (TF::GatherOp) -Gather slices from `params` according to `indices`. - -### Description: - -`indices` must be an integer tensor of any dimension (usually 0-D or 1-D). -Produces an output tensor with shape `indices.shape + params.shape[1:]` where: - -```python - # Scalar indices - output[:, ..., :] = params[indices, :, ... :] - - # Vector indices - output[i, :, ..., :] = params[indices[i], :, ... :] - - # Higher rank indices - output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :] -``` - -If `indices` is a permutation and `len(indices) == params.shape[0]` then -this operation will permute `params` accordingly. - -`validate_indices`: DEPRECATED. If this operation is assigned to CPU, values in -`indices` are always validated to be within range. If assigned to GPU, -out-of-bound indices result in safe but unspecified behavior, which may include -raising an error. - -
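The indexing rule above can be mirrored with plain Python lists (values are illustrative; `tf.gather` applies the same rule along the first dimension of a tensor):

```python
# output[i, ...] = params[indices[i], ...]
params = [[1, 2], [3, 4], [5, 6]]    # "shape" [3, 2]
indices = [2, 0, 2]                  # each entry indexes params along dimension 0

output = [params[i] for i in indices]
print(output)  # [[5, 6], [1, 2], [5, 6]] -> indices.shape + params.shape[1:] == [3, 2]
```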
- -### Operands: -1. `params`: tensor of tf.dtype values -1. `indices`: tensor of 32/64-bit integer values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `validate_indices` | `BoolAttr` | bool attribute attribute | -| `Tindices` | `Attribute` | derived attribute attribute | -| `Tparams` | `Attribute` | derived attribute attribute | - -### Results: -1. `output`: tensor of tf.dtype values - -## tf.GatherV2 (TF::GatherV2Op) - -Gather slices from `params` axis `axis` according to `indices`. - - -### Description: - -`indices` must be an integer tensor of any dimension (usually 0-D or 1-D). -Produces an output tensor with shape `params.shape[:axis] + indices.shape + -params.shape[axis + 1:]` where: - -```python - # Scalar indices (output is rank(params) - 1). - output[a_0, ..., a_n, b_0, ..., b_n] = - params[a_0, ..., a_n, indices, b_0, ..., b_n] - - # Vector indices (output is rank(params)). - output[a_0, ..., a_n, i, b_0, ..., b_n] = - params[a_0, ..., a_n, indices[i], b_0, ..., b_n] - - # Higher rank indices (output is rank(params) + rank(indices) - 1). - output[a_0, ..., a_n, i, ..., j, b_0, ... b_n] = - params[a_0, ..., a_n, indices[i, ..., j], b_0, ..., b_n] -``` - -
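A minimal sketch of the `axis` behaviour via the Python-level `tf.gather` wrapper (illustrative values; assumes a TensorFlow build in which `tf.gather` exposes the `axis` argument):

```python
import tensorflow as tf

params = tf.constant([[1, 2, 3],
                      [4, 5, 6]])
# axis=1 gathers columns: output shape is params.shape[:1] + indices.shape.
cols = tf.gather(params, indices=[2, 0], axis=1)
# cols == [[3, 1], [6, 4]]
```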
- -Note that on CPU, if an out of bound index is found, an error is returned. -On GPU, if an out of bound index is found, a 0 is stored in the -corresponding output value. - -See also `tf.batch_gather` and `tf.gather_nd`. - -### Operands: -1. `params`: tensor of tf.dtype values -1. `indices`: tensor of 32/64-bit integer values -1. `axis`: tensor of 32/64-bit integer values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `batch_dims` | `IntegerAttr` | 64-bit integer attribute attribute | -| `Tindices` | `Attribute` | derived attribute attribute | -| `Tparams` | `Attribute` | derived attribute attribute | -| `Taxis` | `Attribute` | derived attribute attribute | - -### Results: -1. `output`: tensor of tf.dtype values - -## tf.GreaterEqual (TF::GreaterEqualOp) -Returns the truth value of (x >= y) element-wise. - -### Description: - -*NOTE*: `GreaterEqual` supports broadcasting. More about broadcasting -[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) - -### Operands: -1. `x`: tensor of 8/16/32/64-bit integer or floating-point values -1. `y`: tensor of 8/16/32/64-bit integer or floating-point values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. `z`: tensor of 1-bit integer values - -## tf.Greater (TF::GreaterOp) -Returns the truth value of (x > y) element-wise. - -### Description: - -*NOTE*: `Greater` supports broadcasting. More about broadcasting -[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) - -### Operands: -1. `x`: tensor of 8/16/32/64-bit integer or floating-point values -1. `y`: tensor of 8/16/32/64-bit integer or floating-point values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. `z`: tensor of 1-bit integer values - -## tf.IdentityN (TF::IdentityNOp) - -Returns a list of tensors with the same shapes and contents as the input - - -### Description: - -tensors. - -This op can be used to override the gradient for complicated functions. For -example, suppose y = f(x) and we wish to apply a custom function g for backprop -such that dx = g(dy). In Python, - -```python -with tf.get_default_graph().gradient_override_map( - {'IdentityN': 'OverrideGradientWithG'}): - y, _ = identity_n([f(x), x]) - -@tf.RegisterGradient('OverrideGradientWithG') -def ApplyG(op, dy, _): - return [None, g(dy)] # Do not backprop to f(x). -``` - -### Operands: -1. `input`: tensor of tf.dtype values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. `output`: tensor of tf.dtype values - -## tf.Identity (TF::IdentityOp) -Identity op - -### Description: - -Returns a tensor with the same shape and contents as input. - -### Operands: -1. `input`: tensor of tf.dtype values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. `output`: tensor of tf.dtype values - -## tf.Invert (TF::InvertOp) - -Invert (flip) each bit of supported types; for example, type `uint8` value 01010101 becomes 10101010. - - -### Description: - -Flip each bit of supported types. 
For example, type `int8` (decimal 2) binary 00000010 becomes (decimal -3) binary 11111101. -This operation is performed on each element of the tensor argument `x`. - -### Operands: -1. `x`: tensor of 8/16/32/64-bit integer values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. `y`: tensor of 8/16/32/64-bit integer values - -## tf.LeakyRelu (TF::LeakyReluOp) -Computes rectified linear: `max(features, features * alpha)`. - -### Description: - - -### Operands: -1. `features`: tensor of floating-point values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `alpha` | `FloatAttr` | 32-bit float attribute attribute | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. `activations`: tensor of floating-point values - -## tf.LessEqual (TF::LessEqualOp) -Returns the truth value of (x <= y) element-wise. - -### Description: - -*NOTE*: `LessEqual` supports broadcasting. More about broadcasting -[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) - -### Operands: -1. `x`: tensor of 8/16/32/64-bit integer or floating-point values -1. `y`: tensor of 8/16/32/64-bit integer or floating-point values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. `z`: tensor of 1-bit integer values - -## tf.Less (TF::LessOp) -Returns the truth value of (x < y) element-wise. - -### Description: - -*NOTE*: `Less` supports broadcasting. More about broadcasting -[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) - -### Operands: -1. `x`: tensor of 8/16/32/64-bit integer or floating-point values -1. `y`: tensor of 8/16/32/64-bit integer or floating-point values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. `z`: tensor of 1-bit integer values - -## tf.Log (TF::LogOp) -Computes natural logarithm of x element-wise. - -### Description: - -I.e., \\(y = \log_e x\\). - -### Operands: -1. `x`: tensor of floating-point or 64/128-bit complex type values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. `y`: tensor of floating-point or 64/128-bit complex type values - -## tf.LogSoftmax (TF::LogSoftmaxOp) -Computes log softmax activations. - -### Description: - -For each batch `i` and class `j` we have - - logsoftmax[i, j] = logits[i, j] - log(sum(exp(logits[i]))) - -### Operands: -1. `logits`: tensor of floating-point values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. `logsoftmax`: tensor of floating-point values - -## tf.LogicalAnd (TF::LogicalAndOp) -Returns the truth value of x AND y element-wise. - -### Description: - -*NOTE*: `LogicalAnd` supports broadcasting. More about broadcasting -[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) - -### Operands: -1. `x`: tensor of 1-bit integer values -1. `y`: tensor of 1-bit integer values - -### Attributes: - -### Results: -1. 
`z`: tensor of 1-bit integer values - -## tf.LogicalNot (TF::LogicalNotOp) -Returns the truth value of NOT x element-wise. - -### Description: - - -### Operands: -1. `x`: tensor of 1-bit integer values - -### Attributes: - -### Results: -1. `y`: tensor of 1-bit integer values - -## tf.LogicalOr (TF::LogicalOrOp) -Returns the truth value of x OR y element-wise. - -### Description: - -*NOTE*: `LogicalOr` supports broadcasting. More about broadcasting -[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) - -### Operands: -1. `x`: tensor of 1-bit integer values -1. `y`: tensor of 1-bit integer values - -### Attributes: - -### Results: -1. `z`: tensor of 1-bit integer values - -## tf.MatMul (TF::MatMulOp) - -Multiply the matrix "a" by the matrix "b". - - -### Description: - -The inputs must be two-dimensional matrices and the inner dimension of -"a" (after being transposed if transpose_a is true) must match the -outer dimension of "b" (after being transposed if transposed_b is -true). - -*Note*: The default kernel implementation for MatMul on GPUs uses -cublas. - -### Operands: -1. `a`: tensor of bfloat16 type or 16-bit float or 32-bit float or 64-bit float or 32-bit integer or 64-bit integer or complex128 type or complex64 type values -1. `b`: tensor of bfloat16 type or 16-bit float or 32-bit float or 64-bit float or 32-bit integer or 64-bit integer or complex128 type or complex64 type values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `transpose_a` | `BoolAttr` | bool attribute attribute | -| `transpose_b` | `BoolAttr` | bool attribute attribute | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. `product`: tensor of bfloat16 type or 16-bit float or 32-bit float or 64-bit float or 32-bit integer or 64-bit integer or complex128 type or complex64 type values - -## tf.Max (TF::MaxOp) - -Computes the maximum of elements across dimensions of a tensor. - - -### Description: - -Reduces `input` along the dimensions given in `axis`. Unless -`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in -`axis`. If `keep_dims` is true, the reduced dimensions are -retained with length 1. - -### Operands: -1. `input`: tensor of number values -1. `reduction_indices`: tensor of 32/64-bit integer values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `keep_dims` | `BoolAttr` | bool attribute attribute | -| `T` | `Attribute` | derived attribute attribute | -| `Tidx` | `Attribute` | derived attribute attribute | - -### Results: -1. `output`: tensor of number values - -## tf.MaxPool (TF::MaxPoolOp) -Performs max pooling on the input. - -### Description: - - -### Operands: -1. `input`: tensor of 8/16/32/64-bit integer or floating-point values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `ksize` | `ArrayAttr` | 64-bit integer array attribute with at least 4 elements attribute | -| `strides` | `ArrayAttr` | 64-bit integer array attribute with at least 4 elements attribute | -| `padding` | `StringAttr` | string attribute whose value is SAME, or VALID attribute | -| `data_format` | `StringAttr` | string attribute whose value is NHWC, or NCHW, or NCHW_VECT_C attribute | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. `output`: tensor of 8/16/32/64-bit integer or floating-point values - -## tf.Maximum (TF::MaximumOp) -Returns the max of x and y (i.e. x > y ? 
x : y) element-wise. - -### Description: - -*NOTE*: `Maximum` supports broadcasting. More about broadcasting -[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) - -### Operands: -1. `x`: tensor of floating-point or 32/64-bit integer values -1. `y`: tensor of floating-point or 32/64-bit integer values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. `z`: tensor of floating-point or 32/64-bit integer values - -## tf.Mean (TF::MeanOp) -Computes the mean of elements across dimensions of a tensor. - -### Description: - -Reduces `input` along the dimensions given in `axis`. Unless -`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in -`axis`. If `keep_dims` is true, the reduced dimensions are -retained with length 1. - -### Operands: -1. `input`: tensor of number values -1. `reduction_indices`: tensor of 32/64-bit integer values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `keep_dims` | `BoolAttr` | bool attribute attribute | -| `T` | `Attribute` | derived attribute attribute | -| `Tidx` | `Attribute` | derived attribute attribute | - -### Results: -1. `output`: tensor of number values - -## tf.Min (TF::MinOp) - -Computes the minimum of elements across dimensions of a tensor. - - -### Description: - -Reduces `input` along the dimensions given in `axis`. Unless -`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in -`axis`. If `keep_dims` is true, the reduced dimensions are -retained with length 1. - -### Operands: -1. `input`: tensor of number values -1. `reduction_indices`: tensor of 32/64-bit integer values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `keep_dims` | `BoolAttr` | bool attribute attribute | -| `T` | `Attribute` | derived attribute attribute | -| `Tidx` | `Attribute` | derived attribute attribute | - -### Results: -1. `output`: tensor of number values - -## tf.Minimum (TF::MinimumOp) -Returns the min of x and y (i.e. x < y ? x : y) element-wise. - -### Description: - -*NOTE*: `Minimum` supports broadcasting. More about broadcasting -[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) - -### Operands: -1. `x`: tensor of floating-point or 32/64-bit integer values -1. `y`: tensor of floating-point or 32/64-bit integer values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. `z`: tensor of floating-point or 32/64-bit integer values - -## tf.MulNoNan (TF::MulNoNanOp) - -Returns x * y element-wise. Returns zero if y is zero, even if x if infinite or NaN. - - -### Description: - -*NOTE*: `MulNoNan` supports broadcasting. More about broadcasting -[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) - -### Operands: -1. `x`: tensor of 16-bit float or 32-bit float or 64-bit float or complex128 type or complex64 type values -1. `y`: tensor of 16-bit float or 32-bit float or 64-bit float or complex128 type or complex64 type values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. 
`z`: tensor of 16-bit float or 32-bit float or 64-bit float or complex128 type or complex64 type values - -## tf.Mul (TF::MulOp) -Returns x * y element-wise. - -### Description: - -*NOTE*: `Multiply` supports broadcasting. More about broadcasting -[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) - -### Operands: -1. `x`: tensor of number values -1. `y`: tensor of number values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. `z`: tensor of number values - -## tf.Neg (TF::NegOp) -Computes numerical negative value element-wise. - -### Description: - -I.e., \\(y = -x\\). - -### Operands: -1. `x`: tensor of bfloat16 type or 16-bit float or 32-bit float or 64-bit float or 32-bit integer or 64-bit integer or complex128 type or complex64 type values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. `y`: tensor of bfloat16 type or 16-bit float or 32-bit float or 64-bit float or 32-bit integer or 64-bit integer or complex128 type or complex64 type values - -## tf.NoOp (TF::NoOp) -Does nothing. Only useful as a placeholder for control edges. - -### Description: - - -### Operands: - -### Attributes: - -### Results: - -## tf.NotEqual (TF::NotEqualOp) -Returns the truth value of (x != y) element-wise. - -### Description: - -*NOTE*: `NotEqual` supports broadcasting. More about broadcasting -[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) - -### Operands: -1. `x`: tensor of bfloat16 type or 16-bit float or 32-bit float or 64-bit float or 1-bit integer or 16-bit integer or 32-bit integer or 64-bit integer or 8-bit integer or complex128 type or complex64 type or TensorFlow string type values -1. `y`: tensor of bfloat16 type or 16-bit float or 32-bit float or 64-bit float or 1-bit integer or 16-bit integer or 32-bit integer or 64-bit integer or 8-bit integer or complex128 type or complex64 type or TensorFlow string type values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. `z`: tensor of 1-bit integer values - -## tf.Pack (TF::PackOp) - -Packs a list of `N` rank-`R` tensors into one rank-`(R+1)` tensor. - - -### Description: - -Packs the `N` tensors in `values` into a tensor with rank one higher than each -tensor in `values`, by packing them along the `axis` dimension. -Given a list of tensors of shape `(A, B, C)`; - -if `axis == 0` then the `output` tensor will have the shape `(N, A, B, C)`. -if `axis == 1` then the `output` tensor will have the shape `(A, N, B, C)`. -Etc. - -For example: - -``` -# 'x' is [1, 4] -# 'y' is [2, 5] -# 'z' is [3, 6] -pack([x, y, z]) => [[1, 4], [2, 5], [3, 6]] # Pack along first dim. -pack([x, y, z], axis=1) => [[1, 2, 3], [4, 5, 6]] -``` - -This is the opposite of `unpack`. - -### Operands: -1. `values`: tensor of tf.dtype values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `N` | `IntegerAttr` | 64-bit integer attribute whose minimal value is 1 attribute | -| `axis` | `IntegerAttr` | 64-bit integer attribute attribute | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. `output`: tensor of tf.dtype values - -## tf.Pad (TF::PadOp) -Pads a tensor with zeros. 
- -### Description: - -This operation pads a `input` with zeros according to the `paddings` you -specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is the -rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates -how many zeros to add before the contents of `input` in that dimension, and -`paddings[D, 1]` indicates how many zeros to add after the contents of `input` -in that dimension. - -The padded size of each dimension D of the output is: - -`paddings(D, 0) + input.dim_size(D) + paddings(D, 1)` - -For example: - -``` -# 't' is [[1, 1], [2, 2]] -# 'paddings' is [[1, 1], [2, 2]] -# rank of 't' is 2 -pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0] - [0, 0, 1, 1, 0, 0] - [0, 0, 2, 2, 0, 0] - [0, 0, 0, 0, 0, 0]] -``` - -### Operands: -1. `input`: tensor of tf.dtype values -1. `paddings`: tensor of 32/64-bit integer values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `T` | `Attribute` | derived attribute attribute | -| `Tpaddings` | `Attribute` | derived attribute attribute | - -### Results: -1. `output`: tensor of tf.dtype values - -## tf.PadV2 (TF::PadV2Op) -Pads a tensor. - -### Description: - -This operation pads `input` according to the `paddings` and `constant_values` -you specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is -the rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates -how many padding values to add before the contents of `input` in that dimension, -and `paddings[D, 1]` indicates how many padding values to add after the contents -of `input` in that dimension. `constant_values` is a scalar tensor of the same -type as `input` that indicates the value to use for padding `input`. - -The padded size of each dimension D of the output is: - -`paddings(D, 0) + input.dim_size(D) + paddings(D, 1)` - -For example: - -``` -# 't' is [[1, 1], [2, 2]] -# 'paddings' is [[1, 1], [2, 2]] -# 'constant_values' is 0 -# rank of 't' is 2 -pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0] - [0, 0, 1, 1, 0, 0] - [0, 0, 2, 2, 0, 0] - [0, 0, 0, 0, 0, 0]] -``` - -### Operands: -1. `input`: tensor of tf.dtype values -1. `paddings`: tensor of 32/64-bit integer values -1. `constant_values`: tensor of tf.dtype values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `T` | `Attribute` | derived attribute attribute | -| `Tpaddings` | `Attribute` | derived attribute attribute | - -### Results: -1. `output`: tensor of tf.dtype values - -## tf.Placeholder.input (TF::PlaceholderInputOp) -PlaceholderInput op - -### Description: - -Inserts a placeholder for a tensor that will be always fed. - -### Operands: -1. `arg`: tensor of tf.dtype values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `min` | `FloatAttr` | 32-bit float attribute attribute | -| `max` | `FloatAttr` | 32-bit float attribute attribute | -| `type` | `TypeAttr` | integer type attribute | -| `dtype` | `Attribute` | derived attribute attribute | - -### Results: -1. `output`: tensor of tf.dtype values - -## tf.Placeholder (TF::PlaceholderOp) -Placeholder op - -### Description: - -Inserts a placeholder for a tensor that will be always fed. - -### Operands: - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `dtype` | `Attribute` | derived attribute attribute | - -### Results: -1. 
`output`: tensor of tf.dtype values - -## tf.QuantizeAndDequantize (TF::QuantizeAndDequantizeOp) -Use QuantizeAndDequantizeV2 instead. - -### Description: - - -### Operands: -1. `input`: tensor of floating-point values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `signed_input` | `BoolAttr` | bool attribute attribute | -| `num_bits` | `IntegerAttr` | 64-bit integer attribute attribute | -| `range_given` | `BoolAttr` | bool attribute attribute | -| `input_min` | `FloatAttr` | 32-bit float attribute attribute | -| `input_max` | `FloatAttr` | 32-bit float attribute attribute | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. `output`: tensor of floating-point values - -## tf.QuantizeAndDequantizeV2 (TF::QuantizeAndDequantizeV2Op) -Quantizes then dequantizes a tensor. - -### Description: - -This op simulates the precision loss from the quantized forward pass by: - -1. Quantizing the tensor to fixed point numbers, which should match the target - quantization method when it is used in inference. -2. Dequantizing it back to floating point numbers for the following ops, most - likely matmul. - -There are different ways to quantize. This version uses only scaling, so 0.0 -maps to 0. - -From the specified 'num_bits' in the quantized output type, it determines -minimum and maximum representable quantized values. - -e.g. - -* [-128, 127] for signed, num_bits = 8, or -* [0, 255] for unsigned, num_bits = 8. - -If range_given == False, the initial input_min, input_max will be determined -automatically as the minimum and maximum values in the input tensor, otherwise -the specified values of input_min, input_max are used. - -Note: If the input_min, input_max are specified, they do not need to equal the -actual minimum and maximum values in the tensor. e.g. in some cases it may be -beneficial to specify these values such that the low probability extremes of the -input distribution are clipped. - -This op determines the maximum scale_factor that would map the initial -[input_min, input_max] range to a range that lies within the representable -quantized range. - -It determines the scale from one of input_min and input_max, then updates the -other one to maximize the respresentable range. - -e.g. - -* if the output is signed, num_bits = 8, [input_min, input_max] = [-10.0, - 5.0]: it would use a scale_factor of -128 / -10.0 = 12.8 In this case, it - would update input_max to be 127 / 12.8 = 9.921875 -* if the output is signed, num_bits = 8, [input_min, input_max] = [-10.0, - 10.0]: it would use a scale_factor of 127 / 10.0 = 12.7 In this case, it - would update input_min to be 128.0 / 12.7 = -10.07874 -* if the output is unsigned, input_min is forced to be 0, and only the - specified input_max is used. - -After determining the scale_factor and updating the input range, it applies the -following to each value in the 'input' tensor. - -output = round(clamp(value, input_min, input_max) * scale_factor) / scale_factor. - -The above round function rounds the value based on the given round_mode. - -### Operands: -1. `input`: tensor of floating-point values -1. `input_min`: tensor of floating-point values -1. 
`input_max`: tensor of floating-point values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `signed_input` | `BoolAttr` | bool attribute attribute | -| `num_bits` | `IntegerAttr` | 64-bit integer attribute attribute | -| `range_given` | `BoolAttr` | bool attribute attribute | -| `round_mode` | `StringAttr` | string attribute whose value is HALF_TO_EVEN, or HALF_UP attribute | -| `narrow_range` | `BoolAttr` | bool attribute attribute | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. `output`: tensor of floating-point values - -## tf.QuantizeAndDequantizeV3 (TF::QuantizeAndDequantizeV3Op) -Quantizes then dequantizes a tensor. - -### Description: - -This is almost identical to QuantizeAndDequantizeV2, except that num_bits is a -tensor, so its value can change during training. - -### Operands: -1. `input`: tensor of floating-point values -1. `input_min`: tensor of floating-point values -1. `input_max`: tensor of floating-point values -1. `num_bits`: tensor of 32-bit integer values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `signed_input` | `BoolAttr` | bool attribute attribute | -| `range_given` | `BoolAttr` | bool attribute attribute | -| `narrow_range` | `BoolAttr` | bool attribute attribute | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. `output`: tensor of floating-point values - -## tf.RandomUniform (TF::RandomUniformOp) -Outputs random values from a uniform distribution. - -### Description: - -The generated values follow a uniform distribution in the range `[0, 1)`. The -lower bound 0 is included in the range, while the upper bound 1 is excluded. - -### Operands: -1. `shape`: tensor of 32/64-bit integer values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `seed` | `IntegerAttr` | 64-bit integer attribute attribute | -| `seed2` | `IntegerAttr` | 64-bit integer attribute attribute | -| `T` | `Attribute` | derived attribute attribute | -| `dtype` | `Attribute` | derived attribute attribute | - -### Results: -1. `output`: tensor of floating-point values - -## tf.Range (TF::RangeOp) -Creates a sequence of numbers. - -### Description: - -This operation creates a sequence of numbers that begins at `start` and -extends by increments of `delta` up to but not including `limit`. - -For example: - -``` -# 'start' is 3 -# 'limit' is 18 -# 'delta' is 3 -tf.range(start, limit, delta) ==> [3, 6, 9, 12, 15] -``` - -### Operands: -1. `start`: tensor of bfloat16 type or 32-bit float or 64-bit float or 32-bit integer or 64-bit integer values -1. `limit`: tensor of bfloat16 type or 32-bit float or 64-bit float or 32-bit integer or 64-bit integer values -1. `delta`: tensor of bfloat16 type or 32-bit float or 64-bit float or 32-bit integer or 64-bit integer values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `Tidx` | `Attribute` | derived attribute attribute | - -### Results: -1. `output`: tensor of bfloat16 type or 32-bit float or 64-bit float or 32-bit integer or 64-bit integer values - -## tf.Rank (TF::RankOp) -Returns the rank of a tensor. - -### Description: - -This operation returns an integer representing the rank of `input`. 
- -For example: - -``` -# 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]] -# shape of tensor 't' is [2, 2, 3] -rank(t) ==> 3 -``` - -**Note**: The rank of a tensor is not the same as the rank of a matrix. The rank -of a tensor is the number of indices required to uniquely select each element -of the tensor. Rank is also known as "order", "degree", or "ndims." - -### Operands: -1. `input`: tensor of tf.dtype values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. `output`: tensor of 32-bit integer values - -## tf.RealDiv (TF::RealDivOp) -Returns x / y element-wise for real types. - -### Description: - -If `x` and `y` are reals, this will return the floating-point division. - -*NOTE*: `Div` supports broadcasting. More about broadcasting -[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) - -### Operands: -1. `x`: tensor of number values -1. `y`: tensor of number values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. `z`: tensor of number values - -## tf.Reciprocal (TF::ReciprocalOp) -Computes the reciprocal of x element-wise. - -### Description: - -I.e., \\(y = 1 / x\\). - -### Operands: -1. `x`: tensor of bfloat16 type or 16-bit float or 32-bit float or 64-bit float or 32-bit integer or 64-bit integer or complex128 type or complex64 type values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. `y`: tensor of bfloat16 type or 16-bit float or 32-bit float or 64-bit float or 32-bit integer or 64-bit integer or complex128 type or complex64 type values - -## tf.Relu6 (TF::Relu6Op) -Computes rectified linear 6: `min(max(features, 0), 6)`. - -### Description: - - -### Operands: -1. `features`: tensor of 8/16/32/64-bit integer or floating-point values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. `activations`: tensor of 8/16/32/64-bit integer or floating-point values - -## tf.Relu (TF::ReluOp) -Computes rectified linear: `max(features, 0)`. - -### Description: - - -### Operands: -1. `features`: tensor of 8/16/32/64-bit integer or floating-point values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. `activations`: tensor of 8/16/32/64-bit integer or floating-point values - -## tf.Reshape (TF::ReshapeOp) -Reshapes a tensor. - -### Description: - -Given `tensor`, this operation returns a tensor that has the same values -as `tensor` with shape `shape`. - -If one component of `shape` is the special value -1, the size of that dimension -is computed so that the total size remains constant. In particular, a `shape` -of `[-1]` flattens into 1-D. At most one component of `shape` can be -1. - -If `shape` is 1-D or higher, then the operation returns a tensor with shape -`shape` filled with the values of `tensor`. In this case, the number of elements -implied by `shape` must be the same as the number of elements in `tensor`. 
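The `-1` inference described above is just a division: the total element count divided by the product of the explicitly given dimensions. A hedged sketch of that bookkeeping in plain Python (illustrative only, not the kernel implementation):

```python
from functools import reduce
from operator import mul

def resolve_reshape(input_shape, target_shape):
    """Resolve a single -1 entry in `target_shape` for a tensor of `input_shape`."""
    assert target_shape.count(-1) == 1, "exactly one component must be -1 here"
    total = reduce(mul, input_shape, 1)
    known = reduce(mul, (d for d in target_shape if d != -1), 1)
    assert total % known == 0, "element counts must be compatible"
    return [total // known if d == -1 else d for d in target_shape]

print(resolve_reshape([3, 2, 3], [2, -1]))  # [2, 9]  (-1 inferred to be 9)
print(resolve_reshape([3, 2, 3], [-1, 9]))  # [2, 9]  (-1 inferred to be 2)
```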
- -For example: - -``` -# tensor 't' is [1, 2, 3, 4, 5, 6, 7, 8, 9] -# tensor 't' has shape [9] -reshape(t, [3, 3]) ==> [[1, 2, 3], - [4, 5, 6], - [7, 8, 9]] - -# tensor 't' is [[[1, 1], [2, 2]], -# [[3, 3], [4, 4]]] -# tensor 't' has shape [2, 2, 2] -reshape(t, [2, 4]) ==> [[1, 1, 2, 2], - [3, 3, 4, 4]] - -# tensor 't' is [[[1, 1, 1], -# [2, 2, 2]], -# [[3, 3, 3], -# [4, 4, 4]], -# [[5, 5, 5], -# [6, 6, 6]]] -# tensor 't' has shape [3, 2, 3] -# pass '[-1]' to flatten 't' -reshape(t, [-1]) ==> [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6] - -# -1 can also be used to infer the shape - -# -1 is inferred to be 9: -reshape(t, [2, -1]) ==> [[1, 1, 1, 2, 2, 2, 3, 3, 3], - [4, 4, 4, 5, 5, 5, 6, 6, 6]] -# -1 is inferred to be 2: -reshape(t, [-1, 9]) ==> [[1, 1, 1, 2, 2, 2, 3, 3, 3], - [4, 4, 4, 5, 5, 5, 6, 6, 6]] -# -1 is inferred to be 3: -reshape(t, [ 2, -1, 3]) ==> [[[1, 1, 1], - [2, 2, 2], - [3, 3, 3]], - [[4, 4, 4], - [5, 5, 5], - [6, 6, 6]]] - -# tensor 't' is [7] -# shape `[]` reshapes to a scalar -reshape(t, []) ==> 7 -``` - -### Operands: -1. `tensor`: tensor of tf.dtype values -1. `shape`: tensor of 32/64-bit integer values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `T` | `Attribute` | derived attribute attribute | -| `Tshape` | `Attribute` | derived attribute attribute | - -### Results: -1. `output`: tensor of tf.dtype values - -## tf.ResizeBilinear (TF::ResizeBilinearOp) -Resize `images` to `size` using bilinear interpolation. - -### Description: - -Input images can be of different types but output images are always float. - -### Operands: -1. `images`: tensor of 8/16/32/64-bit integer or floating-point values -1. `size`: tensor of 32-bit integer values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `align_corners` | `BoolAttr` | bool attribute attribute | -| `half_pixel_centers` | `BoolAttr` | bool attribute attribute | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. `resized_images`: tensor of 32-bit float values - -## tf.ReverseV2 (TF::ReverseV2Op) -Reverses specific dimensions of a tensor. - -### Description: - -NOTE `tf.reverse` has now changed behavior in preparation for 1.0. -`tf.reverse_v2` is currently an alias that will be deprecated before TF 1.0. - -Given a `tensor`, and a `int32` tensor `axis` representing the set of -dimensions of `tensor` to reverse. This operation reverses each dimension -`i` for which there exists `j` s.t. `axis[j] == i`. - -`tensor` can have up to 8 dimensions. The number of dimensions specified -in `axis` may be 0 or more entries. If an index is specified more than -once, a InvalidArgument error is raised. - -For example: - -``` -# tensor 't' is [[[[ 0, 1, 2, 3], -# [ 4, 5, 6, 7], -# [ 8, 9, 10, 11]], -# [[12, 13, 14, 15], -# [16, 17, 18, 19], -# [20, 21, 22, 23]]]] -# tensor 't' shape is [1, 2, 3, 4] - -# 'dims' is [3] or 'dims' is [-1] -reverse(t, dims) ==> [[[[ 3, 2, 1, 0], - [ 7, 6, 5, 4], - [ 11, 10, 9, 8]], - [[15, 14, 13, 12], - [19, 18, 17, 16], - [23, 22, 21, 20]]]] - -# 'dims' is '[1]' (or 'dims' is '[-3]') -reverse(t, dims) ==> [[[[12, 13, 14, 15], - [16, 17, 18, 19], - [20, 21, 22, 23] - [[ 0, 1, 2, 3], - [ 4, 5, 6, 7], - [ 8, 9, 10, 11]]]] - -# 'dims' is '[2]' (or 'dims' is '[-2]') -reverse(t, dims) ==> [[[[8, 9, 10, 11], - [4, 5, 6, 7], - [0, 1, 2, 3]] - [[20, 21, 22, 23], - [16, 17, 18, 19], - [12, 13, 14, 15]]]] -``` - -### Operands: -1. 
`tensor`: tensor of bfloat16 type or 16-bit float or 32-bit float or 64-bit float or 1-bit integer or 16-bit integer or 32-bit integer or 64-bit integer or 8-bit integer or complex128 type or complex64 type or TensorFlow string type values -1. `axis`: tensor of 32/64-bit integer values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `T` | `Attribute` | derived attribute attribute | -| `Tidx` | `Attribute` | derived attribute attribute | - -### Results: -1. `output`: tensor of bfloat16 type or 16-bit float or 32-bit float or 64-bit float or 1-bit integer or 16-bit integer or 32-bit integer or 64-bit integer or 8-bit integer or complex128 type or complex64 type or TensorFlow string type values - -## tf.Rsqrt (TF::RsqrtOp) -Computes reciprocal of square root of x element-wise. - -### Description: - -I.e., \\(y = 1 / \sqrt{x}\\). - -### Operands: -1. `x`: tensor of floating-point or 64/128-bit complex type values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. `y`: tensor of floating-point or 64/128-bit complex type values - -## tf.Select (TF::SelectOp) -Selects elements from `x` or `y`, depending on `condition`. - -### Description: - -The `x`, and `y` tensors must all have the same shape, and the -output will also have that shape. - -The `condition` tensor must be a scalar if `x` and `y` are scalars. -If `x` and `y` are vectors or higher rank, then `condition` must be either a -scalar, a vector with size matching the first dimension of `x`, or must have -the same shape as `x`. - -The `condition` tensor acts as a mask that chooses, based on the value at each -element, whether the corresponding element / row in the output should be -taken from `x` (if true) or `y` (if false). - -If `condition` is a vector and `x` and `y` are higher rank matrices, then -it chooses which row (outer dimension) to copy from `x` and `y`. -If `condition` has the same shape as `x` and `y`, then it chooses which -element to copy from `x` and `y`. - -For example: - -```python -# 'condition' tensor is [[True, False] -# [False, True]] -# 't' is [[1, 2], -# [3, 4]] -# 'e' is [[5, 6], -# [7, 8]] -select(condition, t, e) # => [[1, 6], [7, 4]] - - -# 'condition' tensor is [True, False] -# 't' is [[1, 2], -# [3, 4]] -# 'e' is [[5, 6], -# [7, 8]] -select(condition, t, e) ==> [[1, 2], - [7, 8]] - -``` - -### Operands: -1. `condition`: tensor of 1-bit integer values -1. `t`: tensor of tf.dtype values -1. `e`: tensor of tf.dtype values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. `output`: tensor of tf.dtype values - -## tf.Shape (TF::ShapeOp) -Returns the shape of a tensor. - -### Description: - -This operation returns a 1-D integer tensor representing the shape of `input`. - -For example: - -``` -# 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]] -shape(t) ==> [2, 2, 3] -``` - -### Operands: -1. `input`: tensor of tf.dtype values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `T` | `Attribute` | derived attribute attribute | -| `out_type` | `Attribute` | derived attribute attribute | - -### Results: -1. `output`: tensor of 32/64-bit integer values - -## tf.Sigmoid (TF::SigmoidOp) -Computes sigmoid of `x` element-wise. 
- -### Description: - -Specifically, `y = 1 / (1 + exp(-x))`. - -### Operands: -1. `x`: tensor of floating-point or 64/128-bit complex type values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. `y`: tensor of floating-point or 64/128-bit complex type values - -## tf.Sin (TF::SinOp) -Computes sin of x element-wise. - -### Description: - - -### Operands: -1. `x`: tensor of floating-point or 64/128-bit complex type values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. `y`: tensor of floating-point or 64/128-bit complex type values - -## tf.Slice (TF::SliceOp) -Return a slice from 'input'. - -### Description: - -The output tensor is a tensor with dimensions described by 'size' -whose values are extracted from 'input' starting at the offsets in -'begin'. - -*Requirements*: - 0 <= begin[i] <= begin[i] + size[i] <= Di for i in [0, n) - -### Operands: -1. `input`: tensor of tf.dtype values -1. `begin`: tensor of 32/64-bit integer values -1. `size`: tensor of 32/64-bit integer values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `T` | `Attribute` | derived attribute attribute | -| `Index` | `Attribute` | derived attribute attribute | - -### Results: -1. `output`: tensor of tf.dtype values - -## tf.Softmax (TF::SoftmaxOp) -Computes softmax activations. - -### Description: - -For each batch `i` and class `j` we have - - $$softmax[i, j] = exp(logits[i, j]) / sum_j(exp(logits[i, j]))$$ - -### Operands: -1. `logits`: tensor of floating-point values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. `softmax`: tensor of floating-point values - -## tf.SpaceToBatchND (TF::SpaceToBatchNDOp) -SpaceToBatch for N-D tensors of type T. - -### Description: - -This operation divides "spatial" dimensions `[1, ..., M]` of the input into a -grid of blocks of shape `block_shape`, and interleaves these blocks with the -"batch" dimension (0) such that in the output, the spatial dimensions -`[1, ..., M]` correspond to the position within the grid, and the batch -dimension combines both the position within a spatial block and the original -batch position. Prior to division into blocks, the spatial dimensions of the -input are optionally zero padded according to `paddings`. See below for a -precise description. - -### Operands: -1. `input`: tensor of tf.dtype values -1. `block_shape`: tensor of 32/64-bit integer values -1. `paddings`: tensor of 32/64-bit integer values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `T` | `Attribute` | derived attribute attribute | -| `Tpaddings` | `Attribute` | derived attribute attribute | -| `Tblock_shape` | `Attribute` | derived attribute attribute | - -### Results: -1. `output`: tensor of tf.dtype values - -## tf.Split (TF::SplitOp) -Splits a tensor into `num_split` tensors along one dimension. - -### Description: - - -### Operands: -1. `split_dim`: tensor of 32-bit integer values -1. 
`value`: tensor of tf.dtype values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `num_split` | `IntegerAttr` | 64-bit integer attribute whose minimal value is 1 attribute | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. `output`: tensor of tf.dtype values - -## tf.SplitV (TF::SplitVOp) -Splits a tensor into `num_split` tensors along one dimension. - -### Description: - - -### Operands: -1. `value`: tensor of tf.dtype values -1. `size_splits`: tensor of 32/64-bit integer values -1. `split_dim`: tensor of 32-bit integer values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `num_split` | `IntegerAttr` | 64-bit integer attribute whose minimal value is 1 attribute | -| `Tlen` | `Attribute` | derived attribute attribute | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. `output`: tensor of tf.dtype values - -## tf.Sqrt (TF::SqrtOp) -Computes square root of x element-wise. - -### Description: - -I.e., \\(y = \sqrt{x} = x^{1/2}\\). - -### Operands: -1. `x`: tensor of floating-point or 64/128-bit complex type values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. `y`: tensor of floating-point or 64/128-bit complex type values - -## tf.Square (TF::SquareOp) -Computes square of x element-wise. - -### Description: - -I.e., \\(y = x * x = x^2\\). - -### Operands: -1. `x`: tensor of bfloat16 type or 16-bit float or 32-bit float or 64-bit float or 32-bit integer or 64-bit integer or complex128 type or complex64 type values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. `y`: tensor of bfloat16 type or 16-bit float or 32-bit float or 64-bit float or 32-bit integer or 64-bit integer or complex128 type or complex64 type values - -## tf.SquaredDifference (TF::SquaredDifferenceOp) -Returns (x - y)(x - y) element-wise. - -### Description: - -*NOTE*: `SquaredDifference` supports broadcasting. More about broadcasting -[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) - -### Operands: -1. `x`: tensor of bfloat16 type or 16-bit float or 32-bit float or 64-bit float or 32-bit integer or 64-bit integer or complex128 type or complex64 type values -1. `y`: tensor of bfloat16 type or 16-bit float or 32-bit float or 64-bit float or 32-bit integer or 64-bit integer or complex128 type or complex64 type values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. `z`: tensor of bfloat16 type or 16-bit float or 32-bit float or 64-bit float or 32-bit integer or 64-bit integer or complex128 type or complex64 type values - -## tf.Squeeze (TF::SqueezeOp) -Removes dimensions of size 1 from the shape of a tensor. - -### Description: - -Given a tensor `input`, this operation returns a tensor of the same type with -all dimensions of size 1 removed. If you don't want to remove all size 1 -dimensions, you can remove specific size 1 dimensions by specifying -`axis`. 
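The shapes in the example below can be reproduced directly through the eager Python API (a minimal sketch, assuming TensorFlow 2.x; `tf.squeeze` is the Python wrapper for this op):

```python
import tensorflow as tf

t = tf.zeros([1, 2, 1, 3, 1, 1])

print(tf.squeeze(t).shape)          # (2, 3)        -- every size-1 dimension removed
print(tf.squeeze(t, [2, 4]).shape)  # (1, 2, 3, 1)  -- only dimensions 2 and 4 removed
```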
- -For example: - -``` -# 't' is a tensor of shape [1, 2, 1, 3, 1, 1] -shape(squeeze(t)) ==> [2, 3] -``` - -Or, to remove specific size 1 dimensions: - -``` -# 't' is a tensor of shape [1, 2, 1, 3, 1, 1] -shape(squeeze(t, [2, 4])) ==> [1, 2, 3, 1] -``` - -### Operands: -1. `input`: tensor of tf.dtype values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `squeeze_dims` | `ArrayAttr` | 64-bit integer array attribute attribute | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. `output`: tensor of tf.dtype values - -## tf.StridedSlice (TF::StridedSliceOp) -Return a strided slice from `input`. - -### Description: - -Note, most python users will want to use the Python `Tensor.__getitem__` -or `Variable.__getitem__` rather than this op directly. - -The goal of this op is to produce a new tensor with a subset of -the elements from the `n` dimensional `input` tensor. The subset is chosen using -a sequence of `m` sparse range specifications encoded into the arguments -of this function. Note, in some cases -`m` could be equal to `n`, but this need not be the case. Each -range specification entry can be one of the following: - -- An ellipsis (...). Ellipses are used to imply zero or more - dimensions of full-dimension selection and are produced using - `ellipsis_mask`. For example, `foo[...]` is the identity slice. - -- A new axis. This is used to insert a new shape=1 dimension and is - produced using `new_axis_mask`. For example, `foo[:, ...]` where - `foo` is shape `(3, 4)` produces a `(1, 3, 4)` tensor. - - -- A range `begin:end:stride`. This is used to specify how much to choose from - a given dimension. `stride` can be any integer but 0. `begin` is an integer - which represents the index of the first value to select while `end` represents - the index of the last value to select. The number of values selected in each - dimension is `end - begin` if `stride > 0` and `begin - end` if `stride < 0`. - `begin` and `end` can be negative where `-1` is the last element, `-2` is - the second to last. `begin_mask` controls whether to replace the explicitly - given `begin` with an implicit effective value of `0` if `stride > 0` and - `-1` if `stride < 0`. `end_mask` is analogous but produces the number - required to create the largest open interval. For example, given a shape - `(3,)` tensor `foo[:]`, the effective `begin` and `end` are `0` and `3`. Do - not assume this is equivalent to `foo[0:-1]` which has an effective `begin` - and `end` of `0` and `2`. Another example is `foo[-2::-1]` which reverses the - first dimension of a tensor while dropping the last two (in the original - order elements). For example `foo = [1,2,3,4]; foo[-2::-1]` is `[4,3]`. - -- A single index. This is used to keep only elements that have a given - index. For example (`foo[2, :]` on a shape `(5,6)` tensor produces a - shape `(6,)` tensor. This is encoded in `begin` and `end` and - `shrink_axis_mask`. - -Each conceptual range specification is encoded in the op's argument. This -encoding is best understand by considering a non-trivial example. 
In -particular, -`foo[1, 2:4, None, ..., :-3:-1, :]` will be encoded as - -``` -begin = [1, 2, x, x, 0, x] # x denotes don't care (usually 0) -end = [2, 4, x, x, -3, x] -strides = [1, 1, x, x, -1, 1] -begin_mask = 1<<4 | 1 << 5 = 48 -end_mask = 1<<5 = 32 -ellipsis_mask = 1<<3 = 8 -new_axis_mask = 1<<2 4 -shrink_axis_mask = 1<<0 -``` - -In this case if `foo.shape` is (5, 5, 5, 5, 5, 5) the final shape of -the slice becomes (2, 1, 5, 5, 2, 5). -Let us walk step by step through each argument specification. - -1. The first argument in the example slice is turned into `begin = 1` and -`end = begin + 1 = 2`. To disambiguate from the original spec `2:4` we -also set the appropriate bit in `shrink_axis_mask`. - -2. `2:4` is contributes 2, 4, 1 to begin, end, and stride. All masks have -zero bits contributed. - -3. None is a synonym for `tf.newaxis`. This means insert a dimension of size 1 -dimension in the final shape. Dummy values are contributed to begin, -end and stride, while the new_axis_mask bit is set. - -4. `...` grab the full ranges from as many dimensions as needed to -fully specify a slice for every dimension of the input shape. - -5. `:-3:-1` shows the use of negative indices. A negative index `i` associated -with a dimension that has shape `s` is converted to a positive index -`s + i`. So `-1` becomes `s-1` (i.e. the last element). This conversion -is done internally so begin, end and strides receive x, -3, and -1. -The appropriate begin_mask bit is set to indicate the start range is the -full range (ignoring the x). - -6. `:` indicates that the entire contents of the corresponding dimension -is selected. This is equivalent to `::` or `0::1`. begin, end, and strides -receive 0, 0, and 1, respectively. The appropriate bits in `begin_mask` and -`end_mask` are also set. - -*Requirements*: - `0 != strides[i] for i in [0, m)` - `ellipsis_mask must be a power of two (only one ellipsis)` - -### Operands: -1. `input`: tensor of tf.dtype values -1. `begin`: tensor of 32/64-bit integer values -1. `end`: tensor of 32/64-bit integer values -1. `strides`: tensor of 32/64-bit integer values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `begin_mask` | `IntegerAttr` | 64-bit integer attribute attribute | -| `end_mask` | `IntegerAttr` | 64-bit integer attribute attribute | -| `ellipsis_mask` | `IntegerAttr` | 64-bit integer attribute attribute | -| `new_axis_mask` | `IntegerAttr` | 64-bit integer attribute attribute | -| `shrink_axis_mask` | `IntegerAttr` | 64-bit integer attribute attribute | -| `T` | `Attribute` | derived attribute attribute | -| `Index` | `Attribute` | derived attribute attribute | - -### Results: -1. `output`: tensor of tf.dtype values - -## tf.Sub (TF::SubOp) -Returns x - y element-wise. - -### Description: - -*NOTE*: `Subtract` supports broadcasting. More about broadcasting -[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) - -### Operands: -1. `x`: tensor of number values -1. `y`: tensor of number values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. `z`: tensor of number values - -## tf.Sum (TF::SumOp) -Computes the sum of elements across dimensions of a tensor. - -### Description: - -Reduces `input` along the dimensions given in `axis`. Unless -`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in -`axis`. 
If `keep_dims` is true, the reduced dimensions are -retained with length 1. - -### Operands: -1. `input`: tensor of number values -1. `reduction_indices`: tensor of 32/64-bit integer values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `keep_dims` | `BoolAttr` | bool attribute attribute | -| `T` | `Attribute` | derived attribute attribute | -| `Tidx` | `Attribute` | derived attribute attribute | - -### Results: -1. `output`: tensor of number values - -## tf.TensorListFromTensor (TF::TensorListFromTensorOp) - -Creates a TensorList which, when stacked, has the value of `tensor`. - - -### Description: - -Each tensor in the result list corresponds to one row of the input tensor. - -tensor: The input tensor. -output_handle: The list. - -### Operands: -1. `tensor`: tensor of tf.dtype values -1. `element_shape`: tensor of 32/64-bit integer values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `shape_type` | `Attribute` | derived attribute attribute | -| `element_dtype` | `Attribute` | derived attribute attribute | - -### Results: -1. `output_handle`: tensor of TensorFlow variant type values - -## tf.TensorListGetItem (TF::TensorListGetItemOp) - - -### Description: - - -### Operands: -1. `input_handle`: tensor of TensorFlow variant type values -1. `index`: tensor of 32-bit integer values -1. `element_shape`: tensor of 32-bit integer values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `element_dtype` | `Attribute` | derived attribute attribute | - -### Results: -1. `item`: tensor of tf.dtype values - -## tf.TensorListReserve (TF::TensorListReserveOp) -List of the given size with empty elements. - -### Description: - -element_shape: the shape of the future elements of the list -num_elements: the number of elements to reserve -handle: the output list -element_dtype: the desired type of elements in the list. - -### Operands: -1. `element_shape`: tensor of 32/64-bit integer values -1. `num_elements`: tensor of 32-bit integer values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `element_dtype` | `TypeAttr` | any type attribute attribute | -| `shape_type` | `Attribute` | derived attribute attribute | - -### Results: -1. `handle`: tensor of TensorFlow variant type values - -## tf.TensorListSetItem (TF::TensorListSetItemOp) - - -### Description: - - -### Operands: -1. `input_handle`: tensor of TensorFlow variant type values -1. `index`: tensor of 32-bit integer values -1. `item`: tensor of tf.dtype values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `element_dtype` | `Attribute` | derived attribute attribute | - -### Results: -1. `output_handle`: tensor of TensorFlow variant type values - -## tf.TensorListStack (TF::TensorListStackOp) -Stacks all tensors in the list. - -### Description: - -Requires that all tensors have the same shape. - -input_handle: the input list -tensor: the gathered result -num_elements: optional. If not -1, the number of elements in the list. - -### Operands: -1. `input_handle`: tensor of TensorFlow variant type values -1. 
`element_shape`: tensor of 32-bit integer values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `num_elements` | `IntegerAttr` | 64-bit integer attribute attribute | -| `element_dtype` | `Attribute` | derived attribute attribute | - -### Results: -1. `tensor`: tensor of tf.dtype values - -## tf.TopKV2 (TF::TopKV2Op) - -Finds values and indices of the `k` largest elements for the last dimension. - - -### Description: - -If the input is a vector (rank-1), finds the `k` largest entries in the vector -and outputs their values and indices as vectors. Thus `values[j]` is the -`j`-th largest entry in `input`, and its index is `indices[j]`. - -For matrices (resp. higher rank input), computes the top `k` entries in each -row (resp. vector along the last dimension). Thus, - - values.shape = indices.shape = input.shape[:-1] + [k] - -If two elements are equal, the lower-index element appears first. - -### Operands: -1. `input`: tensor of 8/16/32/64-bit integer or floating-point values -1. `k`: tensor of 32-bit integer values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `sorted` | `BoolAttr` | bool attribute attribute | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. `values`: tensor of 8/16/32/64-bit integer or floating-point values -1. `indices`: tensor of 32-bit integer values - -## tf.Transpose (TF::TransposeOp) -Shuffle dimensions of x according to a permutation. - -### Description: - -The output `y` has the same rank as `x`. The shapes of `x` and `y` satisfy: - `y.shape[i] == x.shape[perm[i]] for i in [0, 1, ..., rank(x) - 1]` - -### Operands: -1. `x`: tensor of tf.dtype values -1. `perm`: tensor of 32/64-bit integer values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `T` | `Attribute` | derived attribute attribute | -| `Tperm` | `Attribute` | derived attribute attribute | - -### Results: -1. `y`: tensor of tf.dtype values - -## tf.TruncateDiv (TF::TruncateDivOp) -Returns x / y element-wise for integer types. - -### Description: - -Truncation designates that negative numbers will round fractional quantities -toward zero. I.e. -7 / 5 = -1. This matches C semantics but it is different -than Python semantics. See `FloorDiv` for a division function that matches -Python Semantics. - -*NOTE*: `TruncateDiv` supports broadcasting. More about broadcasting -[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) - -### Operands: -1. `x`: tensor of number values -1. `y`: tensor of number values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. `z`: tensor of number values - -## tf.Unpack (TF::UnpackOp) - -Unpacks a given dimension of a rank-`R` tensor into `num` rank-`(R-1)` tensors. - - -### Description: - -Unpacks `num` tensors from `value` by chipping it along the `axis` dimension. -For example, given a tensor of shape `(A, B, C, D)`; - -If `axis == 0` then the i'th tensor in `output` is the slice `value[i, :, :, :]` - and each tensor in `output` will have shape `(B, C, D)`. (Note that the - dimension unpacked along is gone, unlike `split`). - -If `axis == 1` then the i'th tensor in `output` is the slice `value[:, i, :, :]` - and each tensor in `output` will have shape `(A, C, D)`. -Etc. - -This is the opposite of `pack`. - -### Operands: -1. 
`value`: tensor of tf.dtype values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `num` | `IntegerAttr` | 64-bit integer attribute whose minimal value is 0 attribute | -| `axis` | `IntegerAttr` | 64-bit integer attribute attribute | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. `output`: tensor of tf.dtype values - -## tf.Xdivy (TF::XdivyOp) -Returns 0 if x == 0, and x / y otherwise, elementwise. - -### Description: - - -### Operands: -1. `x`: tensor of 16-bit float or 32-bit float or 64-bit float or complex128 type or complex64 type values -1. `y`: tensor of 16-bit float or 32-bit float or 64-bit float or complex128 type or complex64 type values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. `z`: tensor of 16-bit float or 32-bit float or 64-bit float or complex128 type or complex64 type values - -## tf.ZerosLike (TF::ZerosLikeOp) -Returns a tensor of zeros with the same shape and type as x. - -### Description: - - -### Operands: -1. `x`: tensor of tf.dtype values - -### Attributes: -| Attribute | MLIR Type | Description | -| :-------: | :-------: | ----------- | -| `T` | `Attribute` | derived attribute attribute | - -### Results: -1. `y`: tensor of tf.dtype values - diff --git a/tensorflow/compiler/mlir/tensorflow/ir/control_flow_ops.h b/tensorflow/compiler/mlir/tensorflow/ir/control_flow_ops.h index 2756b4c0885..4bf7029421e 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/control_flow_ops.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/control_flow_ops.h @@ -65,7 +65,7 @@ class TFControlType : public Type::TypeBase { // tensor needs its own _tf.Enter to be made available inside the while loop. // // More details can be found in Tensorflow Controlflow white paper: -// http://download.tensorflow.org/paper/white_paper_tf_control_flow_implementation_2017_11_1.pdf +// https://storage.googleapis.com/download.tensorflow.org/paper/white_paper_tf_control_flow_implementation_2017_11_1.pdf // // This is defined in Tensorflow as: // @@ -100,7 +100,7 @@ class EnterOp // of the operand type along with the index of the first match encountered. // // More details can be found in Tensorflow Controlflow white paper: -// http://download.tensorflow.org/paper/white_paper_tf_control_flow_implementation_2017_11_1.pdf +// https://storage.googleapis.com/download.tensorflow.org/paper/white_paper_tf_control_flow_implementation_2017_11_1.pdf // // This is defined in TensorFlow as: // @@ -130,7 +130,7 @@ class MergeOp : public Op::Impl, // outside of loop. Each returned tensor needs its own _tf.Exit. // // More details can be found in Tensorflow Controlflow white paper: -// http://download.tensorflow.org/paper/white_paper_tf_control_flow_implementation_2017_11_1.pdf +// https://storage.googleapis.com/download.tensorflow.org/paper/white_paper_tf_control_flow_implementation_2017_11_1.pdf // // This is defined in Tensorflow as: // diff --git a/tensorflow/compiler/mlir/tensorflow/ir/dialect_registration.cc b/tensorflow/compiler/mlir/tensorflow/ir/dialect_registration.cc index 333711f52f6..235980e05c3 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/dialect_registration.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/dialect_registration.cc @@ -14,6 +14,7 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/compiler/mlir/tensorflow/ir/control_flow_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" @@ -25,5 +26,7 @@ static DialectRegistration static DialectRegistration tf_ops; static DialectRegistration tf_excutor_dialect; +static DialectRegistration + tf_device_dialect; } // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc new file mode 100644 index 00000000000..cac27164ef7 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc @@ -0,0 +1,39 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" + +#include "mlir/IR/MLIRContext.h" // TF:local_config_mlir + +namespace mlir { +namespace tf_device { + +TensorFlowDeviceDialect::TensorFlowDeviceDialect(MLIRContext *context) + : Dialect(/*name=*/"tf_device", context) { + addOperations< +#define GET_OP_LIST +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc.inc" + >(); +} + +//===----------------------------------------------------------------------===// +// TableGen'd op method definitions +//===----------------------------------------------------------------------===// + +#define GET_OP_CLASSES +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc.inc" + +} // namespace tf_device +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_device.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_device.h new file mode 100644 index 00000000000..91370bc6501 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_device.h @@ -0,0 +1,47 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file defines the tf_device dialect: it contains operations that model +// TensorFlow's actions to launch computations on accelerator devices. 
+ +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_DEVICE_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_DEVICE_H_ + +#include "mlir/IR/Builders.h" // TF:local_config_mlir +#include "mlir/IR/Dialect.h" // TF:local_config_mlir + +namespace mlir { +namespace tf_device { + +// The TensorFlow Device dialect. +// +// This dialect contains operations to describe/launch computations on devices. +// These operations do not map 1-1 to TensorFlow ops and requires a lowering +// pass later to transform them into Compile/Run op pairs, like XlaCompile and +// XlaRun. +class TensorFlowDeviceDialect : public Dialect { + public: + // Constructing TensorFlowDevice dialect under an non-null MLIRContext. + explicit TensorFlowDeviceDialect(MLIRContext *context); +}; + +// Declares the operations for this dialect using the generated header. +#define GET_OP_CLASSES +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h.inc" + +} // namespace tf_device +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_DEVICE_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_device_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_device_ops.td new file mode 100644 index 00000000000..3220f0f98dc --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_device_ops.td @@ -0,0 +1,129 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This is the definition file for the TensorFlow Device Dialect. + +#ifdef TF_DEVICE_DIALECT +#else +#define TF_DEVICE_DIALECT + +#ifdef OP_BASE +#else +include "mlir/IR/OpBase.td" +#endif // OP_BASE + +//===----------------------------------------------------------------------===// +// TensorFlow Device Dialect definitions +//===----------------------------------------------------------------------===// + +def TfDevice_Dialect : Dialect { + let name = "tf_device"; + + let description = [{ + The TensorFlow Device dialect. + + This dialect contains operations to describe/launch computations on devices. + These operations do not map 1-1 to TensorFlow ops and requires a lowering + pass later to transform them into Compile/Run op pairs, like XlaCompile and + XlaRun. +}]; + + let cppNamespace = "tf_device"; +} + +//===----------------------------------------------------------------------===// +// TensorFlow Device Dialect Ops definitions +//===----------------------------------------------------------------------===// + +// Base class for the operation in this dialect. 
+class TfDevice_Op traits = []> : + Op { } + +def TfDevice_LaunchOp : TfDevice_Op<"launch", + [SingleBlockImplicitTerminator<"ReturnOp">]> +{ + let summary = [{The `tf_device.launch` op captures all needed live-in values + and launches containing operations on target device.}]; + + let arguments = (ins + StrAttr:$device + ); + + let results = (outs + Variadic:$results + ); + + let regions = (region SizedRegion<1>:$body); + + let extraClassDeclaration = [{ + Block &GetBody() { return getOperation()->getRegion(0).front(); } + StringRef getDevice() { return device(); } + }]; + + let builders = [ + OpBuilder<[{Builder *builder, OperationState *result, + StringAttr device, ArrayRef result_types}], + [{ + result->addAttribute("device", device); + result->addTypes(result_types); + result->addRegion(); + }] + > + ]; +} + +def TfDevice_ReturnOp : TfDevice_Op<"return", + [Terminator, HasParent<"LaunchOp">]> { + let summary = [{ + The `tf_device.return` operation terminates and returns values from + `tf_device.launch` operation; + }]; + + let arguments = (ins + Variadic:$results + ); + + let builders = [OpBuilder< + "Builder *builder, OperationState *result", + [{ + build(builder, result, {}); + }]> + ]; + + let verifier = ?; +} + +def TfDevice_LaunchFuncOp : TfDevice_Op<"launch_func", []> { + let summary = [{ + The `tf_device.launch_func` launches a function on target device. + }]; + + let arguments = (ins + StrAttr:$device, + SymbolRefAttr:$func, + Variadic:$operands); + + let results = (outs + Variadic:$results + ); + + let extraClassDeclaration = [{ + StringRef getFunc() { return func(); } + StringRef getDevice() { return device(); } + FunctionType getFuncType(); + }]; +} + +#endif // TF_DEVICE_DIALECT diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc index 29d73a71ad9..77d412f02c9 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc @@ -16,27 +16,52 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" #include +#include +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Sequence.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/FormatVariadic.h" +#include "mlir/Dialect/StandardOps/Ops.h" // TF:local_config_mlir #include "mlir/Dialect/Traits.h" // TF:local_config_mlir #include "mlir/IR/Attributes.h" // TF:local_config_mlir #include "mlir/IR/Builders.h" // TF:local_config_mlir #include "mlir/IR/Function.h" // TF:local_config_mlir #include "mlir/IR/MLIRContext.h" // TF:local_config_mlir #include "mlir/IR/Matchers.h" // TF:local_config_mlir +#include "mlir/IR/OpDefinition.h" // TF:local_config_mlir #include "mlir/IR/OpImplementation.h" // TF:local_config_mlir #include "mlir/IR/PatternMatch.h" // TF:local_config_mlir #include "mlir/IR/StandardTypes.h" // TF:local_config_mlir #include "mlir/IR/Types.h" // TF:local_config_mlir #include "mlir/IR/Value.h" // TF:local_config_mlir -#include "mlir/StandardOps/Ops.h" // TF:local_config_mlir +#include "mlir/Support/LogicalResult.h" // TF:local_config_mlir +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" namespace mlir { namespace tf_executor { +namespace { + +// If the given tensor has elements of type variant, then returns a new type +// after dropping subtypes info. Otherwise, returns the original type as is. 
+Type DropVariantSubTypes(Type ty) { + ShapedType shaped_ty = ty.cast(); + Type element_ty = shaped_ty.getElementType(); + if (!element_ty.isa()) return ty; + + Type variant_ty = TF::VariantType::get(ty.getContext()); + if (shaped_ty.hasRank()) { + return RankedTensorType::get(shaped_ty.getShape(), variant_ty); + } + + return UnrankedTensorType::get(variant_ty); +} + +} // namespace //===----------------------------------------------------------------------===// // TF Executor Dialect @@ -77,21 +102,6 @@ void TensorFlowExecutorDialect::printType(Type type, raw_ostream &os) const { namespace { -// Inserts `tf_executor.Terminator` at the end of the region's only block if it -// does not have a terminator already. If the region is empty, insert a new -// block first. -template -void EnsureExecutorTerminator(Region *region, Builder *builder, Location loc) { - if (region->empty()) region->push_back(new Block); - - Block &block = region->back(); - if (!block.empty() && block.back().isKnownTerminator()) return; - - OperationState terminator_state(loc, Terminator::getOperationName()); - Terminator::build(builder, &terminator_state, {}); - block.push_back(Operation::create(terminator_state)); -} - // Verifies that every control operands are at the end of the list. // Used by the constraint `ControlOperandsAfterAllData` in ODS. LogicalResult VerifyControlOperandsAfterAllData(Operation *op) { @@ -108,10 +118,16 @@ LogicalResult VerifyControlOperandsAfterAllData(Operation *op) { return success(); } +} // anonymous namespace + //===----------------------------------------------------------------------===// // tf_executor.graph //===----------------------------------------------------------------------===// +FetchOp GraphOp::GetFetch() { return llvm::cast(GetBody().back()); } + +namespace { + LogicalResult Verify(GraphOp graph) { auto *executorDialect = graph.getDialect(); @@ -123,6 +139,9 @@ LogicalResult Verify(GraphOp graph) { for (Operation &op : graph.GetBody()) { if (op.getDialect() != executorDialect) return op.emitOpError() << "unallowed inside a tf_executor.graph region"; + if (isa(op)) + return op.emitOpError() + << "unallowed directly inside another tf_executor.graph"; } Operation &fetch = graph.GetBody().back(); @@ -174,8 +193,7 @@ ParseResult ParseGraphOp(OpAsmParser *parser, OperationState *result) { // Ensure that the region is well formed: it contains at least a block with // a FetchOp terminator. - EnsureExecutorTerminator(&body, &parser->getBuilder(), - result->location); + GraphOp::ensureTerminator(body, parser->getBuilder(), result->location); // Get the results type from the terminator type inside the graph. 
Operation &fetch = body.back().back(); @@ -196,10 +214,14 @@ ParseResult ParseGraphOp(OpAsmParser *parser, OperationState *result) { return success(); } +} // anonymous namespace + //===----------------------------------------------------------------------===// // tf_executor.fetch //===----------------------------------------------------------------------===// +namespace { + void Print(FetchOp fetch, OpAsmPrinter *p) { *p << fetch.getOperationName(); if (fetch.getNumOperands() > 0) { @@ -224,10 +246,16 @@ ParseResult ParseFetchOp(OpAsmParser *parser, OperationState *result) { ); } +} // anonymous namespace + //===----------------------------------------------------------------------===// // tf_executor.island //===----------------------------------------------------------------------===// +YieldOp IslandOp::GetYield() { return llvm::cast(GetBody().back()); } + +namespace { + LogicalResult Verify(IslandOp island) { if (island.GetBody().empty()) return island.emitOpError() << "expects a non-empty body"; @@ -281,8 +309,7 @@ ParseResult ParseIslandOp(OpAsmParser *parser, OperationState *result) { if (parser->parseOperandList(op_infos, OpAsmParser::Delimiter::OptionalParen)) return failure(); if (!op_infos.empty()) { - SmallVector types; - types.push_back(control_type); + SmallVector types(op_infos.size(), control_type); parser->resolveOperands(op_infos, types, loc, result->operands); } @@ -301,8 +328,7 @@ ParseResult ParseIslandOp(OpAsmParser *parser, OperationState *result) { if (parser->parseRegion(body, llvm::None, llvm::None)) return failure(); - EnsureExecutorTerminator(&body, &parser->getBuilder(), - result->location); + IslandOp::ensureTerminator(body, parser->getBuilder(), result->location); // Get the results type for the island from the terminator operands. Operation &yield = body.back().back(); @@ -315,10 +341,14 @@ ParseResult ParseIslandOp(OpAsmParser *parser, OperationState *result) { return success(); } +} // anonymous namespace + //===----------------------------------------------------------------------===// // tf_executor.yield //===----------------------------------------------------------------------===// +namespace { + void Print(YieldOp yield, OpAsmPrinter *p) { *p << yield.getOperationName(); if (yield.getNumOperands() > 0) { @@ -341,10 +371,14 @@ ParseResult ParseYieldOp(OpAsmParser *parser, OperationState *result) { parser->parseOptionalAttributeDict(result->attributes)); } +} // anonymous namespace + //===----------------------------------------------------------------------===// // tf_executor.Switch //===----------------------------------------------------------------------===// +namespace { + ParseResult ParseSwitchOp(OpAsmParser *parser, OperationState *result) { SmallVector op_infos; SmallVector types; @@ -398,10 +432,14 @@ void Print(SwitchOp switch_op, OpAsmPrinter *p) { p->printOptionalAttrDict(switch_op.getAttrs()); } +} // anonymous namespace + //===----------------------------------------------------------------------===// // tf_executor.SwitchN //===----------------------------------------------------------------------===// +namespace { + LogicalResult Verify(SwitchNOp switchn) { IntegerAttr num_outs = switchn.getAttrOfType("num_outs"); if (!num_outs) @@ -467,8 +505,9 @@ ParseResult ParseSwitchNOp(OpAsmParser *parser, OperationState *result) { // `types` already contains the type for the data, add an i32 for the // output_index, and then the optional control inputs. 
- types.push_back(parser->getBuilder().getIntegerType(32)); - Type control_type = ControlType::get(parser->getBuilder().getContext()); + auto builder = parser->getBuilder(); + types.push_back(builder.getTensorType({}, builder.getIntegerType(32))); + Type control_type = ControlType::get(builder.getContext()); types.append(op_infos.size() - 2, control_type); if (parser->resolveOperands(op_infos, types, loc, result->operands)) @@ -481,10 +520,14 @@ ParseResult ParseSwitchNOp(OpAsmParser *parser, OperationState *result) { return parser->parseOptionalAttributeDict(result->attributes); } +} // anonymous namespace + //===----------------------------------------------------------------------===// // tf_executor.Merge //===----------------------------------------------------------------------===// +namespace { + LogicalResult Verify(MergeOp merge) { if (!merge.getNumOperands()) return merge.emitOpError() << "expects at least one operand"; @@ -498,8 +541,17 @@ LogicalResult Verify(MergeOp merge) { Type broadcasted_type = merge.output()->getType(); for (Type operand_type : merge.getOperandTypes()) { if (operand_type.isa()) break; + + // TODO(hinsu): Update ControlOperandsAfterAllData trait to verify this + // constraint. + if (!operand_type.isa()) + return merge.emitOpError("expects data operands to have tensor type"); + + // Variant types may have opaque subtypes information that need not match + // between the two types so drop them before computing the broadcasted type. Type new_broadcasted_type = - OpTrait::util::getBroadcastedType(broadcasted_type, operand_type); + OpTrait::util::getBroadcastedType(DropVariantSubTypes(broadcasted_type), + DropVariantSubTypes(operand_type)); if (!new_broadcasted_type) return merge.emitOpError() << "expects all operands to be broadcastable" @@ -508,10 +560,8 @@ LogicalResult Verify(MergeOp merge) { // This is because for example starting with a result of tensor<4xf32>, if // the first operand is unranked, the broadcasted type will be unranked. // Then any tensor operand will be broadcastable to this unranked type. - if ((broadcasted_type.isa() && - !broadcasted_type.cast().hasRank()) || - (new_broadcasted_type.isa() && - new_broadcasted_type.cast().hasRank())) + if (!broadcasted_type.cast().hasRank() || + new_broadcasted_type.cast().hasRank()) broadcasted_type = new_broadcasted_type; } @@ -519,11 +569,33 @@ LogicalResult Verify(MergeOp merge) { } void Print(MergeOp merge, OpAsmPrinter *p) { + // Use short form only when there are exactly two data operands and their + // type matches the output type. Otherwise, use the generic printer. + bool use_short_form = true; + int num_data_operands = 0; + + Type output_type = merge.output()->getType(); + for (Type operand_type : merge.getOperandTypes()) { + if (operand_type.isa()) break; + num_data_operands++; + + if (operand_type != output_type) { + use_short_form = false; + break; + } + } + *p << merge.getOperationName() << ' '; p->printOperands(merge.getOperands()); // Print the type signature of the operation. - *p << " : " << merge.getType(0); + *p << " : "; + if (!use_short_form || num_data_operands != 2) { + p->printFunctionalType(merge.getOperation()); + } else { + *p << output_type; + } + p->printOptionalAttrDict(merge.getAttrs()); } @@ -537,25 +609,38 @@ ParseResult ParseMergeOp(OpAsmParser *parser, OperationState *result) { return parser->emitError(parser->getNameLoc()) << " expects only a single data type"; - // Expect the type once, but use it for both operands. 
- types.push_back(types.front()); - // Extra operands are expected to be control inputs. - Type control_type = ControlType::get(parser->getBuilder().getContext()); - types.append(op_infos.size() - 2, control_type); + // Support parsing either a functional type (in which case all the types are + // fully qualified) or a short form with a single type (in which case the data + // inputs and the output are all using this type). + if (FunctionType type = types.front().dyn_cast()) { + result->types.assign(type.getResults().begin(), type.getResults().end()); + types.assign(type.getInputs().begin(), type.getInputs().end()); + } else { + // In case of the short form, use the parsed type for both the operands and + // the remaining operands are expected to be control inputs. + types.push_back(types.front()); + Type control_type = ControlType::get(parser->getBuilder().getContext()); + types.append(op_infos.size() - 2, control_type); + + RankedTensorType i32_tensor = + RankedTensorType::get({}, parser->getBuilder().getIntegerType(32)); + result->types = {types.front(), i32_tensor, control_type}; + } if (parser->resolveOperands(op_infos, types, loc, result->operands)) return failure(); - RankedTensorType i32_tensor = - RankedTensorType::get({}, parser->getBuilder().getIntegerType(32)); - result->types = {types.front(), i32_tensor, control_type}; return parser->parseOptionalAttributeDict(result->attributes); } +} // anonymous namespace + //===----------------------------------------------------------------------===// // tf_executor.Enter //===----------------------------------------------------------------------===// +namespace { + // Default number for the parallel_iterations attributes on Enter nodes. constexpr int kDefaultParallelIterations = 10; @@ -638,10 +723,14 @@ ParseResult ParseEnterOp(OpAsmParser *parser, OperationState *result) { return parser->parseOptionalAttributeDict(result->attributes); } +} // anonymous namespace + //===----------------------------------------------------------------------===// // tf_executor.NextIteration.Source //===----------------------------------------------------------------------===// +namespace { + LogicalResult Verify(NextIterationSourceOp source) { Value *token = source.token(); if (!token->hasOneUse()) @@ -668,10 +757,14 @@ ParseResult ParseNextIterationSourceOp(OpAsmParser *parser, return parser->parseOptionalAttributeDict(result->attributes); } +} // anonymous namespace + //===----------------------------------------------------------------------===// // tf_executor.NextIteration.Sink //===----------------------------------------------------------------------===// +namespace { + LogicalResult Verify(NextIterationSinkOp sink) { Value *token = sink.token(); Operation *definingOp = token->getDefiningOp(); @@ -720,10 +813,14 @@ ParseResult ParseNextIterationSinkOp(OpAsmParser *parser, return parser->parseOptionalAttributeDict(result->attributes); } +} // anonymous namespace + //===----------------------------------------------------------------------===// // tf_executor.Exit //===----------------------------------------------------------------------===// +namespace { + void Print(ExitOp exit, OpAsmPrinter *p) { *p << exit.getOperationName() << ' '; p->printOperands(exit.getOperands()); @@ -748,10 +845,14 @@ ParseResult ParseExitOp(OpAsmParser *parser, OperationState *result) { return parser->parseOptionalAttributeDict(result->attributes); } +} // anonymous namespace + //===----------------------------------------------------------------------===// // 
tf_executor.ControlTrigger //===----------------------------------------------------------------------===// +namespace { + void Print(ControlTriggerOp trigger, OpAsmPrinter *p) { *p << trigger.getOperationName() << ' '; p->printOperands(trigger.getOperands()); @@ -774,10 +875,14 @@ ParseResult ParseControlTriggerOp(OpAsmParser *parser, OperationState *result) { return parser->parseOptionalAttributeDict(result->attributes); } +} // anonymous namespace + //===----------------------------------------------------------------------===// // tf_executor.LoopCond //===----------------------------------------------------------------------===// +namespace { + void Print(LoopCondOp loop_cond, OpAsmPrinter *p) { *p << loop_cond.getOperationName() << ' '; p->printOperands(loop_cond.getOperands()); @@ -832,6 +937,179 @@ ParseResult ParseLoopCondOp(OpAsmParser *parser, OperationState *result) { } // namespace +//===----------------------------------------------------------------------===// +// Canonicalization patterns +//===----------------------------------------------------------------------===// + +// TODO(lyandy): Add canonicalization for dedupping control inputs. + +//===----------------------------------------------------------------------===// +// tf_executor.graph +//===----------------------------------------------------------------------===// + +namespace { +// Finds in a block if the op of type `InnerOpT` is the first operation and +// optionally followed by a terminator. +template +bool HasSingleOpInBlock(Block *block) { + if (block->empty()) return false; + if (!llvm::isa(block->front())) return false; + // Either InnerOpT is the only instruction in the block, or there is a + // possible terminator. + return std::next(block->begin()) == block->end() || + std::next(block->begin(), 2) == block->end(); +} + +// This pattern matches GraphOps with only one FetchOp (empty) and remaps the +// results of the GraphOp to the operands of the FetchOp. +struct DropEmptyGraph : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + PatternMatchResult matchAndRewrite(GraphOp op, + PatternRewriter &rewriter) const override { + Block &block = op.GetBody(); + // Check if graph only has one fetch. + if (&block.front() != &block.back()) return matchFailure(); + + // Map graph results to fetch operands. + llvm::SmallVector new_rets(op.GetFetch().fetches()); + rewriter.replaceOp(op, new_rets); + + return matchSuccess(); + } +}; + +// This pattern matches GraphOps with only one island, pulls out all inner ops +// of the island to the block containing the GraphOp, and then removes the +// GraphOp. +struct HoistInnerOpsSingleIslandGraph : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + PatternMatchResult matchAndRewrite(GraphOp op, + PatternRewriter &rewriter) const override { + Block &block = op.GetBody(); + // Check if graph only has one island. + if (!HasSingleOpInBlock(&block)) return matchFailure(); + + FetchOp fetch_op = op.GetFetch(); + auto island_op = llvm::cast(block.front()); + YieldOp yield_op = island_op.GetYield(); + + // Map graph results to inner ops results of single island. + llvm::SmallVector new_rets; + for (Value *operand : fetch_op.fetches()) { + // Control results should not be propagated out. + if (operand->getType().isa()) break; + + if (operand->getDefiningOp() != island_op) { + // Operand is not from island, simply propagate it out. + new_rets.push_back(operand); + } else { + // Lookup yield operand in island for inner op result. 
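+ // The result number of the island result gives the index of the yield
+ // operand that produced the value inside the island.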
+ auto result = llvm::cast(operand); + new_rets.push_back(yield_op.getOperand(result->getResultNumber())); + } + } + + // Move inner ops from island to block containing graph. + auto &island_body = island_op.GetBody().getOperations(); + Operation *operation = op.getOperation(); + operation->getBlock()->getOperations().splice( + operation->getIterator(), island_body, island_body.begin(), + std::prev(island_body.end())); + rewriter.replaceOp(op, new_rets); + + return matchSuccess(); + } +}; +} // anonymous namespace + +void GraphOp::getCanonicalizationPatterns(OwningRewritePatternList &results, + MLIRContext *context) { + results.insert(context); +} + +//===----------------------------------------------------------------------===// +// tf_executor.island +//===----------------------------------------------------------------------===// + +namespace { +// This pattern matches and removes IslandOps with no inner ops, no control +// operands and no data results. Control result users will have their relevant +// operands removed. +struct DropEmptyIslandNoOperandNoDataResult + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + PatternMatchResult matchAndRewrite(IslandOp op, + PatternRewriter &rewriter) const override { + if (op.getNumOperands() != 0 || op.getNumResults() != 1 || + !HasSingleOpInBlock(&op.GetBody())) + return matchFailure(); + + for (auto &use : llvm::make_early_inc_range(op.control()->getUses())) + use.getOwner()->eraseOperand(use.getOperandNumber()); + + rewriter.replaceOp(op, {nullptr}); + + return matchSuccess(); + } +}; + +// This pattern matches and removes IslandOps with no inner ops, no control +// operands, one data result and no control result user. The single data result +// (from YieldOps first operand) is forwarded to the IslandOp single data result +// users. +struct DropEmptyIslandNoOperandOneDataResult + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + PatternMatchResult matchAndRewrite(IslandOp op, + PatternRewriter &rewriter) const override { + if (op.getNumOperands() != 0 || op.getNumResults() != 2 || + !op.control()->use_empty() || + !HasSingleOpInBlock(&op.GetBody())) + return matchFailure(); + + rewriter.replaceOp(op, {op.GetYield().getOperand(0), nullptr}); + + return matchSuccess(); + } +}; + +// TODO(lyandy): Add canonicalization for empty IslandOps with more than one +// control operand and no data results. + +} // anonymous namespace + +void IslandOp::getCanonicalizationPatterns(OwningRewritePatternList &results, + MLIRContext *context) { + results.insert(context); +} + +//===----------------------------------------------------------------------===// +// Folders +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// tf_executor.island +//===----------------------------------------------------------------------===// + +LogicalResult IslandOp::fold(llvm::ArrayRef operands, + llvm::SmallVectorImpl &results) { + // This folds IslandOps with no inner ops, one control operand and no data + // results. The single control operand is forwarded to the IslandOp control + // result users. 
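+ // In effect such an island only forwards control, so its control result can
+ // be replaced directly by the incoming control operand.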
+ if (getNumOperands() != 1 || getNumResults() != 1 || + !HasSingleOpInBlock(&GetBody())) + return failure(); + + results.emplace_back(getOperand(0)); + + return success(); +} + //===----------------------------------------------------------------------===// // TableGen'd op method definitions //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor_ops.td index 125ef1bfda6..50412544460 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor_ops.td @@ -55,12 +55,14 @@ def TfeControlType : Type()">, "control">; // Token type. def TfeTokenType : Type()">, "token">; +// TODO(hinsu): Define and use TensorType instead of AnyType for data operands +// and results. For example, MergeOp output type. + //===----------------------------------------------------------------------===// // TensorFlow Executor Type Constraint //===----------------------------------------------------------------------===// -// Predicate to verify that the opId'th operand can be broadcasted to the type -// of the resId'th result. +// Predicate to verify all control inputs appear after any non-control inputs. def ControlOperandsAfterAllData : PredOpTrait<"all control inputs must appear after any non-control input", CPred<"succeeded(VerifyControlOperandsAfterAllData(&$_op))">>; @@ -79,7 +81,8 @@ class TfExecutor_Op traits = []> : let parser = [{ return Parse$cppClass(parser, result); }]; } -def TfExecutor_GraphOp : TfExecutor_Op<"graph", []> { +def TfExecutor_GraphOp : TfExecutor_Op<"graph", + [SingleBlockImplicitTerminator<"FetchOp">]> { let summary = [{The `tf_executor.graph` operation contains a region with a single block that lists the operations in a TensorFlow graph.}]; @@ -120,10 +123,14 @@ def TfExecutor_GraphOp : TfExecutor_Op<"graph", []> { let extraClassDeclaration = [{ Block &GetBody() { return getOperation()->getRegion(0).front(); } + FetchOp GetFetch(); }]; + + let hasCanonicalizer = 1; } -def TfExecutor_FetchOp : TfExecutor_Op<"fetch", [Terminator, ControlOperandsAfterAllData]> { +def TfExecutor_FetchOp : TfExecutor_Op<"fetch", + [Terminator, ControlOperandsAfterAllData, HasParent<"GraphOp">]> { let summary = [{ The `tf_executor.fetch` operation terminates the graph and returns values"; }]; @@ -137,10 +144,18 @@ def TfExecutor_FetchOp : TfExecutor_Op<"fetch", [Terminator, ControlOperandsAfte Variadic:$fetches ); + let builders = [OpBuilder< + "Builder *builder, OperationState *result", + [{ + build(builder, result, {}); + }]> + ]; + let verifier = ?; } -def TfExecutor_IslandOp : TfExecutor_Op<"island", []> { +def TfExecutor_IslandOp : TfExecutor_Op<"island", + [HasParent<"GraphOp">, SingleBlockImplicitTerminator<"YieldOp">]> { let summary = [{ The `tf_executor.island` operation is a wrapper for operations in other dialects to be nested in a `tf_executor.graph`. 
@@ -190,11 +205,16 @@ def TfExecutor_IslandOp : TfExecutor_Op<"island", []> { let extraClassDeclaration = [{ Block &GetBody() { return getOperation()->getRegion(0).front(); } + YieldOp GetYield(); }]; + + let hasCanonicalizer = 1; + + let hasFolder = 1; } -def TfExecutor_YieldOp : - TfExecutor_Op<"yield", [Terminator, ControlOperandsAfterAllData]> { +def TfExecutor_YieldOp : TfExecutor_Op<"yield", + [Terminator, ControlOperandsAfterAllData, HasParent<"IslandOp">]> { let summary = [{ The `tf_executor.yield` operation terminates and returns values for the `tf_executor.island` operation. @@ -204,11 +224,18 @@ def TfExecutor_YieldOp : Variadic:$fetches ); + let builders = [OpBuilder< + "Builder *builder, OperationState *result", + [{ + build(builder, result, {}); + }]> + ]; + let verifier = ?; } def TfExecutor_SwitchOp : TfExecutor_Op<"Switch", - [NoSideEffect, ControlOperandsAfterAllData, + [NoSideEffect, ControlOperandsAfterAllData, HasParent<"GraphOp">, PredOpTrait<"data operand must be broadcastable to true result", TCOpIsBroadcastableToRes<0, 0>>, PredOpTrait<"data operand must be broadcastable to false result", @@ -221,7 +248,7 @@ def TfExecutor_SwitchOp : TfExecutor_Op<"Switch", let description = [{ More details can be found in Tensorflow Control Flow white paper: - http://download.tensorflow.org/paper/white_paper_tf_control_flow_implementation_2017_11_1.pdf + https://storage.googleapis.com/download.tensorflow.org/paper/white_paper_tf_control_flow_implementation_2017_11_1.pdf This is defined in TensorFlow as: @@ -253,8 +280,8 @@ def TfExecutor_SwitchOp : TfExecutor_Op<"Switch", let verifier = ?; } -def TfExecutor_SwitchNOp : - TfExecutor_Op<"SwitchN", [NoSideEffect, ControlOperandsAfterAllData]> { +def TfExecutor_SwitchNOp : TfExecutor_Op<"SwitchN", + [NoSideEffect, ControlOperandsAfterAllData, HasParent<"GraphOp">]> { let summary = [{ The "tf_executor.SwitchN" operation takes two inputs, `data` and `index` and an integer attribute `num_outs` indicating the number of outputs. The `data` @@ -282,7 +309,7 @@ def TfExecutor_SwitchNOp : let arguments = (ins AnyType:$data, - I32:$index, + TensorOf<[I32]>:$index, // Optional extra control inputs. Variadic:$controlInputs, I64Attr:$num_outs @@ -294,7 +321,8 @@ def TfExecutor_SwitchNOp : ); } -def TfExecutor_MergeOp : TfExecutor_Op<"Merge", [NoSideEffect, ControlOperandsAfterAllData]> { +def TfExecutor_MergeOp : TfExecutor_Op<"Merge", + [NoSideEffect, ControlOperandsAfterAllData, HasParent<"GraphOp">]> { let summary = [{ The "tf_executor.Merge" operation takes a list of input operands and returns a value of the operand type along with the index of the first match encountered. 
@@ -302,7 +330,7 @@ def TfExecutor_MergeOp : TfExecutor_Op<"Merge", [NoSideEffect, ControlOperandsAf let description = [{ More details can be found in Tensorflow Control Flow white paper: - http://download.tensorflow.org/paper/white_paper_tf_control_flow_implementation_2017_11_1.pdf + https://storage.googleapis.com/download.tensorflow.org/paper/white_paper_tf_control_flow_implementation_2017_11_1.pdf This is defined in TensorFlow as: @@ -322,14 +350,14 @@ def TfExecutor_MergeOp : TfExecutor_Op<"Merge", [NoSideEffect, ControlOperandsAf ); let results = (outs - AnyType:$output, + AnyTensor:$output, TensorOf<[I32]>:$valueIndex, TfeControlType:$control ); } def TfExecutor_EnterOp : TfExecutor_Op<"Enter", - [NoSideEffect, ControlOperandsAfterAllData, + [NoSideEffect, ControlOperandsAfterAllData, HasParent<"GraphOp">, PredOpTrait<"data operand must be broadcastable to result", TCOpIsBroadcastableToRes<0, 0>>]>{ let summary = [{ @@ -339,7 +367,7 @@ def TfExecutor_EnterOp : TfExecutor_Op<"Enter", let description = [{ More details can be found in Tensorflow Control Flow white paper: - http://download.tensorflow.org/paper/white_paper_tf_control_flow_implementation_2017_11_1.pdf + https://storage.googleapis.com/download.tensorflow.org/paper/white_paper_tf_control_flow_implementation_2017_11_1.pdf Each tensor needs its own tf_executor.Enter to be made available inside a while loop. @@ -378,7 +406,8 @@ def TfExecutor_EnterOp : TfExecutor_Op<"Enter", let verifier = ?; } -def TfExecutor_NextIterationSourceOp : TfExecutor_Op<"NextIteration.Source", [NoSideEffect]> { +def TfExecutor_NextIterationSourceOp : TfExecutor_Op<"NextIteration.Source", + [NoSideEffect, HasParent<"GraphOp">]> { let summary = [{ The "tf_executor.NextIteration.Source" is paired with a "tf_executor.NextIteration.sink" to represent NextIteration op in @@ -390,7 +419,7 @@ def TfExecutor_NextIterationSourceOp : TfExecutor_Op<"NextIteration.Source", [No of a while loop. Each loop variable needs its own NextIteration op. More details can be found in Tensorflow Control Flow white paper: - http://download.tensorflow.org/paper/white_paper_tf_control_flow_implementation_2017_11_1.pdf + https://storage.googleapis.com/download.tensorflow.org/paper/white_paper_tf_control_flow_implementation_2017_11_1.pdf In the TF executor dialect, the NextIteration op is broken into tf_executor.NextIteration.sink and tf_executor.NextIteration.source because @@ -415,10 +444,6 @@ def TfExecutor_NextIterationSourceOp : TfExecutor_Op<"NextIteration.Source", [No Note: Additional result corresponds to the control output. }]; - let arguments = (ins - Variadic:$controlInputs - ); - let results = (outs AnyType:$output, // The NextIteration.Source operation returns an extra token consumed by the sink. 
@@ -428,19 +453,26 @@ def TfExecutor_NextIterationSourceOp : TfExecutor_Op<"NextIteration.Source", [No let builders = [OpBuilder< "Builder *builder, OperationState *result, Type result_type, " - "ArrayRef control_inputs = {}, ArrayRef attributes = {}", + "ArrayRef attributes = {}", [{ Type token_type = TokenType::get(builder->getContext()); Type control_type = ControlType::get(builder->getContext()); result->types = { result_type, token_type, control_type }; - result->operands.append(control_inputs.begin(), control_inputs.end()); result->attributes.append(attributes.begin(), attributes.end()); }]> ]; + + let extraClassDeclaration = [{ + NextIterationSinkOp GetSink() { + return cast(*token()->user_begin()); + } + }]; + } -def TfExecutor_NextIterationSinkOp : TfExecutor_Op<"NextIteration.Sink"> { +def TfExecutor_NextIterationSinkOp : TfExecutor_Op<"NextIteration.Sink", + [HasParent<"GraphOp">]> { let summary = [{ The "tf_executor.NextIteration.Sink" is paired with a "tf_executor.NextIteration.source" to represent NextIteration op in @@ -452,7 +484,7 @@ def TfExecutor_NextIterationSinkOp : TfExecutor_Op<"NextIteration.Sink"> { of a while loop. Each loop variable needs its own NextIteration op. More details can be found in Tensorflow Control Flow white paper: - http://download.tensorflow.org/paper/white_paper_tf_control_flow_implementation_2017_11_1.pdf + https://storage.googleapis.com/download.tensorflow.org/paper/white_paper_tf_control_flow_implementation_2017_11_1.pdf In the TF executor dialect, the NextIteration op is broken into tf_executor.NextIteration.sink and tf_executor.NextIteration.source because @@ -500,7 +532,7 @@ def TfExecutor_NextIterationSinkOp : TfExecutor_Op<"NextIteration.Sink"> { } def TfExecutor_ExitOp : TfExecutor_Op<"Exit", - [NoSideEffect, + [NoSideEffect, HasParent<"GraphOp">, PredOpTrait<"data operand must be broadcastable to result", TCOpIsBroadcastableToRes<0, 0>>]>{ @@ -512,7 +544,7 @@ def TfExecutor_ExitOp : TfExecutor_Op<"Exit", let description = [{ More details can be found in Tensorflow Control Flow white paper: - http://download.tensorflow.org/paper/white_paper_tf_control_flow_implementation_2017_11_1.pdf + https://storage.googleapis.com/download.tensorflow.org/paper/white_paper_tf_control_flow_implementation_2017_11_1.pdf This is defined in Tensorflow as: @@ -540,7 +572,8 @@ def TfExecutor_ExitOp : TfExecutor_Op<"Exit", let verifier = ?; } -def TfExecutor_ControlTriggerOp : TfExecutor_Op<"ControlTrigger", [NoSideEffect]> { +def TfExecutor_ControlTriggerOp : TfExecutor_Op<"ControlTrigger", + [NoSideEffect, HasParent<"GraphOp">]> { let summary = [{ The `tf_executor.ControlTrigger` operation is similar to a no-op except that it always produces a valid output even when inputs are dead. @@ -576,7 +609,8 @@ def TfExecutor_ControlTriggerOp : TfExecutor_Op<"ControlTrigger", [NoSideEffect] ]; } -def TfExecutor_LoopCondOp : TfExecutor_Op<"LoopCond", [NoSideEffect]> { +def TfExecutor_LoopCondOp : TfExecutor_Op<"LoopCond", + [NoSideEffect, HasParent<"GraphOp">]> { let summary = [{ The "tf_executor.LoopCond" operation forwards a boolean value as loop condition of Tensorflow while loops. 
@@ -584,7 +618,7 @@ def TfExecutor_LoopCondOp : TfExecutor_Op<"LoopCond", [NoSideEffect]> { let description = [{ More details can be found in Tensorflow Control Flow white paper: - http://download.tensorflow.org/paper/white_paper_tf_control_flow_implementation_2017_11_1.pdf + https://storage.googleapis.com/download.tensorflow.org/paper/white_paper_tf_control_flow_implementation_2017_11_1.pdf This is defined in Tensorflow as: diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td index 9c256034c2b..f7311f61985 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td @@ -123,6 +123,65 @@ def TF_AddV2Op : TF_Op<"AddV2", [Broadcastable, Commutative, NoSideEffect]>, let hasCanonicalizer = 1; } +def TF_AnyOp : TF_Op<"Any", [NoSideEffect]> { + let summary = [{ +Computes the "logical or" of elements across dimensions of a tensor. + }]; + + let description = [{ +Reduces `input` along the dimensions given in `axis`. Unless +`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in +`axis`. If `keep_dims` is true, the reduced dimensions are +retained with length 1. + }]; + + let arguments = (ins + I1Tensor:$input, + TF_I32OrI64Tensor:$reduction_indices, + + DefaultValuedAttr:$keep_dims + ); + + let results = (outs + I1Tensor:$output + ); + + TF_DerivedOperandTypeAttr Tidx = TF_DerivedOperandTypeAttr<1>; +} + +def TF_ArgMaxOp : TF_Op<"ArgMax", [NoSideEffect]> { + let summary = [{ +Returns the index with the largest value across dimensions of a tensor. + }]; + + let description = [{ +Note that in case of ties the identity of the return value is not guaranteed. + +Usage: + ```python + import tensorflow as tf + a = [1, 10, 26.9, 2.8, 166.32, 62.3] + b = tf.math.argmax(input = a) + c = tf.keras.backend.eval(b) + # c = 4 + # here a[4] = 166.32 which is the largest element of a across axis 0 + ``` + }]; + + let arguments = (ins + TF_NumberTensor:$input, + TF_I32OrI64Tensor:$dimension + ); + + let results = (outs + TF_I32OrI64Tensor:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + TF_DerivedOperandTypeAttr Tidx = TF_DerivedOperandTypeAttr<1>; + TF_DerivedResultTypeAttr output_type = TF_DerivedResultTypeAttr<0>; +} + def TF_ArgMinOp : TF_Op<"ArgMin", [NoSideEffect]> { let summary = [{ Returns the index with the smallest value across dimensions of a tensor. @@ -136,7 +195,7 @@ Usage: import tensorflow as tf a = [1, 10, 26.9, 2.8, 166.32, 62.3] b = tf.math.argmin(input = a) - c = tf.keras.backend.eval(b) + c = tf.keras.backend.eval(b) # c = 0 # here a[0] = 1 which is the smallest element of a across axis 0 ``` @@ -156,6 +215,28 @@ Usage: TF_DerivedResultTypeAttr output_type = TF_DerivedResultTypeAttr<0>; } +def TF_AssertOp : TF_Op<"Assert", []> { + let summary = "Asserts that the given condition is true."; + + let description = [{ +If `condition` evaluates to false, print the list of tensors in `data`. +`summarize` determines how many entries of the tensors to print. 
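+
+A minimal usage sketch, assuming the standard Python wrapper
+`tf.debugging.Assert` (the wrapper itself is not part of this op definition):
+
+  ```python
+  import tensorflow as tf
+  # Illustrative sketch: raises an error and prints at most `summarize`
+  # entries of `x`, because not all elements of x are positive.
+  x = tf.constant([-1.0, 2.0])
+  tf.debugging.Assert(tf.reduce_all(x > 0), [x], summarize=2)
+  ```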
+ }]; + + let arguments = (ins + I1Tensor:$condition, + Variadic:$data, + + DefaultValuedAttr:$summarize + ); + + let results = (outs); + + TF_DerivedOperandTypeListAttr T = TF_DerivedOperandTypeListAttr<1>; + + let hasCanonicalizer = 1; +} + def TF_AvgPoolOp : TF_Op<"AvgPool", [NoSideEffect]> { let summary = "Performs average pooling on the input."; @@ -528,6 +609,115 @@ Given an input tensor, this function computes cosine of every TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_DepthToSpaceOp : TF_Op<"DepthToSpace", [NoSideEffect]> { + let summary = "DepthToSpace for tensors of type T."; + + let description = [{ +Rearranges data from depth into blocks of spatial data. +This is the reverse transformation of SpaceToDepth. More specifically, +this op outputs a copy of the input tensor where values from the `depth` +dimension are moved in spatial blocks to the `height` and `width` dimensions. +The attr `block_size` indicates the input block size and how the data is moved. + + * Chunks of data of size `block_size * block_size` from depth are rearranged + into non-overlapping blocks of size `block_size x block_size` + * The width the output tensor is `input_depth * block_size`, whereas the + height is `input_height * block_size`. + * The Y, X coordinates within each block of the output image are determined + by the high order component of the input channel index. + * The depth of the input tensor must be divisible by + `block_size * block_size`. + +The `data_format` attr specifies the layout of the input and output tensors +with the following options: + "NHWC": `[ batch, height, width, channels ]` + "NCHW": `[ batch, channels, height, width ]` + "NCHW_VECT_C": + `qint8 [ batch, channels / 4, height, width, 4 ]` + +It is useful to consider the operation as transforming a 6-D Tensor. +e.g. for data_format = NHWC, + Each element in the input tensor can be specified via 6 coordinates, + ordered by decreasing memory layout significance as: + n,iY,iX,bY,bX,oC (where n=batch index, iX, iY means X or Y coordinates + within the input image, bX, bY means coordinates + within the output block, oC means output channels). + The output would be the input transposed to the following layout: + n,iY,bY,iX,bX,oC + +This operation is useful for resizing the activations between convolutions +(but keeping all data), e.g. instead of pooling. It is also useful for training +purely convolutional models. + +For example, given an input of shape `[1, 1, 1, 4]`, data_format = "NHWC" and +block_size = 2: + +``` +x = [[[[1, 2, 3, 4]]]] + +``` + +This operation will output a tensor of shape `[1, 2, 2, 1]`: + +``` + [[[[1], [2]], + [[3], [4]]]] +``` + +Here, the input has a batch of 1 and each batch element has shape `[1, 1, 4]`, +the corresponding output will have 2x2 elements and will have a depth of +1 channel (1 = `4 / (block_size * block_size)`). +The output element shape is `[2, 2, 1]`. + +For an input tensor with larger depth, here of shape `[1, 1, 1, 12]`, e.g. 
+ +``` +x = [[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]] +``` + +This operation, for block size of 2, will return the following tensor of shape +`[1, 2, 2, 3]` + +``` + [[[[1, 2, 3], [4, 5, 6]], + [[7, 8, 9], [10, 11, 12]]]] + +``` + +Similarly, for the following input of shape `[1 2 2 4]`, and a block size of 2: + +``` +x = [[[[1, 2, 3, 4], + [5, 6, 7, 8]], + [[9, 10, 11, 12], + [13, 14, 15, 16]]]] +``` + +the operator will return the following tensor of shape `[1 4 4 1]`: + +``` +x = [[[ [1], [2], [5], [6]], + [ [3], [4], [7], [8]], + [ [9], [10], [13], [14]], + [ [11], [12], [15], [16]]]] + +``` + }]; + + let arguments = (ins + TF_Tensor:$input, + + Confined]>:$block_size, + DefaultValuedAttr, "NHWC">:$data_format + ); + + let results = (outs + TF_Tensor:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF_DepthwiseConv2dNativeOp : TF_Op<"DepthwiseConv2dNative", [NoSideEffect]> { let summary = [{ Computes a 2-D depthwise convolution given 4-D `input` and `filter` tensors. @@ -646,6 +836,51 @@ tf.math.equal(x, y) ==> array([True, True]) TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_ExpOp : TF_Op<"Exp", [NoSideEffect, SameOperandsAndResultType]> { + let summary = [{ +Computes exponential of x element-wise. \\(y = e^x\\). + }]; + + let description = [{ +This function computes the exponential of every element in the input tensor. + i.e. `exp(x)` or `e^(x)`, where `x` is the input tensor. + `e` denotes Euler's number and is approximately equal to 2.718281. + Output is positive for any real input. + + ```python + x = tf.constant(2.0) + tf.math.exp(x) ==> 7.389056 + + x = tf.constant([2.0, 8.0]) + tf.math.exp(x) ==> array([7.389056, 2980.958], dtype=float32) + ``` + + For complex numbers, the exponential value is calculated as follows: + + ``` + e^(x+iy) = e^x * e^iy = e^x * (cos y + i sin y) + ``` + + Let's consider complex number 1+1j as an example. + e^1 * (cos 1 + i sin 1) = 2.7182818284590 * (0.54030230586+0.8414709848j) + + ```python + x = tf.constant(1 + 1j) + tf.math.exp(x) ==> 1.4686939399158851+2.2873552871788423j + ``` + }]; + + let arguments = (ins + TF_FpOrComplexTensor:$x + ); + + let results = (outs + TF_FpOrComplexTensor:$y + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF_ExpandDimsOp : TF_Op<"ExpandDims", [NoSideEffect]> { let summary = "Inserts a dimension of 1 into a tensor's shape."; @@ -858,6 +1093,32 @@ def TF_FloorDivOp : TF_Op<"FloorDiv", [Broadcastable, NoSideEffect]>, TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_FloorModOp : TF_Op<"FloorMod", [Broadcastable, NoSideEffect]>, + WithBroadcastableBinOpBuilder { + let summary = [{ +Returns element-wise remainder of division. When `x < 0` xor `y < 0` is + }]; + + let description = [{ +true, this follows Python semantics in that the result here is consistent +with a flooring divide. E.g. `floor(x / y) * y + mod(x, y) = x`. + +*NOTE*: `FloorMod` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) + }]; + + let arguments = (ins + TF_FpOrI32OrI64Tensor:$x, + TF_FpOrI32OrI64Tensor:$y + ); + + let results = (outs + TF_FpOrI32OrI64Tensor:$z + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF_FusedBatchNormOp : TF_Op<"FusedBatchNorm", [NoSideEffect]> { let summary = "Batch normalization."; @@ -893,6 +1154,39 @@ The size of 1D Tensors matches the dimension C of the 4D Tensors. 
}]; } +def TF_FusedBatchNormV3Op : TF_Op<"FusedBatchNormV3", [NoSideEffect]> { + let summary = "Batch normalization."; + + let description = [{ +Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW". +The size of 1D Tensors matches the dimension C of the 4D Tensors. + }]; + + let arguments = (ins + TensorOf<[BF16, F16, F32]>:$x, + F32Tensor:$scale, + F32Tensor:$offset, + F32Tensor:$mean, + F32Tensor:$variance, + + DefaultValuedAttr:$epsilon, + DefaultValuedAttr:$data_format, + DefaultValuedAttr:$is_training + ); + + let results = (outs + TensorOf<[BF16, F16, F32]>:$y, + F32Tensor:$batch_mean, + F32Tensor:$batch_variance, + F32Tensor:$reserve_space_1, + F32Tensor:$reserve_space_2, + F32Tensor:$reserve_space_3 + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + TF_DerivedOperandTypeAttr U = TF_DerivedOperandTypeAttr<1>; +} + def TF_GatherOp : TF_Op<"Gather", [NoSideEffect]> { let summary = "Gather slices from `params` according to `indices`."; @@ -945,13 +1239,13 @@ Gather slices from `params` into a Tensor with shape specified by `indices`. }]; let description = [{ -`indices` is an K-dimensional integer tensor, best thought of as a +`indices` is a K-dimensional integer tensor, best thought of as a (K-1)-dimensional tensor of indices into `params`, where each element defines a slice of `params`: output[\\(i_0, ..., i_{K-2}\\)] = params[indices[\\(i_0, ..., i_{K-2}\\)]] -Whereas in `tf.gather` `indices` defines slices into the first +Whereas in `tf.gather` `indices` defines slices into the `axis` dimension of `params`, in `tf.gather_nd`, `indices` defines slices into the first `N` dimensions of `params`, where `N = indices.shape[-1]`. @@ -1224,10 +1518,10 @@ for dtype in dtype_list: input_tensor, bitwise_ops.invert(input_tensor)), bitwise_ops.invert( tf.constant(0, dtype=dtype))] - + expected = tf.constant([0, 0, 0, 0], dtype=tf.float32) tf.assert_equal(tf.cast(not_a_and_a, tf.float32), expected) - + expected = tf.cast([not_0] * 4, tf.float32) tf.assert_equal(tf.cast(not_a_or_a, tf.float32), expected) @@ -2402,6 +2696,29 @@ Input images can be of different types but output images are always float. TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_ResizeNearestNeighborOp : TF_Op<"ResizeNearestNeighbor", [NoSideEffect]> { + let summary = [{ +Resize `images` to `size` using nearest neighbor interpolation. + }]; + + let description = [{ + }]; + + let arguments = (ins + TensorOf<[F16, F32, F64, I16, I32, I64, I8]>:$images, + I32Tensor:$size, + + DefaultValuedAttr:$align_corners, + DefaultValuedAttr:$half_pixel_centers + ); + + let results = (outs + TensorOf<[F16, F32, F64, I16, I32, I64, I8]>:$resized_images + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF_ReverseSequenceOp : TF_Op<"ReverseSequence", [NoSideEffect]> { let summary = "Reverses variable length slices."; @@ -2543,6 +2860,27 @@ reverse(t, dims) ==> [[[[8, 9, 10, 11], TF_DerivedOperandTypeAttr Tidx = TF_DerivedOperandTypeAttr<1>; } +def TF_RoundOp : TF_Op<"Round", [NoSideEffect, SameOperandsAndResultType]> { + let summary = [{ +Rounds the values of a tensor to the nearest integer, element-wise. + }]; + + let description = [{ +Rounds half to even. Also known as bankers rounding. If you want to round +according to the current system rounding mode use std::cint. 
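+
+A minimal sketch of the tie-breaking behaviour, assuming the standard Python
+wrapper `tf.round`:
+
+  ```python
+  import tensorflow as tf
+  # Illustrative sketch: ties round to the nearest even value.
+  tf.round(tf.constant([0.5, 1.5, 2.5, -0.5]))  # ==> [0., 2., 2., -0.]
+  ```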
+ }]; + + let arguments = (ins + TensorOf<[BF16, F16, F32, F64, I32, I64, TF_Complex128, TF_Complex64]>:$x + ); + + let results = (outs + TensorOf<[BF16, F16, F32, F64, I32, I64, TF_Complex128, TF_Complex64]>:$y + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF_RsqrtOp : TF_Op<"Rsqrt", [NoSideEffect, SameOperandsAndResultType]> { let summary = "Computes reciprocal of square root of x element-wise."; @@ -2618,6 +2956,25 @@ select(condition, t, e) ==> [[1, 2], TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<1>; } +def TF_SelectV2Op : TF_Op<"SelectV2", [NoSideEffect]> { + let summary = ""; + + let description = [{ + }]; + + let arguments = (ins + I1Tensor:$condition, + TF_Tensor:$t, + TF_Tensor:$e + ); + + let results = (outs + TF_Tensor:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<1>; +} + def TF_ShapeOp : TF_Op<"Shape", [NoSideEffect]> { let summary = "Returns the shape of a tensor."; @@ -2650,6 +3007,31 @@ shape(t) ==> [2, 2, 3] let hasFolder = 1; } +def TF_ShapeNOp : TF_Op<"ShapeN", [NoSideEffect]> { + let summary = "Returns shape of tensors."; + + let description = [{ +This operation returns N 1-D integer tensors representing shape of `input[i]s`. + }]; + + let arguments = (ins + Variadic:$input, + + Confined]>:$N + ); + + let results = (outs + Variadic:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + TF_DerivedResultTypeAttr out_type = TF_DerivedResultTypeAttr<0>; + + let verifier = [{ + return Verify(*this); + }]; +} + def TF_SigmoidOp : TF_Op<"Sigmoid", [NoSideEffect, SameOperandsAndResultType]> { let summary = "Computes sigmoid of `x` element-wise."; @@ -2719,6 +3101,23 @@ whose values are extracted from 'input' starting at the offsets in TF_DerivedOperandTypeAttr Index = TF_DerivedOperandTypeAttr<1>; } +def TF_SnapshotOp : TF_Op<"Snapshot", [NoSideEffect, SameOperandsAndResultType]> { + let summary = "Returns a copy of the input tensor."; + + let description = [{ + }]; + + let arguments = (ins + TF_Tensor:$input + ); + + let results = (outs + TF_Tensor:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF_SoftmaxOp : TF_Op<"Softmax", [NoSideEffect]> { let summary = "Computes softmax activations."; @@ -2772,6 +3171,151 @@ precise description. TF_DerivedOperandTypeAttr Tblock_shape = TF_DerivedOperandTypeAttr<1>; } +def TF_SpaceToDepthOp : TF_Op<"SpaceToDepth", [NoSideEffect]> { + let summary = "SpaceToDepth for tensors of type T."; + + let description = [{ +Rearranges blocks of spatial data, into depth. More specifically, +this op outputs a copy of the input tensor where values from the `height` +and `width` dimensions are moved to the `depth` dimension. +The attr `block_size` indicates the input block size. + + * Non-overlapping blocks of size `block_size x block size` are rearranged + into depth at each location. + * The depth of the output tensor is `block_size * block_size * input_depth`. + * The Y, X coordinates within each block of the input become the high order + component of the output channel index. + * The input tensor's height and width must be divisible by block_size. + +The `data_format` attr specifies the layout of the input and output tensors +with the following options: + "NHWC": `[ batch, height, width, channels ]` + "NCHW": `[ batch, channels, height, width ]` + "NCHW_VECT_C": + `qint8 [ batch, channels / 4, height, width, 4 ]` + +It is useful to consider the operation as transforming a 6-D Tensor. +e.g. 
for data_format = NHWC, + Each element in the input tensor can be specified via 6 coordinates, + ordered by decreasing memory layout significance as: + n,oY,bY,oX,bX,iC (where n=batch index, oX, oY means X or Y coordinates + within the output image, bX, bY means coordinates + within the input block, iC means input channels). + The output would be a transpose to the following layout: + n,oY,oX,bY,bX,iC + +This operation is useful for resizing the activations between convolutions +(but keeping all data), e.g. instead of pooling. It is also useful for training +purely convolutional models. + +For example, given an input of shape `[1, 2, 2, 1]`, data_format = "NHWC" and +block_size = 2: + +``` +x = [[[[1], [2]], + [[3], [4]]]] +``` + +This operation will output a tensor of shape `[1, 1, 1, 4]`: + +``` +[[[[1, 2, 3, 4]]]] +``` + +Here, the input has a batch of 1 and each batch element has shape `[2, 2, 1]`, +the corresponding output will have a single element (i.e. width and height are +both 1) and will have a depth of 4 channels (1 * block_size * block_size). +The output element shape is `[1, 1, 4]`. + +For an input tensor with larger depth, here of shape `[1, 2, 2, 3]`, e.g. + +``` +x = [[[[1, 2, 3], [4, 5, 6]], + [[7, 8, 9], [10, 11, 12]]]] +``` + +This operation, for block_size of 2, will return the following tensor of shape +`[1, 1, 1, 12]` + +``` +[[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]] +``` + +Similarly, for the following input of shape `[1 4 4 1]`, and a block size of 2: + +``` +x = [[[[1], [2], [5], [6]], + [[3], [4], [7], [8]], + [[9], [10], [13], [14]], + [[11], [12], [15], [16]]]] +``` + +the operator will return the following tensor of shape `[1 2 2 4]`: + +``` +x = [[[[1, 2, 3, 4], + [5, 6, 7, 8]], + [[9, 10, 11, 12], + [13, 14, 15, 16]]]] +``` + }]; + + let arguments = (ins + TF_Tensor:$input, + + Confined]>:$block_size, + DefaultValuedAttr, "NHWC">:$data_format + ); + + let results = (outs + TF_Tensor:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + +def TF_SparseToDenseOp : TF_Op<"SparseToDense", [NoSideEffect]> { + let summary = "Converts a sparse representation into a dense tensor."; + + let description = [{ +Builds an array `dense` with shape `output_shape` such that + +``` +# If sparse_indices is scalar +dense[i] = (i == sparse_indices ? sparse_values : default_value) + +# If sparse_indices is a vector, then for each i +dense[sparse_indices[i]] = sparse_values[i] + +# If sparse_indices is an n by d matrix, then for each i in [0, n) +dense[sparse_indices[i][0], ..., sparse_indices[i][d-1]] = sparse_values[i] +``` + +All other values in `dense` are set to `default_value`. If `sparse_values` is a +scalar, all sparse indices are set to this single value. + +Indices should be sorted in lexicographic order, and indices must not +contain any repeats. If `validate_indices` is true, these properties +are checked during execution. 
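+
+A minimal sketch, assuming the raw op binding `tf.raw_ops.SparseToDense` is
+available:
+
+  ```python
+  import tensorflow as tf
+  # Illustrative sketch: place 7 at index 0 and 9 at index 2 of a length-4
+  # vector, filling the rest with the default value.
+  tf.raw_ops.SparseToDense(sparse_indices=[[0], [2]], output_shape=[4],
+                           sparse_values=[7, 9], default_value=0)
+  # ==> [7, 0, 9, 0]
+  ```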
+ }]; + + let arguments = (ins + TF_I32OrI64Tensor:$sparse_indices, + TF_I32OrI64Tensor:$output_shape, + TF_Tensor:$sparse_values, + TF_Tensor:$default_value, + + DefaultValuedAttr:$validate_indices + ); + + let results = (outs + TF_Tensor:$dense + ); + + TF_DerivedOperandTypeAttr Tindices = TF_DerivedOperandTypeAttr<0>; + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<2>; +} + def TF_SplitOp : TF_Op<"Split", [NoSideEffect]> { let summary = "Splits a tensor into `num_split` tensors along one dimension."; @@ -2910,6 +3454,42 @@ shape(squeeze(t, [2, 4])) ==> [1, 2, 3, 1] TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_StopGradientOp : TF_Op<"StopGradient", [NoSideEffect, SameOperandsAndResultType]> { + let summary = "Stops gradient computation."; + + let description = [{ +When executed in a graph, this op outputs its input tensor as-is. + +When building ops to compute gradients, this op prevents the contribution of +its inputs to be taken into account. Normally, the gradient generator adds ops +to a graph to compute the derivatives of a specified 'loss' by recursively +finding out inputs that contributed to its computation. If you insert this op +in the graph it inputs are masked from the gradient generator. They are not +taken into account for computing gradients. + +This is useful any time you want to compute a value with TensorFlow but need +to pretend that the value was a constant. Some examples include: + +* The *EM* algorithm where the *M-step* should not involve backpropagation + through the output of the *E-step*. +* Contrastive divergence training of Boltzmann machines where, when + differentiating the energy function, the training must not backpropagate + through the graph that generated the samples from the model. +* Adversarial training, where no backprop should happen through the adversarial + example generation process. + }]; + + let arguments = (ins + TF_Tensor:$input + ); + + let results = (outs + TF_Tensor:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF_StridedSliceOp : TF_Op<"StridedSlice", [NoSideEffect]> { let summary = "Return a strided slice from `input`."; @@ -3143,6 +3723,31 @@ def TF_TensorListGetItemOp : TF_Op<"TensorListGetItem", [NoSideEffect]> { TF_DerivedResultTypeAttr element_dtype = TF_DerivedResultTypeAttr<0>; } +def TF_TensorListPushBackOp : TF_Op<"TensorListPushBack", [NoSideEffect]> { + let summary = [{ +Returns a list which has the passed-in `Tensor` as last element and the other elements of the given list in `input_handle`. + }]; + + let description = [{ +tensor: The tensor to put on the list. +input_handle: The old list. +output_handle: A list with the elements of the old list followed by tensor. +element_dtype: the type of elements in the list. +element_shape: a shape compatible with that of elements in the list. + }]; + + let arguments = (ins + TF_VariantTensor:$input_handle, + TF_Tensor:$tensor + ); + + let results = (outs + TF_VariantTensor:$output_handle + ); + + TF_DerivedOperandTypeAttr element_dtype = TF_DerivedOperandTypeAttr<1>; +} + def TF_TensorListSetItemOp : TF_Op<"TensorListSetItem", [NoSideEffect]> { let summary = ""; @@ -3187,6 +3792,30 @@ num_elements: optional. If not -1, the number of elements in the list. 
TF_DerivedResultTypeAttr element_dtype = TF_DerivedResultTypeAttr<0>; } +def TF_TileOp : TF_Op<"Tile", [NoSideEffect]> { + let summary = "Constructs a tensor by tiling a given tensor."; + + let description = [{ +This operation creates a new tensor by replicating `input` `multiples` times. +The output tensor's i'th dimension has `input.dims(i) * multiples[i]` elements, +and the values of `input` are replicated `multiples[i]` times along the 'i'th +dimension. For example, tiling `[a b c d]` by `[2]` produces +`[a b c d a b c d]`. + }]; + + let arguments = (ins + TF_Tensor:$input, + TF_I32OrI64Tensor:$multiples + ); + + let results = (outs + TF_Tensor:$output + ); + + TF_DerivedOperandTypeAttr Tmultiples = TF_DerivedOperandTypeAttr<1>; + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF_TopKV2Op : TF_Op<"TopKV2", [NoSideEffect]> { let summary = [{ Finds values and indices of the `k` largest elements for the last dimension. @@ -3346,6 +3975,82 @@ This is the opposite of `pack`. TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_WhereOp : TF_Op<"Where", [NoSideEffect]> { + let summary = "Returns locations of nonzero / true values in a tensor."; + + let description = [{ +This operation returns the coordinates of true elements in `condition`. The +coordinates are returned in a 2-D tensor where the first dimension (rows) +represents the number of true elements, and the second dimension (columns) +represents the coordinates of the true elements. Keep in mind, the shape of +the output tensor can vary depending on how many true values there are in +`condition`. Indices are output in row-major order. + +For example: + +``` +# 'input' tensor is [[True, False] +# [True, False]] +# 'input' has two true values, so output has two coordinates. +# 'input' has rank of 2, so coordinates have two indices. +where(input) ==> [[0, 0], + [1, 0]] + +# `condition` tensor is [[[True, False] +# [True, False]] +# [[False, True] +# [False, True]] +# [[False, False] +# [False, True]]] +# 'input' has 5 true values, so output has 5 coordinates. +# 'input' has rank of 3, so coordinates have three indices. +where(input) ==> [[0, 0, 0], + [0, 1, 0], + [1, 0, 1], + [1, 1, 1], + [2, 1, 1]] + +# `condition` tensor is [[[1.5, 0.0] +# [-0.5, 0.0]] +# [[0.0, 0.25] +# [0.0, 0.75]] +# [[0.0, 0.0] +# [0.0, 0.01]]] +# 'input' has 5 nonzero values, so output has 5 coordinates. +# 'input' has rank of 3, so coordinates have three indices. +where(input) ==> [[0, 0, 0], + [0, 1, 0], + [1, 0, 1], + [1, 1, 1], + [2, 1, 1]] + +# `condition` tensor is [[[1.5 + 0.0j, 0.0 + 0.0j] +# [0.0 + 0.5j, 0.0 + 0.0j]] +# [[0.0 + 0.0j, 0.25 + 1.5j] +# [0.0 + 0.0j, 0.75 + 0.0j]] +# [[0.0 + 0.0j, 0.0 + 0.0j] +# [0.0 + 0.0j, 0.01 + 0.0j]]] +# 'input' has 5 nonzero magnitude values, so output has 5 coordinates. +# 'input' has rank of 3, so coordinates have three indices. 
+where(input) ==> [[0, 0, 0], + [0, 1, 0], + [1, 0, 1], + [1, 1, 1], + [2, 1, 1]] +``` + }]; + + let arguments = (ins + TensorOf<[BF16, F16, F32, F64, I1, I16, I32, I64, I8, TF_Complex128, TF_Complex64]>:$input + ); + + let results = (outs + I64Tensor:$index + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF_XdivyOp : TF_Op<"Xdivy", [Broadcastable, NoSideEffect]>, WithBroadcastableBinOpBuilder { let summary = "Returns 0 if x == 0, and x / y otherwise, elementwise."; diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td index f374b6b0b77..080e78042a7 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td @@ -16,7 +16,8 @@ limitations under the License. // This is the base operation definition file for TensorFlow. // // This file includes the definition for the TensorFlow dialect, base TensorFlow -// op, and various commonly used TensorFlow types, attributes, and builders. +// op, and various commonly used TensorFlow traits, types, attributes, and +// builders. #ifdef TF_OP_BASE #else @@ -50,6 +51,16 @@ TODO: Make invariants more structured so that we can reference them in ops. let cppNamespace = "TF"; } +//===----------------------------------------------------------------------===// +// TensorFlow traits +//===----------------------------------------------------------------------===// + +// Specify this trait if the op requires all outputs to have the same type and +// the inputs either have the same type as result or a ref type corresponding to +// the result type. +def TF_OperandsSameAsResultsTypeOrRef : NativeOpTrait< + "TF::OperandsSameAsResultsTypeOrRef">; + //===----------------------------------------------------------------------===// // TensorFlow op definitions //===----------------------------------------------------------------------===// @@ -65,6 +76,12 @@ class TF_Op traits = []> : def TF_TFDialectType : Type()">, "TensorFlow type">; +// Class for any TensorFlow dialect specific type +class TF_TensorFlowType : + Type()">, + "TensorFlow " # description # " type">, + BuildableType<"getType()">; + // Any tensor element type allowed in TensorFlow ops def TF_ElementType : Type, @@ -80,11 +97,34 @@ def TF_I32Or64 : IntOfWidths<[32, 64]>; def TF_I32OrI64Tensor : TensorOf<[TF_I32Or64]>; -def TF_Int : IntOfWidths<[8, 16, 32, 64]>; +def TF_Uint8 : TF_TensorFlowType<"Uint8", "uint8">; +def TF_Uint16 : TF_TensorFlowType<"Uint16", "uint16">; +def TF_Uint32 : TF_TensorFlowType<"Uint32", "uint32">; +def TF_Uint64 : TF_TensorFlowType<"Uint64", "uint64">; + +// Any unsigned integer type +def TF_UInt : AnyTypeOf<[TF_Uint8, TF_Uint16, TF_Uint32, TF_Uint64]>; + +// Any signed integer type +def TF_SInt : IntOfWidths<[8, 16, 32, 64]>; + +// Any integer type +def TF_Int : AnyTypeOf<[TF_SInt, TF_UInt]>; // Any integer tensor types def TF_IntTensor : TensorOf<[TF_Int]>; +//===----------------------------------------------------------------------===// +// Quantized types +def TF_Qint8 : TF_TensorFlowType<"Qint8", "qint8">; +def TF_Qint16 : TF_TensorFlowType<"Qint16", "qint16">; +def TF_Qint32 : TF_TensorFlowType<"Qint32", "qint32">; +def TF_Quint8 : TF_TensorFlowType<"Quint8", "quint8">; +def TF_Quint16 : TF_TensorFlowType<"Quint16", "quint16">; + +// Any quantized type +def TF_AnyQuantized : AnyTypeOf<[TF_Qint8, TF_Qint16, TF_Qint32, TF_Quint8, + TF_Quint16]>; 
//===----------------------------------------------------------------------===// // Floating-point types @@ -98,12 +138,10 @@ def TF_FpTensor : TensorOf<[AnyFloat]>; //===----------------------------------------------------------------------===// // Complex types -def TF_Complex64 : - Type()">, "complex64 type">; +def TF_Complex64 : TF_TensorFlowType<"Complex64", "complex64">; def TF_Complex64Tensor : TensorOf<[TF_Complex64]>; -def TF_Complex128 : - Type()">, "complex128 type">; +def TF_Complex128 : TF_TensorFlowType<"Complex128", "complex128">; def TF_Complex128Tensor : TensorOf<[TF_Complex128]>; def TF_AnyComplex : AnyTypeOf<[TF_Complex64, TF_Complex128], @@ -114,19 +152,13 @@ def TF_ComplexTensor : TensorOf<[TF_AnyComplex]>; //===----------------------------------------------------------------------===// // String/variant/resource types -def TF_Str : Type()">, - "TensorFlow string type">, - BuildableType<"getType()">; +def TF_Str : TF_TensorFlowType<"String", "string">; def TF_StrTensor : TensorOf<[TF_Str]>; -def TF_Variant : Type()">, - "TensorFlow variant type">, - BuildableType<"getType()">; +def TF_Variant : TF_TensorFlowType<"Variant", "variant">; def TF_VariantTensor : TensorOf<[TF_Variant]>; -def TF_Resource : Type()">, - "TensorFlow variant type">, - BuildableType<"getType()">; +def TF_Resource : TF_TensorFlowType<"Resource", "resource">; def TF_ResourceTensor : TensorOf<[TF_Resource]>; //===----------------------------------------------------------------------===// @@ -141,7 +173,8 @@ def TF_IntOrFpTensor : TensorOf<[TF_Int, AnyFloat]>; def TF_FpOrComplexTensor : TensorOf<[AnyFloat, TF_AnyComplex]>; -def TF_AnyNumber : AnyTypeOf<[TF_Int, AnyFloat, TF_AnyComplex], "number">; +def TF_AnyNumber : AnyTypeOf<[TF_Int, AnyFloat, TF_AnyQuantized, TF_AnyComplex], + "number">; def TF_NumberTensor : TensorOf<[TF_AnyNumber]>; diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc index e39a6768ea4..587849c6a95 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc @@ -16,11 +16,16 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include +#include +#include +#include +#include "llvm/ADT/Sequence.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Support/FormatVariadic.h" +#include "mlir/Dialect/StandardOps/Ops.h" // TF:local_config_mlir #include "mlir/IR/Attributes.h" // TF:local_config_mlir #include "mlir/IR/Builders.h" // TF:local_config_mlir #include "mlir/IR/Diagnostics.h" // TF:local_config_mlir @@ -29,23 +34,19 @@ limitations under the License. 
#include "mlir/IR/Matchers.h" // TF:local_config_mlir #include "mlir/IR/OpImplementation.h" // TF:local_config_mlir #include "mlir/IR/PatternMatch.h" // TF:local_config_mlir +#include "mlir/IR/TypeUtilities.h" // TF:local_config_mlir #include "mlir/IR/Types.h" // TF:local_config_mlir #include "mlir/IR/Value.h" // TF:local_config_mlir #include "mlir/Parser.h" // TF:local_config_mlir -#include "mlir/StandardOps/Ops.h" // TF:local_config_mlir #include "mlir/Support/LLVM.h" // TF:local_config_mlir +#include "mlir/Support/LogicalResult.h" // TF:local_config_mlir #include "mlir/Support/STLExtras.h" // TF:local_config_mlir -#include "mlir/Support/TypeUtilities.h" // TF:local_config_mlir #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" #include "tensorflow/core/platform/logging.h" namespace mlir { namespace TF { -namespace { -#include "tensorflow/compiler/mlir/tensorflow/transforms/generated_canonicalize.inc" -} // namespace - //===----------------------------------------------------------------------===// // TF op helper functions //===----------------------------------------------------------------------===// @@ -75,10 +76,11 @@ static inline bool HasRankAtLeast(Value *value, int64_t rank) { return ranked_type.getRank() >= rank; return type.isa(); } + // Returns true if the given pair of TensorFlow types can be cast to one // another. In other words, a single run-time value is legal for both the types. // For example, tensor<*xf32> and tensor<3xf32> are cast compatible. -bool AreCastCompatible(Type a, Type b) { +static bool AreCastCompatible(Type a, Type b) { if (TensorCastOp::areCastCompatible(a, b)) return true; // Variant types may optionally contain subtypes information that need not @@ -89,13 +91,21 @@ bool AreCastCompatible(Type a, Type b) { getElementTypeOrSelf(b).getKind() == TensorFlowTypes::VARIANT; } +static bool IsUnknownDimOrRank(int64_t dim_or_rank) { + return dim_or_rank == -1; +} + +namespace { +#include "tensorflow/compiler/mlir/tensorflow/transforms/generated_canonicalize.inc" +} // namespace + //===----------------------------------------------------------------------===// // AddOp //===----------------------------------------------------------------------===// void AddOp::getCanonicalizationPatterns(OwningRewritePatternList &results, MLIRContext *context) { - RewriteListBuilder::build(results, context); + results.insert(context); } //===----------------------------------------------------------------------===// @@ -104,7 +114,36 @@ void AddOp::getCanonicalizationPatterns(OwningRewritePatternList &results, void AddV2Op::getCanonicalizationPatterns(OwningRewritePatternList &results, MLIRContext *context) { - RewriteListBuilder::build(results, context); + results.insert(context); +} + +//===----------------------------------------------------------------------===// +// AssertOp +//===----------------------------------------------------------------------===// + +namespace { + +// Removes Assert with constant true predicate. 
+struct AssertWithTrue : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + PatternMatchResult matchAndRewrite(AssertOp op, + PatternRewriter &rewriter) const override { + ElementsAttr cst; + if (matchPattern(op.condition(), m_Constant(&cst))) { + if (cst.getValue({}).getValue()) { + rewriter.replaceOp(op, llvm::None); + return matchSuccess(); + } + } + return matchFailure(); + } +}; +} // namespace + +void AssertOp::getCanonicalizationPatterns(OwningRewritePatternList &results, + MLIRContext *context) { + results.insert(context); } //===----------------------------------------------------------------------===// @@ -113,7 +152,7 @@ void AddV2Op::getCanonicalizationPatterns(OwningRewritePatternList &results, void BitcastOp::getCanonicalizationPatterns(OwningRewritePatternList &results, MLIRContext *context) { - RewriteListBuilder::build(results, context); + results.insert(context); } //===----------------------------------------------------------------------===// @@ -134,7 +173,7 @@ static LogicalResult Verify(BroadcastToOp op) { void CastOp::getCanonicalizationPatterns(OwningRewritePatternList &results, MLIRContext *context) { - RewriteListBuilder::build(results, context); + results.insert(context); } //===----------------------------------------------------------------------===// @@ -143,7 +182,7 @@ void CastOp::getCanonicalizationPatterns(OwningRewritePatternList &results, void ConjOp::getCanonicalizationPatterns(OwningRewritePatternList &results, MLIRContext *context) { - RewriteListBuilder::build(results, context); + results.insert(context); } //===----------------------------------------------------------------------===// @@ -199,7 +238,23 @@ void ConstOp::build(Builder *builder, OperationState *result, Type type, void DivOp::getCanonicalizationPatterns(OwningRewritePatternList &results, MLIRContext *context) { - RewriteListBuilder::build(results, context); + results.insert(context); +} + +//===----------------------------------------------------------------------===// +// EmptyTensorListOp +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(EmptyTensorListOp op) { + if (!IsOfRankOrUnranked(op.element_shape(), 0) && + !IsOfRankOrUnranked(op.element_shape(), 1)) { + return op.emitOpError("requires element_shape operand to be 0D/1D tensor"); + } + + if (!IsOfRankOrUnranked(op.max_num_elements(), 0)) { + return op.emitOpError("requires max_num_elements operand to be 0D tensor"); + } + return success(); } //===----------------------------------------------------------------------===// @@ -282,45 +337,39 @@ static LogicalResult Verify(FusedBatchNormOp op) { // IfOp //===----------------------------------------------------------------------===// -LogicalResult IfOp::verify() { - auto thenAttr = getAttrOfType("then_branch"); - if (!thenAttr) return emitOpError("requires then_branch attribute"); - - auto elseAttr = getAttrOfType("else_branch"); - if (!elseAttr) return emitOpError("requires else_branch attribute"); - - auto module = getParentOfType(); - auto thenFn = module.lookupSymbol(thenAttr.getValue()); +static LogicalResult Verify(IfOp op) { + auto module = op.getParentOfType(); + auto thenFn = module.lookupSymbol(op.then_branch()); if (!thenFn) - return emitOpError("then_branch refers to an undefined function : ") - << thenAttr; - auto elseFn = module.lookupSymbol(elseAttr.getValue()); + return op.emitOpError("then_branch refers to an undefined function : ") + << op.then_branch(); + auto elseFn = 
module.lookupSymbol(op.else_branch()); if (!elseFn) - return emitOpError("else_branch refers to an undefined function : ") - << elseAttr; + return op.emitOpError("else_branch refers to an undefined function : ") + << op.else_branch(); auto thenFuncType = thenFn.getType(); auto elseFuncType = elseFn.getType(); // Non-conditional operands starting with the second operand are passed to // branches and should be pair-wise compatible with branches' inputs. - unsigned expectedNumInputs = getNumOperands() - 1; + unsigned expectedNumInputs = op.getNumOperands() - 1; if (thenFuncType.getNumInputs() != expectedNumInputs || elseFuncType.getNumInputs() != expectedNumInputs) - return emitError("branches should have " + Twine(expectedNumInputs) + - " inputs"); + return op.emitError("branches should have " + Twine(expectedNumInputs) + + " inputs"); for (unsigned i = 0; i < expectedNumInputs; ++i) { - auto operandType = getOperand(i + 1)->getType().cast(); + auto operandType = op.getOperand(i + 1)->getType().cast(); auto thenInputType = thenFuncType.getInput(i).cast(); if (!AreCastCompatible(operandType, thenInputType)) - return emitError( + return op.emitError( llvm::formatv("then branch input type {0} is incompatible with " "operand type {1} at index {2}", thenInputType, operandType, i)); auto elseInputType = elseFuncType.getInput(i).cast(); if (!AreCastCompatible(operandType, elseInputType)) - return emitError( + return op.emitError( llvm::formatv("else branch input type {0} is incompatible with " "operand type {1} at index {2}", elseInputType, operandType, i)); @@ -328,30 +377,30 @@ LogicalResult IfOp::verify() { // If branches have incompatible input types that means that no tensor can // serve as input to both the functions. Hence, the op is invalid. if (!AreCastCompatible(thenInputType, elseInputType)) - return emitError(llvm::formatv( + return op.emitError(llvm::formatv( "branches inputs have incompatible types {0} and {1} at index {2}", thenInputType, elseInputType, i)); } // Branches' results should be pair-wise compatible with the op results. 
- unsigned expectedNumResults = getNumResults(); + unsigned expectedNumResults = op.getNumResults(); if (thenFuncType.getNumResults() != expectedNumResults || elseFuncType.getNumResults() != expectedNumResults) - return emitError("branches should have " + Twine(expectedNumResults) + - " results"); + return op.emitError("branches should have " + Twine(expectedNumResults) + + " results"); for (unsigned i = 0; i < expectedNumResults; ++i) { - auto resultType = getResult(i)->getType().cast(); + auto resultType = op.getResult(i)->getType().cast(); auto thenResultType = thenFuncType.getResult(i).cast(); if (!AreCastCompatible(thenResultType, resultType)) - return emitError( + return op.emitError( llvm::formatv("then branch result type {0} is incompatible with op " "result type {1} at index {2}", thenResultType, resultType, i)); auto elseResultType = elseFuncType.getResult(i).cast(); if (!AreCastCompatible(elseResultType, resultType)) - return emitError( + return op.emitError( llvm::formatv("else branch result type {0} is incompatible with op " "result type {1} at index {2}", elseResultType, resultType, i)); @@ -365,7 +414,7 @@ LogicalResult IfOp::verify() { void InvertOp::getCanonicalizationPatterns(OwningRewritePatternList &results, MLIRContext *context) { - RewriteListBuilder::build(results, context); + results.insert(context); } //===----------------------------------------------------------------------===// @@ -399,7 +448,7 @@ OpFoldResult LeakyReluOp::fold(ArrayRef operands) { void LogOp::getCanonicalizationPatterns(OwningRewritePatternList &results, MLIRContext *context) { - RewriteListBuilder::build(results, context); + results.insert(context); } //===----------------------------------------------------------------------===// @@ -408,10 +457,9 @@ void LogOp::getCanonicalizationPatterns(OwningRewritePatternList &results, void LogicalNotOp::getCanonicalizationPatterns( OwningRewritePatternList &results, MLIRContext *context) { - RewriteListBuilder::build(results, - context); + results.insert(context); } //===----------------------------------------------------------------------===// @@ -420,7 +468,7 @@ void LogicalNotOp::getCanonicalizationPatterns( void NegOp::getCanonicalizationPatterns(OwningRewritePatternList &results, MLIRContext *context) { - RewriteListBuilder::build(results, context); + results.insert(context); } //===----------------------------------------------------------------------===// @@ -429,7 +477,7 @@ void NegOp::getCanonicalizationPatterns(OwningRewritePatternList &results, void ReciprocalOp::getCanonicalizationPatterns( OwningRewritePatternList &results, MLIRContext *context) { - RewriteListBuilder::build(results, context); + results.insert(context); } //===----------------------------------------------------------------------===// @@ -488,7 +536,7 @@ void RankOp::build(Builder *builder, OperationState *result, Value *input) { void RealDivOp::getCanonicalizationPatterns(OwningRewritePatternList &results, MLIRContext *context) { - RewriteListBuilder::build(results, context); + results.insert(context); } //===----------------------------------------------------------------------===// @@ -499,12 +547,13 @@ void RealDivOp::getCanonicalizationPatterns(OwningRewritePatternList &results, // m_Constant. 
static LogicalResult Verify(ReshapeOp op) { auto shapeType = op.shape()->getType().cast(); + if (!shapeType.hasRank()) return success(); if (shapeType.getRank() != 1) return op.emitOpError("shape must be 1D tensor"); auto rankByShape = shapeType.getShape()[0]; auto typeOfTensor = op.tensor()->getType().cast(); // No compile time verification for unknown sized shape. - if (rankByShape == -1 || !typeOfTensor.hasRank()) return success(); + if (rankByShape == -1 || !typeOfTensor.hasStaticShape()) return success(); // Check values if constant shape. No compiling time verification for // non-constant shape. auto *shapeOp = op.shape()->getDefiningOp(); @@ -529,7 +578,7 @@ static LogicalResult Verify(ReshapeOp op) { unsigned numByShape = 1; unsigned unknownDimCount = 0; for (int i = 0, e = rankByShape; i != e; ++i) { - auto num = shapeCstAttr.getValue(i).cast().getInt(); + auto num = shapeCstAttr.getValue(i).getInt(); // The dimension size value can be -1, and that the real size needs to // be computed so that the total size remains constant. At most one // component of shape can be -1. @@ -561,53 +610,105 @@ static LogicalResult Verify(ReshapeOp op) { void ReshapeOp::build(Builder *builder, OperationState *result, Value *tensor, Value *shape) { - auto etype = tensor->getType().cast().getElementType(); + auto ttype = tensor->getType().cast(); + auto etype = ttype.getElementType(); + + auto unranked = [builder, etype, result, shape, tensor]() { + return ReshapeOp::build(builder, result, builder->getTensorType(etype), + tensor, shape); + }; + + // If tensor is unranked then we have no info about output of shape. + if (!ttype.hasRank()) return unranked(); + DenseIntElementsAttr attr_shape; if (matchPattern(shape, m_Constant(&attr_shape))) { llvm::SmallVector const_shape; - if (attr_shape.isSplat()) { - const_shape.assign(attr_shape.getType().getNumElements(), - (*attr_shape.begin()).getSExtValue()); - } else { - const_shape.reserve(attr_shape.getType().getNumElements()); - for (auto dim : attr_shape) const_shape.push_back(dim.getSExtValue()); + const_shape.reserve(attr_shape.getNumElements()); + + // Detect if reshape output shape is folded. + bool flatten = false; + int unknown_index = -1; + // The product of constant shape argument excluding unknown dimension. + int64_t product_cshape = 1; + for (auto e : llvm::enumerate(attr_shape)) { + int64_t val = e.value().getSExtValue(); + if (IsUnknownDimOrRank(val)) { + if (flatten) { + mlir::emitError(result->location) + << "only one unknown dimension allowed"; + return; + } + flatten = true; + unknown_index = e.index(); + } else { + product_cshape *= val; + } + const_shape.push_back(val); + } + + // Compute the value of the uknown dimension. + if (flatten) { + // Compute number of elements in tensor shape. + auto tshape = ttype.getShape(); + int64_t product_tshape = std::accumulate(tshape.begin(), tshape.end(), 1, + std::multiplies()); + // Set the unknown dimension such that total number of elements remain + // constant. + // Note: The case where the ratio is not integral, and so the total size + // of reshape not constant, is checked in verify function. 
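The assignment that follows performs exactly the division described in the comment above: the single `-1` entry is replaced so that the total element count is preserved. A hedged standalone rendering of that arithmetic (plain C++, no MLIR types; `foldReshapeShape` is an invented name):

```cpp
// Sketch: given a fully static input shape and a requested shape with at most
// one -1 entry, replace the -1 with total_elements / product_of_known_dims,
// mirroring the computation in ReshapeOp::build. The non-integral case is left
// to the verifier, as the original comment notes.
#include <cstdint>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

std::vector<int64_t> foldReshapeShape(const std::vector<int64_t> &input_shape,
                                      std::vector<int64_t> requested) {
  int64_t total = std::accumulate(input_shape.begin(), input_shape.end(),
                                  int64_t{1}, std::multiplies<int64_t>());
  int64_t known_product = 1;
  int unknown_index = -1;
  for (int i = 0, e = static_cast<int>(requested.size()); i != e; ++i) {
    if (requested[i] == -1)
      unknown_index = i;  // at most one -1 is allowed
    else
      known_product *= requested[i];
  }
  if (unknown_index >= 0) requested[unknown_index] = total / known_product;
  return requested;
}

int main() {
  // Reshaping a 2x3x4 tensor to {4, -1} yields {4, 6}.
  for (int64_t d : foldReshapeShape({2, 3, 4}, {4, -1})) std::cout << d << " ";
  std::cout << "\n";
  return 0;
}
```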
+ const_shape[unknown_index] = product_tshape / product_cshape; } return ReshapeOp::build(builder, result, builder->getTensorType(const_shape, etype), tensor, shape); } - return ReshapeOp::build(builder, result, builder->getTensorType(etype), - tensor, shape); + return unranked(); } //===----------------------------------------------------------------------===// // ShapeOp //===----------------------------------------------------------------------===// -static LogicalResult Verify(ShapeOp op) { - auto inputType = op.input()->getType(); - auto resultType = op.getType().dyn_cast(); - if (!resultType || resultType.getShape().size() != 1) - return op.emitOpError("requires 1D result type"); +namespace { +// Validates Shape/ShapeN operand and associated result types. +LogicalResult VerifyShapeOperandAndResult(Operation *op, Type operand_type, + Type result_type, + int variadic_idx = -1) { + std::string variadic_idx_str = + variadic_idx < 0 ? "" : llvm::formatv(" #{0}", variadic_idx).str(); - auto rankedTensorType = inputType.dyn_cast(); - if (rankedTensorType) { + auto result_ranked_type = result_type.dyn_cast(); + if (!result_ranked_type || result_ranked_type.getShape().size() != 1) + return op->emitOpError("requires 1D type for result") << variadic_idx_str; + + auto operand_ranked_type = operand_type.dyn_cast(); + if (operand_ranked_type) { // The operand is a ranked tensor. - if (resultType.hasStaticShape()) { - if ((!rankedTensorType.getShape().empty() && - resultType.getDimSize(0) != rankedTensorType.getShape().size())) - return op.emitOpError( - "requires dimension size of result to match rank of operand"); - } - } else { + if (result_ranked_type.hasStaticShape() && + !operand_ranked_type.getShape().empty() && + result_ranked_type.getDimSize(0) != + operand_ranked_type.getShape().size()) + return op->emitOpError("requires dimension size of result") + << variadic_idx_str << " to match rank of operand" + << variadic_idx_str; + } else if (result_ranked_type.hasStaticShape()) { // The operand is an unranked tensor, verify that the result is dynamic. 
- if (resultType.hasStaticShape()) - return op.emitOpError("requires dynamic shape result for unranked input"); + return op->emitOpError("requires dynamic shape result") + << variadic_idx_str << " for unranked operand" << variadic_idx_str; } - Type elt = op.getType().cast().getElementType(); - if (elt.isInteger(32) || elt.isInteger(64)) return success(); - return op.emitOpError("requires int32 or int64 return type"); + Type element_type = result_ranked_type.getElementType(); + if (!element_type.isInteger(32) && !element_type.isInteger(64)) + return op->emitOpError("requires int32 or int64 return type for result") + << variadic_idx_str; + + return success(); +} +} // anonymous namespace + +static LogicalResult Verify(ShapeOp op) { + return VerifyShapeOperandAndResult(op, op.input()->getType(), op.getType()); } OpFoldResult ShapeOp::fold(ArrayRef operands) { @@ -630,6 +731,30 @@ OpFoldResult ShapeOp::fold(ArrayRef operands) { return b.getDenseElementsAttr(resultType, dimensions); } +//===----------------------------------------------------------------------===// +// ShapeNOp +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(ShapeNOp op) { + const uint64_t n_attr = op.N().getZExtValue(); + + if (op.getNumOperands() != n_attr) + return op.emitOpError() << "requires " << n_attr << " operand(s), got " + << op.getNumOperands() << " operand(s)"; + + if (op.getNumResults() != n_attr) + return op.emitOpError() << "requires " << n_attr << " result(s), got " + << op.getNumResults() << " result(s)"; + + for (auto i : llvm::seq(0, n_attr)) { + auto verification = VerifyShapeOperandAndResult( + op, op.getOperand(i)->getType(), op.getResult(i)->getType(), i); + if (failed(verification)) return verification; + } + + return success(); +} + //===----------------------------------------------------------------------===// // SoftmaxOp //===----------------------------------------------------------------------===// @@ -647,7 +772,7 @@ static LogicalResult Verify(SoftmaxOp op) { void SquareOp::getCanonicalizationPatterns(OwningRewritePatternList &results, MLIRContext *context) { - RewriteListBuilder::build(results, context); + results.insert(context); } //===----------------------------------------------------------------------===// @@ -656,7 +781,7 @@ void SquareOp::getCanonicalizationPatterns(OwningRewritePatternList &results, void SubOp::getCanonicalizationPatterns(OwningRewritePatternList &results, MLIRContext *context) { - RewriteListBuilder::build(results, context); + results.insert(context); } //===----------------------------------------------------------------------===// @@ -707,10 +832,10 @@ void TransposeOp::build(Builder *builder, OperationState *result, Value *x, llvm::SmallVector const_shape; if (attr_shape.isSplat()) { const_shape.assign( - attr_shape.getType().getNumElements(), + attr_shape.getNumElements(), x_type.getDimSize((*attr_shape.begin()).getSExtValue())); } else { - const_shape.reserve(attr_shape.getType().getNumElements()); + const_shape.reserve(attr_shape.getNumElements()); for (auto dim : attr_shape) const_shape.push_back(x_type.getDimSize(dim.getSExtValue())); } @@ -727,32 +852,35 @@ void TransposeOp::build(Builder *builder, OperationState *result, Value *x, void TruncateDivOp::getCanonicalizationPatterns( OwningRewritePatternList &results, MLIRContext *context) { - RewriteListBuilder::build(results, context); + results.insert(context); } //===----------------------------------------------------------------------===// // 
WhileOp //===----------------------------------------------------------------------===// -LogicalResult WhileOp::verify() { - auto condAttr = getAttrOfType("cond"); - if (!condAttr) return emitOpError("requires cond attribute"); +static LogicalResult Verify(WhileOp op) { + auto module = op.getParentOfType(); + auto condFn = module.lookupSymbol(op.cond()); + auto bodyFn = module.lookupSymbol(op.body()); + if (!condFn) { + return op.emitOpError("cond refers to an undefined function : ") + << op.cond(); + } + if (!bodyFn) { + return op.emitOpError("body refers to an undefined function : ") + << op.body(); + } - auto module = getParentOfType(); - auto condFn = module.lookupSymbol(condAttr.getValue()); auto condFuncType = condFn.getType(); + auto bodyFuncType = bodyFn.getType(); // Verify that the cond function has exactly one result. if (condFuncType.getNumResults() != 1) - return emitOpError("requires cond function to have exactly one result"); + return op.emitOpError("requires cond function to have exactly one result"); - auto bodyAttr = getAttrOfType("body"); - if (!bodyAttr) return emitOpError("requires body attribute"); - auto bodyFn = module.lookupSymbol(bodyAttr.getValue()); - auto bodyFuncType = bodyFn.getType(); - - SmallVector operands(getOperandTypes()); - SmallVector results(getResultTypes()); + SmallVector operands(op.getOperandTypes()); + SmallVector results(op.getResultTypes()); // Collect all the type lists for the op so that different pairs of type lists // can be compared for the compatibility. @@ -796,7 +924,7 @@ LogicalResult WhileOp::verify() { int aSize = a.second.size(); if (aSize != b.second.size()) - return emitOpError( + return op.emitOpError( llvm::formatv("requires the number of {0}s to be equal to the " "number of {1}s. Found {2} and {3}, respectively", a.first, b.first, aSize, b.second.size())); @@ -806,7 +934,7 @@ LogicalResult WhileOp::verify() { auto bType = b.second[idx]; if (!AreCastCompatible(aType, bType)) - return emitError(llvm::formatv( + return op.emitError(llvm::formatv( "{0} type {1} is incompatible with {2} type {3} at index {4}", a.first, aType, b.first, bType, idx)); } @@ -821,7 +949,7 @@ LogicalResult WhileOp::verify() { void XdivyOp::getCanonicalizationPatterns(OwningRewritePatternList &results, MLIRContext *context) { - RewriteListBuilder::build(results, context); + results.insert(context); } //===----------------------------------------------------------------------===// @@ -840,7 +968,7 @@ TensorFlowDialect::TensorFlowDialect(MLIRContext *context) addOperations< #define GET_OP_LIST #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc.inc" - , IfOp, WhileOp>(); + >(); addTypes< #define HANDLE_TF_TYPE(tftype, enumerant, name) tftype##Type, #define HANDLE_LAST_TF_TYPE(tftype, enumerant, name) tftype##Type @@ -954,27 +1082,5 @@ Operation *TensorFlowDialect::materializeConstant(OpBuilder &builder, return nullptr; } -// Verifies that the Op is a well-formed TensorFlow op, checking that all inputs -// and results are Tensor or other TensorFlow types, etc. 
-LogicalResult verifyTensorFlowOp(Operation *op) { - if (op->getName().getDialect() != "tf") - return op->emitError("TensorFlow op ") - << op->getName() << " should start with 'tf.'"; - - for (Type type : op->getOperandTypes()) { - if (!IsValidTFTensorType(type)) - return op->emitOpError( - "requires operands to have a valid TensorFlow tensor type"); - } - - for (Type type : op->getResultTypes()) { - if (!IsValidTFTensorType(type)) - return op->emitOpError( - "requires results to have a valid TensorFlow tensor type"); - } - - return success(); -} - } // namespace TF } // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h index 723aa67c6c4..8a2fa9dd7fe 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h @@ -27,7 +27,8 @@ limitations under the License. #include "mlir/IR/Module.h" // TF:local_config_mlir #include "mlir/IR/OpDefinition.h" // TF:local_config_mlir #include "mlir/IR/StandardTypes.h" // TF:local_config_mlir -#include "mlir/Support/TypeUtilities.h" // TF:local_config_mlir +#include "mlir/IR/TypeUtilities.h" // TF:local_config_mlir +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" namespace mlir { @@ -64,20 +65,6 @@ class TensorFlowDialect : public Dialect { Location loc) override; }; -// This verifies that the Op is a well-formed TensorFlow op, checking -// that all inputs and results are Tensor or other TensorFlow types, etc. -static LogicalResult verifyTensorFlowOp(Operation *op); - -// This Trait should be used by all TensorFlow Ops. -// -template -class TensorFlowOp : public OpTrait::TraitBase { - public: - static LogicalResult verifyTrait(Operation *op) { - return verifyTensorFlowOp(op); - } -}; - // TODO(b/131258166): TensorFlow's mutex.h defines a `mutex_lock` macro, whose // purpose is to catch bug on `tensorflow::mutex_lock`. We don't use // `tensorflow::mutex_lock` here but we have ops (`tf.MutexLock` and @@ -89,88 +76,6 @@ class TensorFlowOp : public OpTrait::TraitBase { #define GET_OP_CLASSES #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h.inc" -// The "tf.If" operation takes a condition operand, a list of inputs, and a -// function attribute for the then/else branches. The condition operand -// doesn't have to be a boolean tensor. It is handled according to these -// rules, quoting the TensorFlow op definition: -// -// If the tensor is a scalar of non-boolean type, the scalar is converted to -// a boolean according to the following rule: if the scalar is a numerical -// value, non-zero means True and zero means False; if the scalar is a -// string, non-empty means True and empty means False. If the tensor is not a -// scalar, being empty means False and being non-empty means True. -// -// This is defined in TensorFlow as: -// -// REGISTER_OP("If") -// .Input("cond: Tcond") -// .Input("input: Tin") -// .Output("output: Tout") -// .Attr("Tcond: type") -// .Attr("Tin: list(type) >= 0") -// .Attr("Tout: list(type) >= 0") -// .Attr("then_branch: func") -// .Attr("else_branch: func") -// -// Note: Additional result corresponds to the control output. 
-class IfOp : public Op::Impl, - OpTrait::VariadicResults> { - public: - using Op::Op; - static StringRef getOperationName() { return "tf.If"; } - - Value *getCondition() { return getOperand(0); } - - // TODO(b/132271680): This is not following Google naming style - StringRef getThen() { - return getAttrOfType("then_branch").getValue(); - } - - StringRef getElse() { - return getAttrOfType("else_branch").getValue(); - } - - LogicalResult verify(); -}; - -// The "tf.While" operation takes a list of inputs and function attributes for -// the loop condition and body. Inputs are updated repeatedly by the body -// function while the loop condition with the tensors evaluates to true. The -// condition result doesn't have to be a boolean tensor. It is handled -// according to these rules, quoting the TensorFlow op definition: -// -// If the tensor is a scalar of non-boolean type, the scalar is converted to -// a boolean according to the following rule: if the scalar is a numerical -// value, non-zero means True and zero means False; if the scalar is a -// string, non-empty means True and empty means False. If the tensor is not a -// scalar, being empty means False and being non-empty means True. -// -// This is defined in TensorFlow as: -// -// REGISTER_OP("While") -// .Input("input: T") -// .Output("output: T") -// .Attr("T: list(type) >= 0") -// .Attr("cond: func") -// .Attr("body: func") -// .Attr("output_shapes: list(shape) = []") -// -class WhileOp : public Op { - public: - using Op::Op; - static StringRef getOperationName() { return "tf.While"; } - - StringRef getCond() { - return getAttrOfType("cond").getValue(); - } - StringRef getBody() { - return getAttrOfType("body").getValue(); - } - - LogicalResult verify(); -}; - } // namespace TF } // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td index b2fcb01c2d5..d889a5d038a 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td @@ -30,6 +30,37 @@ limitations under the License. include "tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td" +class TF_TensorListInitOp : TF_Op { + let results = (outs + TF_VariantTensor:$handle + ); + + TF_DerivedOperandTypeAttr shape_type = TF_DerivedOperandTypeAttr<0>; + + let verifier = [{ + if (handle_dtype().getSubtypes().size() != 1) { + return emitOpError( + "must have exactly one subtype in the result variant type"); + } + + return Verify(*this); + }]; + + DerivedTypeAttr element_dtype = DerivedTypeAttr< + "return getElementTypeOrSelf(element_type());">; + + let extraClassDeclaration = [{ + // Returns type of the TensorList element produced by this op. + TensorType element_type() { return handle_dtype().getSubtypes()[0]; } + + // Returns data type of the result handle. Returned type contains type of + // the TensorList element as a subtype. + VariantType handle_dtype() { + return getElementTypeOrSelf(handle()->getType()).cast(); + } + }]; +} + // In MLIR, the TensorFlow tensor value is represented as an ElementsAttr, with // its type encoding the tensor's shape and data type. 
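The new `TF_TensorListInitOp` base class above shares one verifier clause across `EmptyTensorList` and `TensorListReserve`: the result variant type must carry exactly one subtype, which then backs the derived `element_dtype` attribute. A toy sketch of that single check, with `VariantHandle` and `verifyListInitResult` as invented stand-ins for the dialect types:

```cpp
// Sketch: a "variant" handle may carry subtypes; the TensorList init ops
// require exactly one subtype, which is then exposed as the TensorList element
// type. These structures are illustrative only, not the dialect's real types.
#include <iostream>
#include <string>
#include <vector>

struct VariantHandle {
  std::vector<std::string> subtypes;  // e.g. {"tensor<2xf32>"}
};

bool verifyListInitResult(const VariantHandle &handle, std::string *error) {
  if (handle.subtypes.size() != 1) {
    *error = "must have exactly one subtype in the result variant type";
    return false;
  }
  return true;
}

int main() {
  std::string error;
  VariantHandle ok{{"tensor<2xf32>"}};
  VariantHandle bad{{}};  // no subtype: the element type would be unknown
  std::cout << verifyListInitResult(ok, &error) << "\n";   // 1
  std::cout << verifyListInitResult(bad, &error) << "\n";  // 0
  std::cout << error << "\n";
  return 0;
}
```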
def TF_ConstOp : TF_Op<"Const", [NoSideEffect]> { @@ -55,12 +86,30 @@ def TF_ConstOp : TF_Op<"Const", [NoSideEffect]> { let hasFolder = 1; } +def TF_EmptyTensorListOp : TF_TensorListInitOp<"EmptyTensorList"> { + let summary = "Creates and returns an empty tensor list."; + + let description = [{ +All list elements must be tensors of dtype element_dtype and shape compatible +with element_shape. + +handle: an empty tensor list. +element_dtype: the type of elements in the list. +element_shape: a shape compatible with that of elements in the list. + }]; + + let arguments = (ins + TF_I32OrI64Tensor:$element_shape, + I32Tensor:$max_num_elements + ); +} + // TODO(fengliuai): The tf.Identity is side-effect free and it doesn't change // the status of the system during the execution. However it shouldn't be folded // in general if it used to serve for caching and some other invariant checks, // so we removed the side-effect free property in the op definition. This is a // hack, and we should fix it if we have a better way to model it. -def TF_IdentityOp : TF_Op<"Identity", [SameOperandsAndResultType]> { +def TF_IdentityOp : TF_Op<"Identity", [TF_OperandsSameAsResultsTypeOrRef]> { let summary = "Identity op"; let description = [{ @@ -78,6 +127,50 @@ Returns a tensor with the same shape and contents as input. TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_IfOp : TF_Op<"If", []> { + let summary = "output = cond ? then_branch(input) : else_branch(input)"; + + let description = [{ +output = cond ? then_branch(input) : else_branch(input) + +cond: A Tensor. If the tensor is a scalar of non-boolean type, the + scalar is converted to a boolean according to the + following rule: if the scalar is a numerical value, non-zero means + True and zero means False; if the scalar is a string, non-empty + means True and empty means False. If the tensor is not a scalar, + being empty means False and being non-empty means True. +input: A list of input tensors. +then_branch: A function that takes 'inputs' and returns a list of + tensors, whose types are the same as what else_branch returns. +else_branch: A function that takes 'inputs' and returns a list of + tensors. whose types are the same as what then_branch returns. + }]; + + let arguments = (ins + TF_Tensor:$cond, + Variadic:$input, + + SymbolRefAttr:$then_branch, + SymbolRefAttr:$else_branch, + DefaultValuedAttr:$output_shapes, + + // Used to map StatelessIf and If op defined in TensorFlow to a common op. + BoolAttr:$is_stateless + ); + + let results = (outs + Variadic:$output + ); + + TF_DerivedOperandTypeAttr Tcond = TF_DerivedOperandTypeAttr<0>; + TF_DerivedOperandTypeListAttr Tin = TF_DerivedOperandTypeListAttr<1>; + TF_DerivedResultTypeListAttr Tout = TF_DerivedResultTypeListAttr<0>; + + let verifier = [{ + return Verify(*this); + }]; +} + def TF_MeanOp : TF_Op<"Mean", [NoSideEffect]> { let summary = "Computes the mean of elements across dimensions of a tensor."; @@ -147,7 +240,53 @@ Inserts a placeholder for a tensor that will be always fed. TF_DerivedResultTypeAttr dtype = TF_DerivedResultTypeAttr<0>; } -def TF_TensorListReserveOp : TF_Op<"TensorListReserve", [NoSideEffect]> { +def TF_WhileOp : TF_Op<"While", []> { + let summary = [{ +output = input; While (Cond(output)) { output = Body(output) } + }]; + + let description = [{ +output = input; While (Cond(output)) { output = Body(output) } + +input: A list of input tensors whose types are T. +output: A list of output tensors whose types are T. 
+cond: A function takes 'input' and returns a tensor. If the tensor is + a scalar of non-boolean, the scalar is converted to a boolean + according to the following rule: if the scalar is a numerical + value, non-zero means True and zero means False; if the scalar is + a string, non-empty means True and empty means False. If the + tensor is not a scalar, non-emptiness means True and False + otherwise. +body: A function that takes a list of tensors and returns another + list of tensors. Both lists have the same types as specified + by T. + }]; + + let arguments = (ins + Variadic:$input, + + SymbolRefAttr:$cond, + SymbolRefAttr:$body, + DefaultValuedAttr:$output_shapes, + DefaultValuedAttr:$parallel_iterations, + + // Used to map StatelessWhile and While op defined in TensorFlow to a common + // op. + BoolAttr:$is_stateless + ); + + let results = (outs + Variadic:$output + ); + + TF_DerivedOperandTypeListAttr T = TF_DerivedOperandTypeListAttr<0>; + + let verifier = [{ + return Verify(*this); + }]; +} + +def TF_TensorListReserveOp : TF_TensorListInitOp<"TensorListReserve"> { let summary = "List of the given size with empty elements."; let description = [{ @@ -161,35 +300,6 @@ element_dtype: the desired type of elements in the list. TF_I32OrI64Tensor:$element_shape, I32Tensor:$num_elements ); - - let results = (outs - TF_VariantTensor:$handle - ); - - TF_DerivedOperandTypeAttr shape_type = TF_DerivedOperandTypeAttr<0>; - - let verifier = [{ - if (handle_dtype().getSubtypes().size() != 1) { - return emitOpError( - "must have exactly one subtype in the result variant type"); - } - - return Verify(*this); - }]; - - DerivedTypeAttr element_dtype = DerivedTypeAttr< - "return getElementTypeOrSelf(element_type());">; - - let extraClassDeclaration = [{ - // Returns type of the TensorList element produced by this op. - TensorType element_type() { return handle_dtype().getSubtypes()[0]; } - - // Returns data type of the result handle. Returned type contains type of - // the TensorList element as a subtype. - VariantType handle_dtype() { - return getElementTypeOrSelf(handle()->getType()).cast(); - } - }]; } #endif // TF_OPS diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h new file mode 100644 index 00000000000..b96026c8189 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h @@ -0,0 +1,109 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file defines the op traits used in the MLIR TensorFlow dialect. 
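The trait introduced in this new header accepts operands whose element type is either the result element type or its legacy reference ("ref") counterpart. A small sketch of that pairing using a plain enum; `DType` and `refOf` are illustrative, whereas the real check in `VerifyRefTypeMatch` below switches over MLIR type kinds:

```cpp
// Sketch: the OperandsSameAsResultsTypeOrRef trait accepts an operand whose
// element type is either the result element type itself or the matching *_REF
// type. The pairing is modelled here with a plain enum and a lookup.
#include <iostream>

enum class DType { F32, F32_REF, I32, I32_REF, BOOL, BOOL_REF };

// Returns the ref counterpart of a non-ref dtype.
DType refOf(DType t) {
  switch (t) {
    case DType::F32:  return DType::F32_REF;
    case DType::I32:  return DType::I32_REF;
    case DType::BOOL: return DType::BOOL_REF;
    default:          return t;  // already a ref type
  }
}

// An operand type is acceptable if it equals the result type or its ref variant.
bool operandMatchesResult(DType operand, DType result) {
  return operand == result || operand == refOf(result);
}

int main() {
  std::cout << operandMatchesResult(DType::F32, DType::F32) << "\n";      // 1
  std::cout << operandMatchesResult(DType::F32_REF, DType::F32) << "\n";  // 1
  std::cout << operandMatchesResult(DType::I32, DType::F32) << "\n";      // 0
  return 0;
}
```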
+ +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_TRAITS_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_TRAITS_H_ + +#include "mlir/IR/OpDefinition.h" // TF:local_config_mlir +#include "mlir/IR/StandardTypes.h" // TF:local_config_mlir +#include "mlir/IR/TypeUtilities.h" // TF:local_config_mlir +#include "mlir/Support/LogicalResult.h" // TF:local_config_mlir +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" + +namespace mlir { +namespace OpTrait { +namespace TF { + +// Verifies if 'ref_type' is a REF type corresponding to 'type'. +static inline LogicalResult VerifyRefTypeMatch(mlir::Type type, + mlir::Type ref_type) { + auto ref_type_kind = ref_type.getKind(); + switch (type.getKind()) { + case mlir::StandardTypes::F16: + return success(ref_type_kind == mlir::TF::TensorFlowTypes::HALF_REF); + case mlir::StandardTypes::F32: + return success(ref_type_kind == mlir::TF::TensorFlowTypes::FLOAT_REF); + case mlir::StandardTypes::F64: + return success(ref_type_kind == mlir::TF::TensorFlowTypes::DOUBLE_REF); + case mlir::StandardTypes::BF16: + return success(ref_type_kind == mlir::TF::TensorFlowTypes::BFLOAT16_REF); + case mlir::StandardTypes::Integer: { + const auto& itype = type.cast(); + switch (itype.getWidth()) { + case 1: + return success(ref_type_kind == mlir::TF::TensorFlowTypes::BOOL_REF); + case 8: + return success(ref_type_kind == mlir::TF::TensorFlowTypes::INT8_REF); + case 16: + return success(ref_type_kind == mlir::TF::TensorFlowTypes::INT16_REF); + case 32: + return success(ref_type_kind == mlir::TF::TensorFlowTypes::INT32_REF); + case 64: + return success(ref_type_kind == mlir::TF::TensorFlowTypes::INT64_REF); + default: + return failure(); + } + } +#define HANDLE_TF_TYPE(tftype, enumerant, name) \ + case mlir::TF::TensorFlowTypes::enumerant: \ + return success(ref_type_kind == mlir::TF::TensorFlowTypes::enumerant##_REF); + +#define HANDLE_TF_REF_TYPE(tftype, enumerant, name) +// NOLINTNEXTLINE +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.def" + default: + return failure(); + } +} + +// This class provides verification for ops that are known to have the same +// result types and all operands are either of the same type as result or a REF +// type corresponding to the result type. +template +class OperandsSameAsResultsTypeOrRef + : public TraitBase { + public: + static LogicalResult verifyTrait(Operation* op) { + LogicalResult shapeMatch = impl::verifySameOperandsAndResultShape(op); + if (failed(shapeMatch)) return shapeMatch; + + auto type = getElementTypeOrSelf(op->getResult(0)->getType()); + + // Verify that the first result type is same as the rest of the results. + // We skip the comparison against itself. 
+ for (auto resultType : llvm::drop_begin(op->getResultTypes(), 1)) { + resultType = getElementTypeOrSelf(resultType); + if (resultType != type) + return op->emitOpError() << "requires the same type for all results"; + } + + for (auto opType : op->getOperandTypes()) { + opType = getElementTypeOrSelf(opType); + if (opType != type && failed(VerifyRefTypeMatch(type, opType))) { + return op->emitError() << "requires all operands to be either same " + "as or ref type of results"; + } + } + return success(); + } +}; + +} // namespace TF +} // namespace OpTrait +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_TRAITS_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_types.def b/tensorflow/compiler/mlir/tensorflow/ir/tf_types.def index 9f1154b84f1..e5041d0ab99 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_types.def +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_types.def @@ -32,28 +32,33 @@ HANDLE_TF_TYPE(String, STRING, "string") HANDLE_TF_TYPE(Resource, RESOURCE, "resource") HANDLE_TF_TYPE(Complex64, COMPLEX64, "complex64") HANDLE_TF_TYPE(Complex128, COMPLEX128, "complex128") -HANDLE_TF_TYPE(FloatRef, FLOAT_REF, "f32ref") -HANDLE_TF_TYPE(DoubleRef, DOUBLE_REF, "f64ref") -HANDLE_TF_TYPE(Uint8Ref, UINT8_REF, "uint8ref") -HANDLE_TF_TYPE(Int8Ref, INT8_REF, "int8ref") -HANDLE_TF_TYPE(Uint16Ref, UINT16_REF, "uint16ref") -HANDLE_TF_TYPE(Int16Ref, INT16_REF, "int16ref") -HANDLE_TF_TYPE(Uint32Ref, UINT32_REF, "uint32ref") -HANDLE_TF_TYPE(Int32Ref, INT32_REF, "int32ref") -HANDLE_TF_TYPE(Uint64Ref, UINT64_REF, "uint64ref") -HANDLE_TF_TYPE(Int64Ref, INT64_REF, "int64ref") -HANDLE_TF_TYPE(StringRef, STRING_REF, "stringref") -HANDLE_TF_TYPE(BoolRef, BOOL_REF, "boolref") -HANDLE_TF_TYPE(Quint8Ref, QUINT8_REF, "quint8ref") -HANDLE_TF_TYPE(Qint8Ref, QINT8_REF, "qint8ref") -HANDLE_TF_TYPE(Quint16Ref, QUINT16_REF, "quint16ref") -HANDLE_TF_TYPE(Qint16Ref, QINT16_REF, "qint16ref") -HANDLE_TF_TYPE(Qint32Ref, QINT32_REF, "qint32ref") -HANDLE_TF_TYPE(Bfloat16Ref, BFLOAT16_REF, "bfloat16ref") -HANDLE_TF_TYPE(Complex64Ref, COMPLEX64_REF, "complex64ref") -HANDLE_TF_TYPE(Complex128Ref, COMPLEX128_REF, "complex128ref") -HANDLE_TF_TYPE(HalfRef, HALF_REF, "halfref") -HANDLE_TF_TYPE(ResourceRef, RESOURCE_REF, "resourceref") + +#ifndef HANDLE_TF_REF_TYPE +#define HANDLE_TF_REF_TYPE(class, enumerant, name) \ + HANDLE_TF_TYPE(class, enumerant, name) +#endif +HANDLE_TF_REF_TYPE(FloatRef, FLOAT_REF, "f32ref") +HANDLE_TF_REF_TYPE(DoubleRef, DOUBLE_REF, "f64ref") +HANDLE_TF_REF_TYPE(Uint8Ref, UINT8_REF, "uint8ref") +HANDLE_TF_REF_TYPE(Int8Ref, INT8_REF, "int8ref") +HANDLE_TF_REF_TYPE(Uint16Ref, UINT16_REF, "uint16ref") +HANDLE_TF_REF_TYPE(Int16Ref, INT16_REF, "int16ref") +HANDLE_TF_REF_TYPE(Uint32Ref, UINT32_REF, "uint32ref") +HANDLE_TF_REF_TYPE(Int32Ref, INT32_REF, "int32ref") +HANDLE_TF_REF_TYPE(Uint64Ref, UINT64_REF, "uint64ref") +HANDLE_TF_REF_TYPE(Int64Ref, INT64_REF, "int64ref") +HANDLE_TF_REF_TYPE(StringRef, STRING_REF, "stringref") +HANDLE_TF_REF_TYPE(BoolRef, BOOL_REF, "boolref") +HANDLE_TF_REF_TYPE(Quint8Ref, QUINT8_REF, "quint8ref") +HANDLE_TF_REF_TYPE(Qint8Ref, QINT8_REF, "qint8ref") +HANDLE_TF_REF_TYPE(Quint16Ref, QUINT16_REF, "quint16ref") +HANDLE_TF_REF_TYPE(Qint16Ref, QINT16_REF, "qint16ref") +HANDLE_TF_REF_TYPE(Qint32Ref, QINT32_REF, "qint32ref") +HANDLE_TF_REF_TYPE(Bfloat16Ref, BFLOAT16_REF, "bfloat16ref") +HANDLE_TF_REF_TYPE(Complex64Ref, COMPLEX64_REF, "complex64ref") +HANDLE_TF_REF_TYPE(Complex128Ref, COMPLEX128_REF, "complex128ref") +HANDLE_TF_REF_TYPE(HalfRef, 
HALF_REF, "halfref") +HANDLE_TF_REF_TYPE(ResourceRef, RESOURCE_REF, "resourceref") #ifndef HANDLE_CUSTOM_TF_TYPE #define HANDLE_CUSTOM_TF_TYPE(class, enumerant, name) \ @@ -64,10 +69,11 @@ HANDLE_CUSTOM_TF_TYPE(Variant, VARIANT, "variant") #ifndef HANDLE_LAST_TF_TYPE #define HANDLE_LAST_TF_TYPE(class, enumerant, name) \ - HANDLE_TF_TYPE(class, enumerant, name) + HANDLE_TF_REF_TYPE(class, enumerant, name) #endif HANDLE_LAST_TF_TYPE(VariantRef, VARIANT_REF, "variantref") #undef HANDLE_LAST_TF_TYPE +#undef HANDLE_TF_REF_TYPE #undef HANDLE_TF_TYPE #endif diff --git a/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir b/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir index ffd6bee1e37..65feaa8b84c 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir @@ -1,5 +1,21 @@ // RUN: tf-opt %s -canonicalize | FileCheck %s +// CHECK-LABEL: func @tfAssertTrue +func @tfAssertTrue(%arg0: tensor<1x1x6x2xf32>) { + %t = constant dense : tensor + // CHECK-NOT: tf.Assert + "tf.Assert"(%t, %arg0) {summarize = 3} : (tensor, tensor<1x1x6x2xf32>) -> () + return +} + +// CHECK-LABEL: func @tfAssertFalse +func @tfAssertFalse(%arg0: tensor<1x1x6x2xf32>) { + %f = constant dense : tensor + // CHECK: tf.Assert + "tf.Assert"(%f, %arg0) {summarize = 3} : (tensor, tensor<1x1x6x2xf32>) -> () + return +} + // CHECK-LABEL: func @testLeakyRelu func @testLeakyRelu(%arg0 : tensor<16xf32>) -> (tensor<16xf32>) { %2 = "tf.LeakyRelu"(%arg0) {alpha = 1.0 : f32} : (tensor<16xf32>) -> tensor<16xf32> diff --git a/tensorflow/compiler/mlir/tensorflow/tests/cluster_formation.mlir b/tensorflow/compiler/mlir/tensorflow/tests/cluster_formation.mlir new file mode 100644 index 00000000000..9e2fdcc1ee5 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/cluster_formation.mlir @@ -0,0 +1,292 @@ +// RUN: tf-opt %s -split-input-file -tf-device-cluster-formation | FileCheck %s + +// Simple case, single device cluster. + +module { + // CHECK-LABEL: func @singlecluster + // CHECK-SAME: (%[[ARG_0:[a-z0-9]*]]: tensor) + func @singlecluster(%arg0: tensor) -> tensor { + %0 = tf_executor.graph { + %1:2 = tf_executor.island { + + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A"(%[[ARG_0]]) + %2 = "tf.A"(%arg0) : (tensor) -> tensor + + // CHECK: %[[TPU0_OUTPUT:[0-9]*]] = "tf_device.launch" + // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B"(%[[A_OUTPUT]]) : (tensor) -> tensor + %3 = "tf.B"(%2) {device = "tpu0"} : (tensor) -> tensor + + // CHECK: %[[C_OUTPUT:[0-9]*]] = "tf.C"(%[[A_OUTPUT]], %[[B_OUTPUT]]) : (tensor, tensor) -> tensor + %4 = "tf.C"(%2, %3) {device = "tpu0"} : (tensor, tensor) -> tensor + + // CHECK: "tf_device.return"(%[[C_OUTPUT]]) + // CHECK: {device = "tpu0"} : () -> tensor + + // CHECK: %[[D_OUTPUT:[0-9]*]] = "tf.D"(%[[TPU0_OUTPUT]]) + %5 = "tf.D"(%4) : (tensor) -> tensor + tf_executor.yield %5 : tensor + } + tf_executor.fetch %1#0 : tensor + } + return %0 : tensor + } +} + +// ----- + +// Single device cluster, live-in value comes directly from function argument. 
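The cluster-formation tests in this file check that ops inside an island carrying the same `device` attribute are grouped into a single `tf_device.launch`, with ops lacking a device left outside (and, as later cases show, other ops may be moved so a cluster can stay together). Before the live-in test case below, a toy sketch of the grouping step; `Op` and `clusterByDevice` are invented names, not the pass's API:

```cpp
// Sketch: walk an island's ops in order and group runs that share the same
// non-empty device attribute into clusters, which the real pass then wraps in
// tf_device.launch regions. Reordering and live-in handling are omitted.
#include <iostream>
#include <string>
#include <vector>

struct Op {
  std::string name;
  std::string device;  // empty means "no device attribute"
};

std::vector<std::vector<Op>> clusterByDevice(const std::vector<Op> &ops) {
  std::vector<std::vector<Op>> clusters;
  for (const Op &op : ops) {
    if (op.device.empty()) continue;  // stays outside any launch
    if (clusters.empty() || clusters.back().back().device != op.device)
      clusters.push_back({});
    clusters.back().push_back(op);
  }
  return clusters;
}

int main() {
  std::vector<Op> island = {{"tf.A", ""},     {"tf.B", "tpu0"},
                            {"tf.C", "tpu0"}, {"tf.D", ""}};
  for (const auto &cluster : clusterByDevice(island)) {
    for (const Op &op : cluster) std::cout << op.name << " ";
    std::cout << "-> one launch on " << cluster.front().device << "\n";
  }
  return 0;
}
```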
+ +module { + // CHECK-LABEL: func @arglivein + // CHECK-SAME: (%[[ARG_0:[a-z0-9]*]]: tensor) + func @arglivein(%arg0: tensor) -> tensor { + %0 = tf_executor.graph { + %1:2 = tf_executor.island { + + // CHECK: %[[TPU0_OUTPUT:[0-9]*]] = "tf_device.launch" + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A"(%[[ARG_0]]) : (tensor) -> tensor + %3 = "tf.A"(%arg0) {device = "tpu0"} : (tensor) -> tensor + + // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B"(%[[A_OUTPUT]], %[[ARG_0]]) : (tensor, tensor) -> tensor + %4 = "tf.B"(%3, %arg0) {device = "tpu0"} : (tensor, tensor) -> tensor + + // CHECK: "tf_device.return"(%[[B_OUTPUT]]) + // CHECK: {device = "tpu0"} : () -> tensor + + // CHECK: %[[C_OUTPUT:[0-9]*]] = "tf.C"(%[[TPU0_OUTPUT]]) + %5 = "tf.C"(%4) : (tensor) -> tensor + tf_executor.yield %5 : tensor + } + tf_executor.fetch %1#0 : tensor + } + return %0 : tensor + } +} + +// ----- + +// Single device cluster, live-in value comes from other islands. + +module { + // CHECK-LABEL: func @argliveinotherislands + // CHECK-SAME: (%[[ARG_0:[a-z0-9]*]]: tensor) + func @argliveinotherislands(%arg0: tensor) -> tensor { + %0 = tf_executor.graph { + // CHECK: %[[OTHER_ISLAND_OUTPUT:[0-9]*]]:2 = tf_executor.island { + %1:2 = tf_executor.island { + %3 = "tf.D"(%arg0) : (tensor) -> tensor + tf_executor.yield %3 : tensor + } + + %2:2 = tf_executor.island { + // CHECK: %[[TPU0_OUTPUT:[0-9]*]] = "tf_device.launch" + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A"(%[[ARG_0]]) : (tensor) -> tensor + %3 = "tf.A"(%arg0) {device = "tpu0"} : (tensor) -> tensor + + // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B"(%[[A_OUTPUT]], %[[OTHER_ISLAND_OUTPUT]]#0) : (tensor, tensor) -> tensor + %4 = "tf.B"(%3, %1#0) {device = "tpu0"} : (tensor, tensor) -> tensor + + // CHECK: "tf_device.return"(%[[B_OUTPUT]]) + // CHECK: {device = "tpu0"} : () -> tensor + + // CHECK: %[[C_OUTPUT:[0-9]*]] = "tf.C"(%[[TPU0_OUTPUT]]) + %5 = "tf.C"(%4) : (tensor) -> tensor + tf_executor.yield %5 : tensor + } + + tf_executor.fetch %2#0 : tensor + } + return %0 : tensor + } +} + +// ----- + +// Single device cluster, no live-in values. + +module { + // CHECK-LABEL: func @nolivein + func @nolivein() -> tensor { + %0 = tf_executor.graph { + %1:2 = tf_executor.island { + + // CHECK: %[[TPU0_OUTPUT:[0-9]*]] = "tf_device.launch" + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A"() : () -> tensor + %3 = "tf.A"() {device = "tpu0"} : () -> tensor + + // CHECK: "tf_device.return"(%[[A_OUTPUT]]) + // CHECK: {device = "tpu0"} : () -> tensor + + // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B"(%[[TPU0_OUTPUT]]) + %4 = "tf.B"(%3) : (tensor) -> tensor + tf_executor.yield %4 : tensor + } + tf_executor.fetch %1#0 : tensor + } + return %0 : tensor + } +} + +// ----- + +// Multiple clusters of different devices. Clusters depend on each other. 
+ +module { + // CHECK-LABEL: func @multiplerelatedclusters + // CHECK-SAME: (%[[ARG_0:[a-z0-9]*]]: tensor) + func @multiplerelatedclusters(%arg0: tensor) -> tensor { + %0 = tf_executor.graph { + %1:2 = tf_executor.island { + + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A"(%[[ARG_0]]) + %2 = "tf.A"(%arg0) : (tensor) -> tensor + + // CHECK: %[[TPU0_OUTPUT:[0-9]*]] = "tf_device.launch" + // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B"(%[[A_OUTPUT]]) : (tensor) -> tensor + %3 = "tf.B"(%2) {device = "tpu0"} : (tensor) -> tensor + + // CHECK: %[[C_OUTPUT:[0-9]*]] = "tf.C"(%[[A_OUTPUT]], %[[B_OUTPUT]]) : (tensor, tensor) -> tensor + %4 = "tf.C"(%2, %3) {device = "tpu0"} : (tensor, tensor) -> tensor + + // CHECK: "tf_device.return"(%[[C_OUTPUT]]) + // CHECK: {device = "tpu0"} : () -> tensor + + // CHECK: %[[GPU0_OUTPUT:[0-9]*]] = "tf_device.launch" + // CHECK: %[[D_OUTPUT:[0-9]*]] = "tf.D"(%[[TPU0_OUTPUT]]) : (tensor) -> tensor + %5 = "tf.D"(%4) {device = "gpu0"} : (tensor) -> tensor + // CHECK: "tf_device.return"(%[[D_OUTPUT]]) + + // CHECK: tf_executor.yield %[[GPU0_OUTPUT]] + tf_executor.yield %5 : tensor + } + tf_executor.fetch %1#0 : tensor + } + return %0 : tensor + } +} + +// ----- + +// Multiple clusters of different devices. Clusters do not depend on each other. + +module { + // CHECK-LABEL: func @multipleunrelatedclusters + // CHECK-SAME: (%[[ARG_0:[a-z0-9]*]]: tensor) + func @multipleunrelatedclusters(%arg0: tensor) -> tensor { + %0 = tf_executor.graph { + %1:2 = tf_executor.island { + + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A"(%[[ARG_0]]) + %2 = "tf.A"(%arg0) : (tensor) -> tensor + + // CHECK: %[[TPU0_OUTPUT:[0-9]*]] = "tf_device.launch" + // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B"(%[[A_OUTPUT]]) : (tensor) -> tensor + %3 = "tf.B"(%2) {device = "tpu0"} : (tensor) -> tensor + + // CHECK: %[[C_OUTPUT:[0-9]*]] = "tf.C"(%[[A_OUTPUT]], %[[B_OUTPUT]]) : (tensor, tensor) -> tensor + %4 = "tf.C"(%2, %3) {device = "tpu0"} : (tensor, tensor) -> tensor + + // CHECK: "tf_device.return"(%[[C_OUTPUT]]) + // CHECK: {device = "tpu0"} : () -> tensor + + // CHECK: %[[GPU0_OUTPUT:[0-9]*]] = "tf_device.launch" + // CHECK: %[[D_OUTPUT:[0-9]*]] = "tf.D"(%[[A_OUTPUT]]) : (tensor) -> tensor + %5 = "tf.D"(%2) {device = "gpu0"} : (tensor) -> tensor + // CHECK: "tf_device.return"(%[[D_OUTPUT]]) + + // CHECK: %[[E_OUTPUT:[0-9]*]] = "tf.E"(%[[TPU0_OUTPUT]], %[[GPU0_OUTPUT]]) : (tensor, tensor) -> tensor + %6 = "tf.E"(%4, %5) : (tensor, tensor) -> tensor + + // CHECK: tf_executor.yield %[[E_OUTPUT]] + tf_executor.yield %6 : tensor + } + tf_executor.fetch %1#0 : tensor + } + return %0 : tensor + } +} + +// ----- + +// Single device with non-continous instructions in original block. + +module { + // CHECK-LABEL: func @noncontinoussinglecluster + // CHECK-SAME: (%[[ARG_0:[a-z0-9]*]]: tensor) + func @noncontinoussinglecluster(%arg0: tensor) -> tensor { + %0 = tf_executor.graph { + %1:2 = tf_executor.island { + + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A"(%[[ARG_0]]) + %2 = "tf.A"(%arg0) : (tensor) -> tensor + + // Note that tf.C is moved before tf_device.launch. 
+ // CHECK: %[[C_OUTPUT:[0-9]*]] = "tf.C"(%[[ARG_0]]) : (tensor) -> tensor + + // CHECK: %[[TPU0_OUTPUT:[0-9]*]] = "tf_device.launch" + // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B"(%[[A_OUTPUT]]) : (tensor) -> tensor + %3 = "tf.B"(%2) {device = "tpu0"} : (tensor) -> tensor + + %4 = "tf.C"(%arg0) : (tensor) -> tensor + + // CHECK: %[[D_OUTPUT:[0-9]*]] = "tf.D"(%[[A_OUTPUT]], %[[B_OUTPUT]]) : (tensor, tensor) -> tensor + %5 = "tf.D"(%2, %3) {device = "tpu0"} : (tensor, tensor) -> tensor + + // CHECK: "tf_device.return"(%[[D_OUTPUT]]) + // CHECK: {device = "tpu0"} : () -> tensor + + // CHECK: %[[E_OUTPUT:[0-9]*]] = "tf.E"(%[[C_OUTPUT]], %[[TPU0_OUTPUT]]) : (tensor, tensor) -> tensor + %6 = "tf.E"(%4, %5) : (tensor, tensor) -> tensor + + // CHECK: tf_executor.yield %[[E_OUTPUT]] + tf_executor.yield %6 : tensor + } + tf_executor.fetch %1#0 : tensor + } + return %0 : tensor + } +} + +// ----- + +// Multiple device clusters with intertwined instructions in original block. + +module { + // CHECK-LABEL: func @intertwinedclusters + // CHECK-SAME: (%[[ARG_0:[a-z0-9]*]]: tensor) + func @intertwinedclusters(%arg0: tensor) -> tensor { + %0 = tf_executor.graph { + %1:2 = tf_executor.island { + + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A"(%[[ARG_0]]) + %2 = "tf.A"(%arg0) : (tensor) -> tensor + + // CHECK: %[[GPU0_OUTPUT:[0-9]*]] = "tf_device.launch" + // CHECK: %[[C_OUTPUT:[0-9]*]] = "tf.C"(%[[ARG_0]]) : (tensor) -> tensor + // CHECK: "tf_device.return"(%[[C_OUTPUT]]) + // CHECK: {device = "gpu0"} : () -> tensor + + // CHECK: %[[TPU0_OUTPUT:[0-9]*]] = "tf_device.launch" + // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B"(%[[A_OUTPUT]]) : (tensor) -> tensor + %3 = "tf.B"(%2) {device = "tpu0"} : (tensor) -> tensor + + %4 = "tf.C"(%arg0) {device = "gpu0"} : (tensor) -> tensor + + // CHECK: %[[D_OUTPUT:[0-9]*]] = "tf.D"(%[[A_OUTPUT]], %[[B_OUTPUT]]) : (tensor, tensor) -> tensor + %5 = "tf.D"(%2, %3) {device = "tpu0"} : (tensor, tensor) -> tensor + + // CHECK: "tf_device.return"(%[[D_OUTPUT]]) + // CHECK: {device = "tpu0"} : () -> tensor + + // CHECK: %[[E_OUTPUT:[0-9]*]] = "tf.E"(%[[GPU0_OUTPUT]], %[[TPU0_OUTPUT]]) : (tensor, tensor) -> tensor + %6 = "tf.E"(%4, %5) : (tensor, tensor) -> tensor + + // CHECK: tf_executor.yield %[[E_OUTPUT]] + tf_executor.yield %6 : tensor + } + tf_executor.fetch %1#0 : tensor + } + return %0 : tensor + } +} + diff --git a/tensorflow/compiler/mlir/tensorflow/tests/cluster_outlining.mlir b/tensorflow/compiler/mlir/tensorflow/tests/cluster_outlining.mlir new file mode 100644 index 00000000000..f8797678231 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/cluster_outlining.mlir @@ -0,0 +1,112 @@ +// RUN: tf-opt %s -split-input-file -tf-device-cluster-outlining | FileCheck %s + +// Tests simple case of a single `tf_device.launch`. 
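The outlining tests that follow check that each `tf_device.launch` body is moved into a fresh function and the launch is replaced by a `tf_device.launch_func` whose operands are the values the body used from outside (its live-ins). A toy sketch of that live-in bookkeeping, with `ToyOp`, `OutlinedFunc`, and `outline` as invented names rather than the pass's API:

```cpp
// Sketch: outline a launch "region" (a list of ops) into a standalone function.
// Values used inside the region but defined outside become the new function's
// arguments and the operands of the replacement launch_func.
#include <algorithm>
#include <iostream>
#include <set>
#include <string>
#include <vector>

struct ToyOp {
  std::string name;
  std::vector<std::string> operands;  // SSA value names
  std::string result;
};

struct OutlinedFunc {
  std::string name;
  std::vector<std::string> args;  // live-in values, in first-use order
  std::vector<ToyOp> body;
};

OutlinedFunc outline(const std::string &func_name,
                     const std::vector<ToyOp> &region) {
  std::set<std::string> defined_inside;
  for (const ToyOp &op : region) defined_inside.insert(op.result);

  OutlinedFunc fn{func_name, {}, region};
  for (const ToyOp &op : region)
    for (const std::string &v : op.operands)
      if (!defined_inside.count(v) &&
          std::find(fn.args.begin(), fn.args.end(), v) == fn.args.end())
        fn.args.push_back(v);  // live-in: becomes an argument / launch_func operand
  return fn;
}

int main() {
  // Body of a launch: %b = tf.B(%a); %c = tf.C(%a, %b). The only live-in is %a.
  std::vector<ToyOp> region = {{"tf.B", {"%a"}, "%b"},
                               {"tf.C", {"%a", "%b"}, "%c"}};
  OutlinedFunc fn = outline("tpu0_func", region);
  std::cout << "func @" << fn.name << "(";
  for (const std::string &arg : fn.args) std::cout << arg << " ";
  std::cout << ")\n";
  return 0;
}
```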
+ +module { + // CHECK-LABEL: func @multiplelaunches + // CHECK-SAME: (%[[ARG_0:[a-z0-9]*]]: tensor) + func @multiplelaunches(%arg0: tensor) -> tensor { + %0 = tf_executor.graph { + %1:2 = tf_executor.island { + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A"(%[[ARG_0]]) + %2 = "tf.A"(%arg0) : (tensor) -> tensor + + // CHECK: %[[C_OUTPUT:[0-9]*]] = "tf_device.launch_func"(%[[A_OUTPUT]]) {device = "tpu0", func = @tpu0_func} + %3 = "tf_device.launch"() ( { + %4 = "tf.B"(%2) : (tensor) -> tensor + "tf_device.return"(%4) : (tensor) -> () + }) {device = "tpu0"} : () -> tensor + + // CHECK: tf_executor.yield %[[C_OUTPUT]] + tf_executor.yield %3 : tensor + } + tf_executor.fetch %1#0 : tensor + } + return %0 : tensor + } + +// CHECK-LABEL: func @tpu0_func +// CHECK-SAME: (%[[TPU0_FUNC_ARG_0:[a-z0-9]*]]: tensor) -> tensor +// CHECK: %[[TPU0_FUNC_B_OUTPUT:[0-9]*]] = "tf.B"(%[[TPU0_FUNC_ARG_0]]) +// CHECK: return %[[TPU0_FUNC_B_OUTPUT]] +} + +// ----- + +// Tests that multiple `tf_device.launch` that depend on each other are +// correctly handled. + +module { + // CHECK-LABEL: func @multiplelaunches + // CHECK-SAME: (%[[ARG_0:[a-z0-9]*]]: tensor) + func @multiplelaunches(%arg0: tensor) -> tensor { + %0 = tf_executor.graph { + %1:2 = tf_executor.island { + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A"(%[[ARG_0]]) + %2 = "tf.A"(%arg0) : (tensor) -> tensor + + // CHECK: %[[C_OUTPUT:[0-9]*]] = "tf_device.launch_func"(%[[A_OUTPUT]]) {device = "tpu0", func = @tpu0_func} + %3 = "tf_device.launch"() ( { + %6 = "tf.B"(%2) : (tensor) -> tensor + "tf_device.return"(%6) : (tensor) -> () + }) {device = "tpu0"} : () -> tensor + + // CHECK: %[[D_OUTPUT:[0-9]*]] = "tf.D"(%[[C_OUTPUT]]) + %4 = "tf.D"(%3) : (tensor) -> tensor + + // CHECK: %[[E_OUTPUT:[0-9]*]] = "tf_device.launch_func"(%[[C_OUTPUT]], %[[D_OUTPUT]]) {device = "gpu0", func = @gpu0_func} + %5 = "tf_device.launch"() ( { + %6 = "tf.E"(%3) : (tensor) -> tensor + %7 = "tf.F"(%4, %6) : (tensor, tensor) -> tensor + "tf_device.return"(%7) : (tensor) -> () + }) {device = "gpu0"} : () -> tensor + + // CHECK: tf_executor.yield %[[E_OUTPUT]] + tf_executor.yield %5 : tensor + } + tf_executor.fetch %1#0 : tensor + } + return %0 : tensor + } + +// CHECK-LABEL: func @tpu0_func +// CHECK-SAME: (%[[TPU0_FUNC_ARG_0:[a-z0-9]*]]: tensor) -> tensor +// CHECK: %[[TPU0_FUNC_B_OUTPUT:[0-9]*]] = "tf.B"(%[[TPU0_FUNC_ARG_0]]) +// CHECK: return %[[TPU0_FUNC_B_OUTPUT]] + +// CHECK-LABEL: func @gpu0_func +// CHECK-SAME: (%[[GPU0_FUNC_ARG_0:[a-z0-9]*]]: tensor, %[[GPU0_FUNC_ARG_1:[a-z0-9]*]]: tensor) -> tensor +// CHECK: %[[GPU0_FUNC_E_OUTPUT:[0-9]*]] = "tf.E"(%[[GPU0_FUNC_ARG_0]]) +// CHECK: %[[GPU0_FUNC_F_OUTPUT:[0-9]*]] = "tf.F"(%[[GPU0_FUNC_ARG_1]], %[[GPU0_FUNC_E_OUTPUT]]) +// CHECK: return %[[GPU0_FUNC_F_OUTPUT]] +} + +// ----- + +// Tests outlining launches with no live-in values. 
+ +module { + // CHECK-LABEL: func @multiplelaunches + // CHECK-SAME: (%[[ARG_0:[a-z0-9]*]]: tensor) + func @multiplelaunches(%arg0: tensor) -> tensor { + %0 = tf_executor.graph { + %1:2 = tf_executor.island { + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf_device.launch_func"() {device = "tpu0", func = @tpu0_func} + %2 = "tf_device.launch"() ( { + %3 = "tf.A"() : () -> tensor + "tf_device.return"(%3) : (tensor) -> () + }) {device = "tpu0"} : () -> tensor + + // CHECK: tf_executor.yield %[[A_OUTPUT]] + tf_executor.yield %2 : tensor + } + tf_executor.fetch %1#0 : tensor + } + return %0 : tensor + } + +// CHECK-LABEL: func @tpu0_func +// CHECK-SAME: () -> tensor +// CHECK: %[[TPU0_FUNC_A_OUTPUT:[0-9]*]] = "tf.A"() +// CHECK: return %[[TPU0_FUNC_A_OUTPUT]] +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/constant-fold.mlir b/tensorflow/compiler/mlir/tensorflow/tests/constant-fold.mlir index 51aaf6edad4..115d39d7701 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/constant-fold.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/constant-fold.mlir @@ -42,3 +42,38 @@ func @tfConst() -> (tensor<4xf32>, tensor<1x1x6x2xf32>) { // CHECK-DAG: constant dense<0.242886767> : tensor<1x1x6x2xf32> return %0, %21 : tensor<4xf32>, tensor<1x1x6x2xf32> } + +// CHECK-LABEL: func @testAdd() -> tensor<2x2xi32> +func @testAdd() -> tensor<2x2xi32> { +^bb0: + %0 = constant dense<[[0, 1], [2, 3]]> : tensor<2x2xi32> + %1 = constant dense<1> : tensor<2xi32> + %2 = "tf.Add"(%0, %1) {device = "", name = "add"} : (tensor<2x2xi32>, tensor<2xi32>) -> tensor<2x2xi32> + // CHECK: [[cst:%.*]] = constant dense<{{\[\[}}1, 2], {{\[}}3, 4]]> : tensor<2x2xi32> + // CHECK-NEXT: return [[cst]] : tensor<2x2xi32> + return %2: tensor<2x2xi32> +} + +// Ops with side effects should not get constant folded. +// CHECK-LABEL: func @testSideEffectOp() -> tensor<3xf32> +func @testSideEffectOp() -> tensor<3xf32> { + %0 = constant dense<[3]> : tensor<1xi32> + %1 = "tf.RandomUniform"(%0) {device = "", seed = 3 : i64, seed2 = 5 : i64} : (tensor<1xi32>) -> tensor<3xf32> + // CHECK: %[[random:.*]] = "tf.RandomUniform" + // CHECK: return %[[random]] + return %1: tensor<3xf32> +} + +// Ops with unimplemnted attributes which couldn't be added to the TFE_Op. 
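Taken together, the constant-fold tests show that folding only fires when the op is side-effect free and every operand is already constant: `tf.Add` of two constants folds, while `tf.RandomUniform` is left untouched. A hedged sketch of that gating predicate; `FoldCandidate` and `canFold` are illustrative, not the real folding hook:

```cpp
// Sketch: fold an op to a constant only if it is known to be side-effect free
// and all operands are already constants, which is the behaviour the
// constant-fold tests exercise (tf.Add folds, tf.RandomUniform does not).
#include <iostream>
#include <set>
#include <string>
#include <vector>

struct FoldCandidate {
  std::string op_name;
  std::vector<bool> operand_is_constant;
};

bool canFold(const FoldCandidate &op) {
  // Illustrative deny-list; the real pass consults op side-effect information.
  static const std::set<std::string> kSideEffecting = {"tf.RandomUniform",
                                                       "tf.Assert"};
  if (kSideEffecting.count(op.op_name)) return false;
  for (bool is_const : op.operand_is_constant)
    if (!is_const) return false;
  return true;
}

int main() {
  std::cout << canFold({"tf.Add", {true, true}}) << "\n";      // 1: folds
  std::cout << canFold({"tf.RandomUniform", {true}}) << "\n";  // 0: side effects
  std::cout << canFold({"tf.Add", {true, false}}) << "\n";     // 0: non-constant input
  return 0;
}
```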
+// CHECK-LABEL: func @testUnimplementedOp() -> (tensor, tensor) +func @testUnimplementedOp() -> (tensor, tensor) { + %0 = constant dense<1> : tensor + %1 = constant dense<2> : tensor + %2 = "tf.Maximum"(%0, %1) {_output_shapes = ["tfshape$"]} : (tensor, tensor) -> tensor + %3 = "tf.Minimum"(%0, %1) {random_attr = "hello"} : (tensor, tensor) -> tensor + return %2, %3: tensor, tensor + +// CHECK-NEXT: %[[CST:.*]] = constant +// CHECK-NEXT: %[[CST1:.*]] = constant +// CHECK-NEXT: return %[[CST]], %[[CST1]] +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/control_to_executor_dialect.mlir b/tensorflow/compiler/mlir/tensorflow/tests/control_to_executor_dialect.mlir index b1a9dd71fc7..48f4c8f77df 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/control_to_executor_dialect.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/control_to_executor_dialect.mlir @@ -79,7 +79,7 @@ func @LoopTest() { // CHECK-NEXT: %{{[0-9]*}} = "tf.Add"(%[[IDENTITY]]#0, %[[CONST_ADD]]#0) {T = "tfdtype$DT_INT32", device = "", name = "while/Add"} : (tensor<*xi32>, tensor) -> tensor<*xi32> // CHECK-NEXT: tf_executor.yield %{{[0-9]*}} : tensor<*xi32> // CHECK-NEXT: } -// CHECK-NEXT: %[[CT:[0-9]*]] = tf_executor.ControlTrigger %2, %12#1, %9#1 {_tpu_replicate = "cluster", device = "", name = "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/b_sync"} +// CHECK-NEXT: %[[CT:[0-9]*]] = tf_executor.ControlTrigger %[[NOOP]], %[[ADD]]#1, %[[EXIT]]#1 {_tpu_replicate = "cluster", device = "", name = "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/b_sync"} // CHECK-NEXT: tf_executor.NextIteration.Sink [%[[NEXTIT_SRC]]#1] %[[ADD]]#0, %[[CT]] : tensor<*xi32> {T = "tfdtype$DT_INT32", device = "", id = 0 : i64, name = "while/NextIteration"} // CHECK-NEXT: tf_executor.fetch // CHECK-NEXT: } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/empty-main.mlir b/tensorflow/compiler/mlir/tensorflow/tests/empty-main.mlir new file mode 100644 index 00000000000..4a4aa277067 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/empty-main.mlir @@ -0,0 +1,15 @@ +// RUN: tf-opt -tf-executor-to-control-conversion %s | FileCheck %s --check-prefix=CONTROL --dump-input=fail +// RUN: tf-opt -tf-control-to-executor-conversion %s | FileCheck %s --check-prefix=EXECUTOR --dump-input=fail + +// CONTROL-LABEL: func @main +// CONTROL-NEXT: return + +// EXECUTOR-LABEL: func @main +// EXECUTOR-NEXT: tf_executor.graph { +// EXECUTOR-NEXT: tf_executor.fetch +// EXECUTOR-NEXT: } +// EXECUTOR-NEXT: return + +func @main() { + return +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/executor_canonicalize.mlir b/tensorflow/compiler/mlir/tensorflow/tests/executor_canonicalize.mlir new file mode 100644 index 00000000000..5b4e8e16cbb --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/executor_canonicalize.mlir @@ -0,0 +1,349 @@ +// RUN: tf-opt %s -canonicalize | FileCheck %s --dump-input=fail + + +// Test single graph with no outputs and one island is folded away. +// CHECK-LABEL: func @graph_with_no_outputs +// CHECK-SAME: (%[[ARG_0:[a-z0-9]*]]: tensor) +func @graph_with_no_outputs(%arg0 : tensor) { + tf_executor.graph { + %1:2 = tf_executor.island { + %3 = "tf.opA"(%arg0) : (tensor) -> tensor + %4 = "tf.opB"(%3) : (tensor) -> tensor + tf_executor.yield %3 : tensor + } + tf_executor.fetch + } + return +} + +// CHECK-NEXT: %[[OP_A:[0-9]*]] = "tf.opA"(%[[ARG_0]]) +// CHECK-NEXT: "tf.opB"(%[[OP_A]]) +// CHECK-NEXT: return + + +// Test single graph with some outputs and one island is folded away. 
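The executor canonicalization cases check that a `tf_executor.graph` folds away when its body is nothing but a single island (or is empty) and the fetch simply forwards values; any other executor op, such as `tf_executor.LoopCond`, keeps the graph intact. A toy summary of that condition, with `GraphSummary` and `canInlineGraph` as invented names:

```cpp
// Sketch: a tf_executor.graph can be inlined into its parent when its body
// holds at most one island and no other executor-specific ops; the island's
// ops are hoisted out and the fetch operands are forwarded directly.
#include <iostream>

struct GraphSummary {
  int num_islands;             // islands directly in the graph body
  int num_other_executor_ops;  // LoopCond, ControlTrigger, NextIteration, ...
};

bool canInlineGraph(const GraphSummary &g) {
  return g.num_islands <= 1 && g.num_other_executor_ops == 0;
}

int main() {
  std::cout << canInlineGraph({1, 0}) << "\n";  // single island: folds away
  std::cout << canInlineGraph({0, 0}) << "\n";  // empty graph: folds away
  std::cout << canInlineGraph({1, 1}) << "\n";  // LoopCond present: kept
  std::cout << canInlineGraph({2, 0}) << "\n";  // multiple islands: kept
  return 0;
}
```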
+// CHECK-LABEL: func @graph_with_outputs +// CHECK-SAME: (%[[ARG_0:[a-z0-9]*]]: tensor) +func @graph_with_outputs(%arg0 : tensor) -> (tensor, tensor) { + %0:3 = tf_executor.graph { + %1:4 = tf_executor.island { + %3 = "tf.opA"(%arg0) : (tensor) -> tensor + %4 = "tf.opB"(%3) : (tensor) -> tensor + %5 = "tf.opC"(%4) : (tensor) -> tensor + tf_executor.yield %3, %5, %4 : tensor, tensor, tensor + } + tf_executor.fetch %1#1, %1#0, %1#2 : tensor, tensor, tensor + } + return %0#2, %0#1 : tensor, tensor +} + +// CHECK-NEXT: %[[OP_A:[0-9]*]] = "tf.opA"(%[[ARG_0]]) +// CHECK-NEXT: %[[OP_B:[0-9]*]] = "tf.opB"(%[[OP_A]]) +// CHECK-NEXT: "tf.opC"(%[[OP_B]]) +// CHECK-NEXT: return %[[OP_B]], %[[OP_A]] : tensor, tensor + + +// Test nested graphs and islands. +// CHECK-LABEL: func @nested_graph +// CHECK-SAME: (%[[ARG_0:[a-z0-9]*]]: tensor) +func @nested_graph(%arg0 : tensor) -> (tensor, tensor) { + %0:3 = tf_executor.graph { + %1:4 = tf_executor.island { + %2:3 = tf_executor.graph { + %3:4 = tf_executor.island { + %4 = "tf.opA"(%arg0) : (tensor) -> tensor + %5 = "tf.opB"(%4) : (tensor) -> tensor + %6 = "tf.opC"(%5) : (tensor) -> tensor + tf_executor.yield %4, %6, %5 : tensor, tensor, tensor + } + tf_executor.fetch %3#2, %3#0, %3#1 : tensor, tensor, tensor + } + tf_executor.yield %2#1, %2#1, %2#0 : tensor, tensor, tensor + } + tf_executor.fetch %1#1, %1#0, %1#2 : tensor, tensor, tensor + } + return %0#2, %0#1 : tensor, tensor +} + +// CHECK-NEXT: %[[OP_A:[0-9]*]] = "tf.opA"(%[[ARG_0]]) +// CHECK-NEXT: %[[OP_B:[0-9]*]] = "tf.opB"(%[[OP_A]]) +// CHECK-NEXT: "tf.opC"(%[[OP_B]]) +// CHECK-NEXT: return %[[OP_B]], %[[OP_A]] : tensor, tensor + + +// Test single graph with multiple islands is unmodified. +// CHECK-LABEL: func @graph_with_multiple_islands +// CHECK-SAME: (%[[ARG_0:[a-z0-9]*]]: tensor) +func @graph_with_multiple_islands(%arg0 : tensor) -> (tensor, tensor) { + %0:3 = tf_executor.graph { + %1:4 = tf_executor.island { + %3 = "tf.opA"(%arg0) : (tensor) -> tensor + %4 = "tf.opB"(%3) : (tensor) -> tensor + %5 = "tf.opC"(%4) : (tensor) -> tensor + tf_executor.yield %3, %5, %4 : tensor, tensor, tensor + } + %6:3 = tf_executor.island { + %7 = "tf.opD"(%arg0) : (tensor) -> tensor + %8 = "tf.opE"(%7) : (tensor) -> tensor + tf_executor.yield %8, %7 : tensor, tensor + } + tf_executor.fetch %1#1, %1#0, %6#0 : tensor, tensor, tensor + } + return %0#2, %0#1 : tensor, tensor +} + +// CHECK-NEXT: %[[GRAPH:[0-9]*]]:3 = tf_executor.graph { +// CHECK-NEXT: %[[ISLAND_0:[0-9]*]]:4 = tf_executor.island { +// CHECK-NEXT: %[[OP_A:[0-9]*]] = "tf.opA"(%[[ARG_0]]) +// CHECK-NEXT: %[[OP_B:[0-9]*]] = "tf.opB"(%[[OP_A]]) +// CHECK-NEXT: %[[OP_C:[0-9]*]] = "tf.opC"(%[[OP_B]]) +// CHECK-NEXT: tf_executor.yield %[[OP_A]], %[[OP_C]], %[[OP_B]] : tensor, tensor, tensor +// CHECK: %[[ISLAND_1:[0-9]*]]:3 = tf_executor.island { +// CHECK-NEXT: %[[OP_D:[0-9]*]] = "tf.opD"(%[[ARG_0]]) +// CHECK-NEXT: %[[OP_E:[0-9]*]] = "tf.opE"(%[[OP_D]]) +// CHECK-NEXT: tf_executor.yield %[[OP_E]], %[[OP_D]] : tensor, tensor +// CHECK: tf_executor.fetch %[[ISLAND_0]]#1, %[[ISLAND_0]]#0, %[[ISLAND_1]]#0 : tensor, tensor, tensor +// CHECK: return %[[GRAPH]]#2, %[[GRAPH]]#1 : tensor, tensor + + +// Test single graph with an island and executor ops is unmodified. 
+// CHECK-LABEL: func @graph_with_island_and_executor_op +// CHECK-SAME: (%[[ARG_0:[a-z0-9]*]]: tensor) +func @graph_with_island_and_executor_op(%arg0 : tensor) -> (tensor, tensor) { + %0:3 = tf_executor.graph { + %1:4 = tf_executor.island { + %3 = "tf.opA"(%arg0) : (tensor) -> tensor + %4 = "tf.opB"(%3) : (tensor) -> tensor + %5 = "tf.opC"(%4) : (tensor) -> tensor + tf_executor.yield %3, %5, %4 : tensor, tensor, tensor + } + %6:2 = tf_executor.LoopCond %1#0 : tensor + tf_executor.fetch %1#1, %1#0, %6#0 : tensor, tensor, tensor + } + return %0#2, %0#1 : tensor, tensor +} + +// CHECK-NEXT: %[[GRAPH:[0-9]*]]:3 = tf_executor.graph { +// CHECK-NEXT: %[[ISLAND:[0-9]*]]:4 = tf_executor.island { +// CHECK-NEXT: %[[OP_A:[0-9]*]] = "tf.opA"(%[[ARG_0]]) +// CHECK-NEXT: %[[OP_B:[0-9]*]] = "tf.opB"(%[[OP_A]]) +// CHECK-NEXT: %[[OP_C:[0-9]*]] = "tf.opC"(%[[OP_B]]) +// CHECK-NEXT: tf_executor.yield %[[OP_A]], %[[OP_C]], %[[OP_B]] : tensor, tensor, tensor +// CHECK: %[[LOOP_COND:[0-9]*]]:2 = tf_executor.LoopCond %[[ISLAND]]#0 +// CHECK-NEXT: tf_executor.fetch %[[ISLAND]]#1, %[[ISLAND]]#0, %[[LOOP_COND]]#0 : tensor, tensor, tensor +// CHECK: return %[[GRAPH]]#2, %[[GRAPH]]#1 : tensor, tensor + + +// Test multiple graphs collapsed. +// CHECK-LABEL: func @multiple_graphs +// CHECK-SAME: (%[[ARG_0:[a-z0-9]*]]: tensor) +func @multiple_graphs(%arg0 : tensor) -> (tensor, tensor, tensor, tensor, tensor, tensor) { + %0:4 = tf_executor.graph { + %2:4 = tf_executor.island { + %3 = "tf.opA"(%arg0) : (tensor) -> tensor + %4 = "tf.opB"(%3) : (tensor) -> tensor + %5 = "tf.opC"(%4) : (tensor) -> tensor + tf_executor.yield %3, %5, %4 : tensor, tensor, tensor + } + tf_executor.fetch %arg0, %2#0, %2#1, %2#2 : tensor, tensor, tensor, tensor + } + %1:3 = tf_executor.graph { + %6:3 = tf_executor.island { + %7 = "tf.opD"(%arg0) : (tensor) -> tensor + %8 = "tf.opE"(%7) : (tensor) -> tensor + tf_executor.yield %8, %7 : tensor, tensor + } + tf_executor.fetch %arg0, %6#0, %6#1 : tensor, tensor, tensor + } + return %1#1, %1#0, %1#2, %0#1, %0#0, %0#3 : tensor, tensor, tensor, tensor, tensor, tensor +} + +// CHECK-NEXT: %[[OP_A:[0-9]*]] = "tf.opA"(%[[ARG_0]]) +// CHECK-NEXT: %[[OP_B:[0-9]*]] = "tf.opB"(%[[OP_A]]) +// CHECK-NEXT: "tf.opC"(%[[OP_B]]) +// CHECK-NEXT: %[[OP_D:[0-9]*]] = "tf.opD"(%[[ARG_0]]) +// CHECK-NEXT: %[[OP_E:[0-9]*]] = "tf.opE"(%[[OP_D]]) +// CHECK-NEXT: return %[[OP_E]], %[[ARG_0]], %[[OP_D]], %[[OP_A]], %[[ARG_0]], %[[OP_B]] : tensor, tensor, tensor, tensor, tensor, tensor + + +// Test empty graph with no outputs. +// CHECK-LABEL: func @empty_graph_with_no_outputs +func @empty_graph_with_no_outputs() { + tf_executor.graph { + tf_executor.fetch + } + return +} + +// CHECK-NEXT: return + + +// Test empty graph with some outputs. +// CHECK-LABEL: func @empty_graph_with_outputs +// CHECK-SAME: (%[[ARG_0:[a-z0-9]*]]: tensor, %[[ARG_1:[a-z0-9]*]]: tensor) +func @empty_graph_with_outputs(%arg0 : tensor, %arg1 : tensor) -> (tensor, tensor) { + %0:2 = tf_executor.graph { + tf_executor.fetch %arg1, %arg0 : tensor, tensor + } + return %0#0, %0#1 : tensor, tensor +} + +// CHECK-NEXT: return %[[ARG_1]], %[[ARG_0]] : tensor, tensor + + +// Test multiple empty graphs. 
+// CHECK-LABEL: func @empty_graphs +// CHECK-SAME: (%[[ARG_0:[a-z0-9]*]]: tensor, %[[ARG_1:[a-z0-9]*]]: tensor) +func @empty_graphs(%arg0 : tensor, %arg1 : tensor) -> (tensor, tensor) { + %0 = tf_executor.graph { + tf_executor.fetch %arg1 : tensor + } + tf_executor.graph { + tf_executor.fetch + } + %1 = tf_executor.graph { + tf_executor.fetch %arg0 : tensor + } + return %0, %1 : tensor, tensor +} + +// CHECK-NEXT: return %[[ARG_1]], %[[ARG_0]] : tensor, tensor + + +// Test empty graphs and graphs with a single island. +// CHECK-LABEL: func @empty_and_filled_graphs +// CHECK-SAME: (%[[ARG_0:[a-z0-9]*]]: tensor) +func @empty_and_filled_graphs(%arg0 : tensor) -> (tensor, tensor, tensor, tensor, tensor, tensor) { + %0:4 = tf_executor.graph { + %2:4 = tf_executor.island { + %3 = "tf.opA"(%arg0) : (tensor) -> tensor + %4 = "tf.opB"(%3) : (tensor) -> tensor + %5 = "tf.opC"(%4) : (tensor) -> tensor + tf_executor.yield %3, %5, %4 : tensor, tensor, tensor + } + tf_executor.fetch %arg0, %2#0, %2#1, %2#2 : tensor, tensor, tensor, tensor + } + tf_executor.graph { + tf_executor.fetch + } + %1:3 = tf_executor.graph { + %6:3 = tf_executor.island { + %7 = "tf.opD"(%arg0) : (tensor) -> tensor + %8 = "tf.opE"(%7) : (tensor) -> tensor + tf_executor.yield %8, %7 : tensor, tensor + } + tf_executor.fetch %arg0, %6#0, %6#1 : tensor, tensor, tensor + } + %9 = tf_executor.graph { + tf_executor.fetch %arg0 : tensor + } + return %1#1, %1#0, %9, %0#1, %0#0, %0#3 : tensor, tensor, tensor, tensor, tensor, tensor +} + +// CHECK-NEXT: %[[OP_A:[0-9]*]] = "tf.opA"(%[[ARG_0]]) +// CHECK-NEXT: %[[OP_B:[0-9]*]] = "tf.opB"(%[[OP_A]]) +// CHECK-NEXT: "tf.opC"(%[[OP_B]]) +// CHECK-NEXT: %[[OP_D:[0-9]*]] = "tf.opD"(%[[ARG_0]]) +// CHECK-NEXT: %[[OP_E:[0-9]*]] = "tf.opE"(%[[OP_D]]) +// CHECK-NEXT: return %[[OP_E]], %[[ARG_0]], %[[ARG_0]], %[[OP_A]], %[[ARG_0]], %[[OP_B]] : tensor, tensor, tensor, tensor, tensor, tensor + + +// Test single empty island in graph with control output in graph fetch results +// in graph being removed. +// CHECK-LABEL: func @single_empty_island_single_graph_control +func @single_empty_island_single_graph_control() { + tf_executor.graph { + %0 = tf_executor.island { + tf_executor.yield + } + tf_executor.fetch %0 : !tf_executor.control + } + return +} + +// CHECK-NEXT: return + + +// Test empty island with no operands and no data result user is removed. +// Control result users should also have their respective operands removed. +// CHECK-LABEL: func @empty_island_no_operand_no_data_result +func @empty_island_no_operand_no_data_result() { + tf_executor.graph { + %0 = tf_executor.island { + tf_executor.yield + } + %1 = tf_executor.island(%0) { + %3 = "tf.opA"() : () -> tensor + tf_executor.yield + } + %2 = tf_executor.island(%0, %1) { + %4 = "tf.opB"() : () -> tensor + tf_executor.yield + } + tf_executor.fetch + } + return +} + +// CHECK: %[[ISLAND_0:[0-9]*]] = tf_executor.island { +// CHECK-NEXT: "tf.opA" +// CHECK: tf_executor.island(%[[ISLAND_0]]) { +// CHECK-NEXT: "tf.opB" +// CHECK-NOT: tf_executor.island + + +// Test empty island with one operand and no data results is removed and the +// operand is forwarded to its control result users. 
+// CHECK-LABEL: func @empty_island_one_operand_no_data_result +func @empty_island_one_operand_no_data_result() { + tf_executor.graph { + %0 = tf_executor.island { + %3 = "tf.opA"() : () -> tensor + tf_executor.yield + } + %1 = tf_executor.island(%0) { + tf_executor.yield + } + %2 = tf_executor.island(%1) { + %4 = "tf.opB"() : () -> tensor + tf_executor.yield + } + tf_executor.fetch + } + return +} + +// CHECK: %[[ISLAND_1:[0-9]*]] = tf_executor.island { +// CHECK-NEXT: "tf.opA" +// CHECK: tf_executor.island(%[[ISLAND_1]]) { +// CHECK-NEXT: "tf.opB" +// CHECK-NOT: tf_executor.island + + +// Test empty island with no operands, one data result and no control result +// users is removed and its data result forwarded to its users. +// CHECK-LABEL: func @empty_island_no_operand_one_data_no_control_result +// CHECK-SAME: (%[[ARG_0:[a-z0-9]*]]: tensor) +func @empty_island_no_operand_one_data_no_control_result(%arg0 : tensor) { + tf_executor.graph { + %0:2 = tf_executor.island() { + tf_executor.yield %arg0 : tensor + } + %1 = tf_executor.island { + %3 = "tf.opA"(%0#0) : (tensor) -> tensor + tf_executor.yield + } + %2 = tf_executor.island() { + %4 = "tf.opB"(%0#0) : (tensor) -> tensor + tf_executor.yield + } + tf_executor.fetch + } + return +} + +// CHECK: tf_executor.island { +// CHECK-NEXT: "tf.opA"(%[[ARG_0]]) +// CHECK: tf_executor.island { +// CHECK-NEXT: "tf.opB"(%[[ARG_0]]) +// CHECK-NOT: tf_executor.island diff --git a/tensorflow/compiler/mlir/tensorflow/tests/executor_island_coarsening.mlir b/tensorflow/compiler/mlir/tensorflow/tests/executor_island_coarsening.mlir new file mode 100644 index 00000000000..a9e83dd006c --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/executor_island_coarsening.mlir @@ -0,0 +1,460 @@ +// RUN: tf-opt %s -tf-executor-island-coarsening | FileCheck %s --dump-input=fail + + +// Test that islands linked by a control dependency are merged. +// CHECK-LABEL: func @control_input +// CHECK-SAME: (%[[ARG_0:[a-z0-9]*]]: tensor) +func @control_input(%arg0 : tensor) -> tensor { + %0 = tf_executor.graph { + %1:2 = tf_executor.island { + %3 = "tf.opA"(%arg0) : (tensor) -> tensor + tf_executor.yield %3 : tensor + } + %2:2 = tf_executor.island(%1#1) { + %4 = "tf.opB"() : () -> tensor + tf_executor.yield %4 : tensor + } + tf_executor.fetch %2#0 : tensor + } + return %0 : tensor +} + +// CHECK: %[[ISLAND:[0-9]*]]:2 = tf_executor.island { +// CHECK-NEXT: "tf.opA"(%[[ARG_0]]) +// CHECK-NEXT: %[[OP_B:[0-9]*]] = "tf.opB" +// CHECK-NEXT: tf_executor.yield %[[OP_B]] : tensor +// CHECK: tf_executor.fetch %[[ISLAND]]#0 : tensor + + +// Test that islands linked by a data dependency are merged. +// CHECK-LABEL: func @data_input +// CHECK-SAME: (%[[ARG_0:[a-z0-9]*]]: tensor) +func @data_input(%arg0 : tensor) -> tensor { + %0 = tf_executor.graph { + %1:2 = tf_executor.island { + %3 = "tf.opA"(%arg0) : (tensor) -> tensor + tf_executor.yield %3 : tensor + } + %2:2 = tf_executor.island { + %4 = "tf.opB"(%1#0) : (tensor) -> tensor + tf_executor.yield %4 : tensor + } + tf_executor.fetch %2#0 : tensor + } + return %0 : tensor +} + +// CHECK: %[[ISLAND:[0-9]*]]:2 = tf_executor.island { +// CHECK-NEXT: %[[OP_A:[0-9]*]] = "tf.opA"(%[[ARG_0]]) +// CHECK-NEXT: %[[OP_B:[0-9]*]] = "tf.opB"(%[[OP_A]]) +// CHECK-NEXT: tf_executor.yield %[[OP_B]] : tensor +// CHECK: tf_executor.fetch %[[ISLAND]]#0 : tensor + + +// Test empty/trivial islands are merged. 
+// CHECK-LABEL: func @empty_islands +// CHECK-SAME: (%[[ARG_0:[a-z0-9]*]]: tensor, %[[ARG_1:[a-z0-9]*]]: tensor) +func @empty_islands(%arg0 : tensor, %arg1 : tensor) -> (tensor, tensor) { + %0:2 = tf_executor.graph { + %1:2 = tf_executor.island { + tf_executor.yield %arg1 : tensor + } + %2:2 = tf_executor.island { + tf_executor.yield %arg0 : tensor + } + %3:2 = tf_executor.island { + tf_executor.yield %1#0 : tensor + } + %4:2 = tf_executor.island { + tf_executor.yield %2#0 : tensor + } + %5:3 = tf_executor.island { + %10:2 = "tf.opA"(%3#0, %4#0) : (tensor, tensor) -> (tensor, tensor) + tf_executor.yield %10#0, %10#1 : tensor, tensor + } + %6:2 = tf_executor.island { + tf_executor.yield %5#0 : tensor + } + %7:2 = tf_executor.island { + tf_executor.yield %5#1 : tensor + } + %8:3 = tf_executor.island { + tf_executor.yield %6#0, %7#0 : tensor, tensor + } + %9 = tf_executor.island(%8#2) { + tf_executor.yield + } + tf_executor.fetch %8#0, %8#1 : tensor, tensor + } + return %0#0, %0#1 : tensor, tensor +} + +// CHECK: %[[ISLAND:[0-9]*]]:3 = tf_executor.island { +// CHECK-NEXT: %[[OP_A:[0-9]*]]:2 = "tf.opA"(%[[ARG_1]], %[[ARG_0]]) +// CHECK-NEXT: tf_executor.yield %[[OP_A]]#0, %[[OP_A]]#1 : tensor, tensor +// CHECK: tf_executor.fetch %[[ISLAND]]#0, %[[ISLAND]]#1 : tensor, tensor + + +// Test merging islands handle merging results. +// CHECK-LABEL: func @multiple_outputs +// CHECK-SAME: (%[[ARG_0:[a-z0-9]*]]: tensor, %[[ARG_1:[a-z0-9]*]]: tensor) +func @multiple_outputs(%arg0 : tensor, %arg1 : tensor) -> (tensor, tensor) { + %0:2 = tf_executor.graph { + %1:2 = tf_executor.island { + %3 = "tf.opA"(%arg0) : (tensor) -> tensor + tf_executor.yield %3 : tensor + } + %2:2 = tf_executor.island(%1#1) { + %4 = "tf.opB"(%arg1) : (tensor) -> tensor + tf_executor.yield %4 : tensor + } + tf_executor.fetch %1#0, %2#0 : tensor, tensor + } + return %0#0, %0#1 : tensor, tensor +} + +// CHECK: %[[ISLAND:[0-9]*]]:3 = tf_executor.island { +// CHECK-NEXT: %[[OP_A:[0-9]*]] = "tf.opA"(%[[ARG_0]]) +// CHECK-NEXT: %[[OP_B:[0-9]*]] = "tf.opB"(%[[ARG_1]]) +// CHECK-NEXT: tf_executor.yield %[[OP_A]], %[[OP_B]] : tensor, tensor +// CHECK: tf_executor.fetch %[[ISLAND]]#0, %[[ISLAND]]#1 : tensor, tensor + + +// Test merging islands with multiple inner ops. +// CHECK-LABEL: func @multi_op_regions +// CHECK-SAME: (%[[ARG_0:[a-z0-9]*]]: tensor, %[[ARG_1:[a-z0-9]*]]: tensor) +func @multi_op_regions(%arg0 : tensor, %arg1 : tensor) -> tensor { + %0 = tf_executor.graph { + %1:2 = tf_executor.island { + %2 = "tf.opA"(%arg0, %arg0) : (tensor, tensor) -> tensor + %3 = "tf.opB"(%2, %arg0) : (tensor, tensor) -> tensor + tf_executor.yield %3 : tensor + } + %4:2 = tf_executor.island { + %5 = "tf.opC"(%1#0, %arg1) : (tensor, tensor) -> tensor + %6 = "tf.opD"(%5, %arg0) : (tensor, tensor) -> tensor + tf_executor.yield %6 : tensor + } + tf_executor.fetch %4#0 : tensor + } + return %0 : tensor +} + +// CHECK: %[[ISLAND:[0-9]*]]:2 = tf_executor.island { +// CHECK-NEXT: %[[OP_A:[0-9]*]] = "tf.opA"(%[[ARG_0]], %[[ARG_0]]) +// CHECK-NEXT: %[[OP_B:[0-9]*]] = "tf.opB"(%[[OP_A]], %[[ARG_0]]) +// CHECK-NEXT: %[[OP_C:[0-9]*]] = "tf.opC"(%[[OP_B]], %[[ARG_1]]) +// CHECK-NEXT: %[[OP_D:[0-9]*]] = "tf.opD"(%[[OP_C]], %[[ARG_0]]) +// CHECK-NEXT: tf_executor.yield %[[OP_D]] : tensor +// CHECK: tf_executor.fetch %[[ISLAND]]#0 : tensor + + +// Test merging multiple islands with multiple inner ops preserves order. 
+// CHECK-LABEL: func @transitive_preserve_order +// CHECK-SAME: (%[[ARG_0:[a-z0-9]*]]: tensor, %[[ARG_1:[a-z0-9]*]]: tensor) +func @transitive_preserve_order(%arg0 : tensor, %arg1 : tensor) -> tensor { + %0 = tf_executor.graph { + %1:2 = tf_executor.island { + %2 = "tf.opA"(%arg0, %arg0) : (tensor, tensor) -> tensor + %3 = "tf.opB"(%2, %arg0) : (tensor, tensor) -> tensor + tf_executor.yield %3 : tensor + } + %4:2 = tf_executor.island { + %5 = "tf.opC"(%1#0, %arg1) : (tensor, tensor) -> tensor + %6 = "tf.opD"(%5, %arg0) : (tensor, tensor) -> tensor + tf_executor.yield %6 : tensor + } + %7:2 = tf_executor.island { + %8 = "tf.opE"(%4#0, %1#0) : (tensor, tensor) -> tensor + %9 = "tf.opF"(%8, %8) : (tensor, tensor) -> tensor + tf_executor.yield %9 : tensor + } + tf_executor.fetch %7#0 : tensor + } + return %0 : tensor +} + +// CHECK: %[[ISLAND:[0-9]*]]:2 = tf_executor.island { +// CHECK-NEXT: %[[OP_A:[0-9]*]] = "tf.opA"(%[[ARG_0]], %[[ARG_0]]) +// CHECK-NEXT: %[[OP_B:[0-9]*]] = "tf.opB"(%[[OP_A]], %[[ARG_0]]) +// CHECK-NEXT: %[[OP_C:[0-9]*]] = "tf.opC"(%[[OP_B]], %[[ARG_1]]) +// CHECK-NEXT: %[[OP_D:[0-9]*]] = "tf.opD"(%[[OP_C]], %[[ARG_0]]) +// CHECK-NEXT: %[[OP_E:[0-9]*]] = "tf.opE"(%[[OP_D]], %[[OP_B]]) +// CHECK-NEXT: %[[OP_F:[0-9]*]] = "tf.opF"(%[[OP_E]], %[[OP_E]]) +// CHECK-NEXT: tf_executor.yield %[[OP_F]] : tensor +// CHECK: tf_executor.fetch %[[ISLAND]]#0 : tensor + + +// Test if islands can be merged when non dependent islands are interleaved. +// CHECK-LABEL: func @islands_interleaved +// CHECK-SAME: (%[[ARG_0:[a-z0-9]*]]: tensor, %[[ARG_1:[a-z0-9]*]]: tensor) +func @islands_interleaved(%arg0 : tensor, %arg1 : tensor) -> (tensor, tensor) { + %0:2 = tf_executor.graph { + %1:2 = tf_executor.island { + %7 = "tf.opA"(%arg0) : (tensor) -> tensor + tf_executor.yield %7 : tensor + } + %2:2 = tf_executor.island { + %8 = "tf.opB"(%arg1) : (tensor) -> tensor + tf_executor.yield %8 : tensor + } + %3:2 = tf_executor.island { + %9 = "tf.opC"(%1#0) : (tensor) -> tensor + tf_executor.yield %9 : tensor + } + %4:2 = tf_executor.island { + %10 = "tf.opD"(%2#0) : (tensor) -> tensor + tf_executor.yield %10 : tensor + } + %5:2 = tf_executor.island(%3#1) { + %11 = "tf.opE"(%arg0) : (tensor) -> tensor + tf_executor.yield %11 : tensor + } + %6:2 = tf_executor.island { + %12 = "tf.opF"(%arg1) : (tensor) -> tensor + tf_executor.yield %12 : tensor + } + tf_executor.fetch %4#0, %3#0 : tensor, tensor + } + return %0#0, %0#1 : tensor, tensor +} + +// CHECK: %[[ISLAND_0:[0-9]*]]:2 = tf_executor.island { +// CHECK-NEXT: %[[OP_B:[0-9]*]] = "tf.opB"(%[[ARG_1]]) +// CHECK-NEXT: %[[OP_D:[0-9]*]] = "tf.opD"(%[[OP_B]]) +// CHECK-NEXT: tf_executor.yield %[[OP_D]] : tensor +// CHECK: %[[ISLAND_1:[0-9]*]]:2 = tf_executor.island { +// CHECK-NEXT: %[[OP_A:[0-9]*]] = "tf.opA"(%[[ARG_0]]) +// CHECK-NEXT: %[[OP_C:[0-9]*]] = "tf.opC"(%[[OP_A]]) +// CHECK-NEXT: %{{[0-9]*}} = "tf.opE"(%[[ARG_0]]) +// CHECK-NEXT: tf_executor.yield %[[OP_C]] : tensor +// CHECK: tf_executor.island { +// CHECK-NEXT: %[[OP_F:[0-9]*]] = "tf.opF"(%[[ARG_1]]) +// CHECK-NEXT: tf_executor.yield %[[OP_F]] : tensor +// CHECK: tf_executor.fetch %[[ISLAND_0]]#0, %[[ISLAND_1]]#0 : tensor, tensor + + +// Test only islands are merged when other tf_executor ops are interleaved. 
+// CHECK-LABEL: func @merge_islands_only +func @merge_islands_only() { + tf_executor.graph { + %0:2 = tf_executor.island { + %14 = "tf.opA"() : () -> tensor + tf_executor.yield %14 : tensor + } + %1:2 = tf_executor.Enter %0#0 frame "while/while_context" : (tensor) -> (tensor<*xi32>, !tf_executor.control) + %2 = tf_executor.island { + "tf.opB"() : () -> () + tf_executor.yield + } + %3:3 = tf_executor.NextIteration.Source : tensor<*xi32> + %4:3 = tf_executor.Merge %3#0, %1#0 : tensor<*xi32> + %5:2 = tf_executor.island(%4#2) { + %15 = "tf.opC"() : () -> tensor + tf_executor.yield %15 : tensor + } + %6:2 = tf_executor.island { + %16 = "tf.opD"(%4#0, %5#0) : (tensor<*xi32>, tensor) -> tensor<*xi1> + tf_executor.yield %16 : tensor<*xi1> + } + %7:2 = tf_executor.LoopCond %6#0 : (tensor<*xi1>) -> (tensor, !tf_executor.control) + %8:3 = tf_executor.Switch %4#0, %7#0 : tensor<*xi32> + %9:2 = tf_executor.Exit %8#0 : tensor<*xi32> + %10:2 = tf_executor.island { + %17 = "tf.opE"(%8#1) : (tensor<*xi32>) -> tensor<*xi32> + tf_executor.yield %17 : tensor<*xi32> + } + %11:2 = tf_executor.island(%10#1) { + %18 = "tf.opF"() : () -> tensor + tf_executor.yield %18 : tensor + } + %12:2 = tf_executor.island { + %19 = "tf.opG"(%10#0, %11#0) : (tensor<*xi32>, tensor) -> tensor<*xi32> + tf_executor.yield %19 : tensor<*xi32> + } + %13 = tf_executor.ControlTrigger %2, %12#1, %9#1 + tf_executor.NextIteration.Sink [%3#1] %12#0, %13 : tensor<*xi32> + tf_executor.fetch + } + return +} + +// CHECK: %[[ISLAND_0:[0-9]*]]:2 = tf_executor.island { +// CHECK-NEXT: %[[OP_A:.*]] = "tf.opA" +// CHECK-NEXT: tf_executor.yield %[[OP_A]] : tensor +// CHECK: %[[ENTER:[0-9]*]]:2 = tf_executor.Enter %[[ISLAND_0]]#0 +// CHECK-NEXT: %[[ISLAND_1:[0-9]*]] = tf_executor.island { +// CHECK-NEXT: "tf.opB"() +// CHECK-NEXT: tf_executor.yield +// CHECK: %[[NEXTIT_SRC:[0-9]*]]:3 = tf_executor.NextIteration.Source +// CHECK-NEXT: %[[MERGE:[0-9]*]]:3 = tf_executor.Merge %[[NEXTIT_SRC]]#0, %[[ENTER]]#0 +// CHECK-NEXT: %[[ISLAND_2:[0-9]*]]:2 = tf_executor.island(%[[MERGE]]#2) { +// CHECK-NEXT: %[[OP_C:.*]] = "tf.opC" +// CHECK-NEXT: %[[OP_D:[0-9]*]] = "tf.opD"(%[[MERGE]]#0, %[[OP_C]]) +// CHECK-NEXT: tf_executor.yield %[[OP_D]] : tensor<*xi1> +// CHECK: %[[COND:[0-9]*]]:2 = tf_executor.LoopCond %[[ISLAND_2:[0-9]*]]#0 +// CHECK-NEXT: %[[SWITCH:[0-9]*]]:3 = tf_executor.Switch %[[MERGE]]#0, %[[COND]]#0 +// CHECK-NEXT: %[[EXIT:[0-9]*]]:2 = tf_executor.Exit %[[SWITCH]]#0 +// CHECK-NEXT: %[[ISLAND_3:[0-9]*]]:2 = tf_executor.island { +// CHECK-NEXT: %[[OP_E:[0-9]*]] = "tf.opE"(%[[SWITCH]]#1) +// CHECK-NEXT: %[[OP_F:.*]] = "tf.opF" +// CHECK-NEXT: %[[OP_G:[0-9]*]] = "tf.opG"(%[[OP_E]], %[[OP_F]]) +// CHECK-NEXT: tf_executor.yield %[[OP_G]] : tensor<*xi32> +// CHECK: %[[CT:[0-9]*]] = tf_executor.ControlTrigger %[[ISLAND_1]], %[[ISLAND_3]]#1, %[[EXIT]]#1 +// CHECK-NEXT: tf_executor.NextIteration.Sink [%[[NEXTIT_SRC]]#1] %[[ISLAND_3]]#0, %[[CT]] + + +// Test no merging took place as cycle would be formed otherwise. 
+// CHECK-LABEL: func @simple_potential_cycle +func @simple_potential_cycle() { + tf_executor.graph { + %0:2 = tf_executor.island { + %3 = "tf.opA"() : () -> tensor<1xf32> + tf_executor.yield %3 : tensor<1xf32> + } + %1 = tf_executor.ControlTrigger %0#1 + %2:3 = tf_executor.island(%1) { + %4 = "tf.opB"() : () -> tensor<1xf32> + tf_executor.yield %0#0, %4 : tensor<1xf32>, tensor<1xf32> + } + tf_executor.fetch + } + return +} + +// CHECK: %[[ISLAND:[0-9]*]]:2 = tf_executor.island { +// CHECK-NEXT: %[[OP_A:[0-9]*]] = "tf.opA" +// CHECK-NEXT: tf_executor.yield %[[OP_A]] : tensor<1xf32> +// CHECK: %[[CT:[0-9]*]] = tf_executor.ControlTrigger %[[ISLAND]]#1 +// CHECK-NEXT: tf_executor.island(%[[CT]]) { +// CHECK-NEXT: %[[OP_B:[0-9]*]] = "tf.opB" +// CHECK-NEXT: tf_executor.yield %[[ISLAND]]#0, %[[OP_B]] : tensor<1xf32>, tensor<1xf32> + + +// Test if island was merged into its result. +// CHECK-LABEL: func @merge_into_result +func @merge_into_result() { + tf_executor.graph { + %0:2 = tf_executor.island { + %3 = "tf.opA"() : () -> tensor<1xf32> + tf_executor.yield %3 : tensor<1xf32> + } + %1 = tf_executor.ControlTrigger {} + %2:3 = tf_executor.island(%1) { + %4 = "tf.opB"() : () -> tensor<1xf32> + tf_executor.yield %0#0, %4 : tensor<1xf32>, tensor<1xf32> + } + tf_executor.fetch + } + return +} + +// CHECK: %[[CT:[0-9]*]] = tf_executor.ControlTrigger +// CHECK-NEXT: tf_executor.island(%[[CT]]) { +// CHECK-NEXT: "tf.opA" +// CHECK-NEXT: "tf.opB" +// CHECK-NEXT: tf_executor.yield + + +// Test merging island into data result nested in a graph of another island. +// CHECK-LABEL: func @merge_into_nested_data_result +func @merge_into_nested_data_result() { + tf_executor.graph { + %0:2 = tf_executor.island { + %1 = "tf.opA"() : () -> tensor<1xf32> + tf_executor.yield %1 : tensor<1xf32> + } + %2:2 = tf_executor.island { + %3 = tf_executor.graph { + %4 = tf_executor.ControlTrigger {} + %5:2 = tf_executor.island(%4) { + %6 = "tf.opB"(%0#0) : (tensor<1xf32>) -> tensor<1xf32> + tf_executor.yield %6 : tensor<1xf32> + } + tf_executor.fetch %5#0 : tensor<1xf32> + } + tf_executor.yield %3 : tensor<1xf32> + } + tf_executor.fetch + } + return +} + +// CHECK: tf_executor.island { +// CHECK-NEXT: [[OP_A:[0-9*]]] = "tf.opA" +// CHECK-NEXT: [[INNER_GRAPH:[0-9]*]] = tf_executor.graph { +// CHECK-NEXT: [[CT:[0-9]*]] = tf_executor.ControlTrigger +// CHECK-NEXT: [[ISLAND_1:[0-9]*]]:2 = tf_executor.island(%[[CT]]) { +// CHECK-NEXT: [[OP_B:[0-9]*]] = "tf.opB"(%[[OP_A]]) +// CHECK-NEXT: tf_executor.yield %[[OP_B]] : tensor<1xf32> +// CHECK: tf_executor.fetch %[[ISLAND_1]]#0 : tensor<1xf32> +// CHECK: tf_executor.yield + + +// Test merging islands in a nested graph. 
+// CHECK-LABEL: func @merge_islands_inner_graph +func @merge_islands_inner_graph() { + tf_executor.graph { + %0:2 = tf_executor.island { + %1 = "tf.opA"() : () -> tensor<1xf32> + tf_executor.yield %1 : tensor<1xf32> + } + %2:2 = tf_executor.island { + %3 = tf_executor.graph { + %4:2 = tf_executor.island { + %5 = "tf.opB"() : () -> tensor<1xf32> + tf_executor.yield %5 : tensor<1xf32> + } + %6:2 = tf_executor.island { + %7 = "tf.opC"() : () -> tensor<1xf32> + tf_executor.yield %7 : tensor<1xf32> + } + %8:2 = tf_executor.island(%4#1) { + %9 = "tf.opD"(%6#0) : (tensor<1xf32>) -> tensor<1xf32> + tf_executor.yield %9 : tensor<1xf32> + } + tf_executor.fetch %8#0 : tensor<1xf32> + } + tf_executor.yield %3 : tensor<1xf32> + } + tf_executor.fetch + } + return +} + +// CHECK: tf_executor.island { +// CHECK-NEXT: [[OP_A:[0-9*]]] = "tf.opA" +// CHECK-NEXT: tf_executor.yield %[[OP_A]] : tensor<1xf32> +// CHECK: tf_executor.island { +// CHECK-NEXT: [[INNER_GRAPH:[0-9]*]] = tf_executor.graph { +// CHECK-NEXT: [[ISLAND_1:[0-9]*]]:2 = tf_executor.island { +// CHECK-NEXT: "tf.opB" +// CHECK-NEXT: [[OP_C:[0-9]*]] = "tf.opC" +// CHECK-NEXT: [[OP_D:[0-9]*]] = "tf.opD"(%[[OP_C]]) +// CHECK-NEXT: tf_executor.yield %[[OP_D]] : tensor<1xf32> +// CHECK: tf_executor.fetch %[[ISLAND_1]]#0 : tensor<1xf32> +// CHECK: tf_executor.yield %[[INNER_GRAPH]] : tensor<1xf32> + + +// Test merging islands with control island operands and island results only if +// they are the closest ones. +// CHECK-LABEL: func @merge_islands_closest_control +func @merge_islands_closest_control() { + tf_executor.graph { + %0 = tf_executor.island { + tf_executor.yield + } + %1 = tf_executor.ControlTrigger %0 + %2 = tf_executor.ControlTrigger {} + %3 = tf_executor.island(%0, %2) { + tf_executor.yield + } + tf_executor.fetch + } + return +} + +// CHECK: %[[ISLAND:[0-9]*]] = tf_executor.island { +// CHECK: tf_executor.ControlTrigger %[[ISLAND]] +// CHECK: %[[CT:[0-9]*]] = tf_executor.ControlTrigger +// CHECK: tf_executor.island(%[[ISLAND]], %[[CT]]) { diff --git a/tensorflow/compiler/mlir/tensorflow/tests/executor_to_control_dialect.mlir b/tensorflow/compiler/mlir/tensorflow/tests/executor_to_control_dialect.mlir new file mode 100644 index 00000000000..11b9b1a564d --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/executor_to_control_dialect.mlir @@ -0,0 +1,99 @@ +// RUN: tf-opt -tf-executor-to-control-conversion %s | FileCheck %s --dump-input=fail + +// CHECK-LABEL: func @LoopTest() { +func @LoopTest() { + tf_executor.graph { + %0:2 = tf_executor.island { + %cst = "tf.Const"() {device = "", dtype = "tfdtype$DT_INT32", name = "Const", value = dense<1> : tensor} : () -> tensor + tf_executor.yield %cst : tensor + } + %1:2 = tf_executor.Enter %0#0 frame "while/while_context" : (tensor) -> (tensor<*xi32>, !tf_executor.control) {T = "tfdtype$DT_INT32", device = "", name = "while/Enter"} + %2 = tf_executor.island { + "tf.NoOp"() {device = "", name = "cluster/pivot"} : () -> () + tf_executor.yield + } + %3:3 = tf_executor.NextIteration.Source : tensor<*xi32> {T = "tfdtype$DT_INT32", device = "", id = 0 : i64, name = "while/NextIteration"} + %4:3 = tf_executor.Merge %3#0, %1#0 : tensor<*xi32> {N = 2 : i64, T = "tfdtype$DT_INT32", device = "", name = "while/Merge"} + %5:2 = tf_executor.island(%4#2) { + %cst = "tf.Const"() {device = "", dtype = "tfdtype$DT_INT32", name = "while/Less/y", value = dense<2> : tensor} : () -> tensor + tf_executor.yield %cst : tensor + } + %6:2 = tf_executor.island { + %14 = "tf.Less"(%4#0, %5#0) {T = 
"tfdtype$DT_INT32", device = "", name = "while/Less"} : (tensor<*xi32>, tensor) -> tensor<*xi1> + tf_executor.yield %14 : tensor<*xi1> + } + %7:2 = tf_executor.LoopCond %6#0 : (tensor<*xi1>) -> (tensor, !tf_executor.control) {device = "", name = "while/LoopCond"} + %8:3 = tf_executor.Switch %4#0, %7#0 : tensor<*xi32> {T = "tfdtype$DT_INT32", _class = ["loc = @while/Merge"], device = "", name = "while/Switch"} + %9:2 = tf_executor.Exit %8#0 : tensor<*xi32> {T = "tfdtype$DT_INT32", device = "", name = "while/Exit"} + %10:2 = tf_executor.island { + %14 = "tf.Identity"(%8#1) {T = "tfdtype$DT_INT32", device = "", name = "while/Identity"} : (tensor<*xi32>) -> tensor<*xi32> + tf_executor.yield %14 : tensor<*xi32> + } + %11:2 = tf_executor.island(%10#1) { + %cst = "tf.Const"() {device = "", dtype = "tfdtype$DT_INT32", name = "while/Add/y", value = dense<3> : tensor} : () -> tensor + tf_executor.yield %cst : tensor + } + %12:2 = tf_executor.island { + %14 = "tf.Add"(%10#0, %11#0) {T = "tfdtype$DT_INT32", device = "", name = "while/Add"} : (tensor<*xi32>, tensor) -> tensor<*xi32> + tf_executor.yield %14 : tensor<*xi32> + } + %13 = tf_executor.ControlTrigger %2, %12#1, %9#1 {_tpu_replicate = "cluster", device = "", name = "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/b_sync"} + tf_executor.NextIteration.Sink [%3#1] %12#0, %13 : tensor<*xi32> {T = "tfdtype$DT_INT32", device = "", id = 0 : i64, name = "while/NextIteration"} + tf_executor.fetch + } + return +} + +// CHECK-NEXT: %[[CONST:[0-9]*]]:2 = "_tf.Const"() {device = "", dtype = "tfdtype$DT_INT32", name = "Const", value = dense<1> : tensor} : () -> (tensor, !_tf.control) +// CHECK-NEXT: %[[ENTER:[0-9]*]]:2 = "_tf.Enter"(%[[CONST]]#0) {T = "tfdtype$DT_INT32", device = "", frame_name = "while/while_context", is_constant = false, name = "while/Enter", parallel_iterations = 10 : i64} : (tensor) -> (tensor<*xi32>, !_tf.control) +// CHECK-NEXT: %[[NOOP:[0-9]*]] = "_tf.NoOp"() {device = "", name = "cluster/pivot"} : () -> !_tf.control +// CHECK-NEXT: %[[SOURCE:[0-9]*]]:2 = "_tf.NextIteration.source"() {T = "tfdtype$DT_INT32", device = "", id = 0 : i64, name = "while/NextIteration"} : () -> (tensor<*xi32>, !_tf.control) +// CHECK-NEXT: %[[MERGE:[0-9]*]]:3 = "_tf.Merge"(%[[SOURCE]]#0, %[[ENTER]]#0) {N = 2 : i64, T = "tfdtype$DT_INT32", device = "", name = "while/Merge"} : (tensor<*xi32>, tensor<*xi32>) -> (tensor<*xi32>, tensor, !_tf.control) +// CHECK-NEXT: %[[CONST_LESS:[0-9]*]]:2 = "_tf.Const"(%[[MERGE]]#2) {device = "", dtype = "tfdtype$DT_INT32", name = "while/Less/y", value = dense<2> : tensor} : (!_tf.control) -> (tensor, !_tf.control) +// CHECK-NEXT: %[[LESS:[0-9]*]]:2 = "_tf.Less"(%[[MERGE]]#0, %[[CONST_LESS]]#0) {T = "tfdtype$DT_INT32", device = "", name = "while/Less"} : (tensor<*xi32>, tensor) -> (tensor<*xi1>, !_tf.control) +// CHECK-NEXT: %[[COND:[0-9]*]]:2 = "_tf.LoopCond"(%[[LESS]]#0) {device = "", name = "while/LoopCond"} : (tensor<*xi1>) -> (tensor, !_tf.control) +// CHECK-NEXT: %[[SWITCH:[0-9]*]]:3 = "_tf.Switch"(%[[MERGE]]#0, %[[COND]]#0) {T = "tfdtype$DT_INT32", _class = ["loc = @while/Merge"], device = "", name = "while/Switch"} : (tensor<*xi32>, tensor) -> (tensor<*xi32>, tensor<*xi32>, !_tf.control) +// CHECK-NEXT: %[[EXIT:[0-9]*]]:2 = "_tf.Exit"(%[[SWITCH]]#0) {T = "tfdtype$DT_INT32", device = "", name = "while/Exit"} : (tensor<*xi32>) -> (tensor<*xi32>, !_tf.control) +// CHECK-NEXT: %[[IDENTITY:[0-9]*]]:2 = "_tf.Identity"(%[[SWITCH]]#1) {T = "tfdtype$DT_INT32", device = "", name = "while/Identity"} : (tensor<*xi32>) -> 
(tensor<*xi32>, !_tf.control) +// CHECK-NEXT: %[[CONST_ADD:[0-9]*]]:2 = "_tf.Const"(%[[IDENTITY]]#1) {device = "", dtype = "tfdtype$DT_INT32", name = "while/Add/y", value = dense<3> : tensor} : (!_tf.control) -> (tensor, !_tf.control) +// CHECK-NEXT: %[[ADD:[0-9]*]]:2 = "_tf.Add"(%[[IDENTITY]]#0, %[[CONST_ADD]]#0) {T = "tfdtype$DT_INT32", device = "", name = "while/Add"} : (tensor<*xi32>, tensor) -> (tensor<*xi32>, !_tf.control) +// CHECK-NEXT: %[[CT:[0-9]*]] = "_tf.ControlTrigger"(%[[NOOP]], %[[ADD]]#1, %[[EXIT]]#1) {_tpu_replicate = "cluster", device = "", name = "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/b_sync"} : (!_tf.control, !_tf.control, !_tf.control) -> !_tf.control +// CHECK-NEXT: %[[SINK:[0-9]*]] = "_tf.NextIteration.sink"(%[[ADD]]#0, %[[CT]]) {T = "tfdtype$DT_INT32", device = "", id = 0 : i64, name = "while/NextIteration"} : (tensor<*xi32>, !_tf.control) -> !_tf.control +// CHECK-NEXT: return + +// CHECK-LABEL: func @multiple_ops_region +func @multiple_ops_region(%arg0 : tensor<*xi32>, %arg1 : tensor) { + tf_executor.graph { + %0:2 = tf_executor.island { + // The 4 operations are independent, but the current conversion will add + // control dependencies conservatively. + %1 = "tf.Add"(%arg0, %arg1) {T = "tfdtype$DT_INT32", device = "", name = "while/Add1"} : (tensor<*xi32>, tensor) -> tensor<*xi32> + %2 = "tf.Add"(%arg0, %arg1) {T = "tfdtype$DT_INT32", device = "", name = "while/Add2"} : (tensor<*xi32>, tensor) -> tensor<*xi32> + %3 = "tf.Add"(%arg0, %arg1) {T = "tfdtype$DT_INT32", device = "", name = "while/Add3"} : (tensor<*xi32>, tensor) -> tensor<*xi32> + %4 = "tf.Add"(%arg0, %arg1) {T = "tfdtype$DT_INT32", device = "", name = "while/Add4"} : (tensor<*xi32>, tensor) -> tensor<*xi32> + tf_executor.yield %4 : tensor<*xi32> + } + tf_executor.fetch + } + return +} + +// CHECK-NEXT: %[[ADD1:[0-9]*]]:2 = "_tf.Add"(%arg0, %arg1) {T = "tfdtype$DT_INT32", device = "", name = "while/Add1"} : (tensor<*xi32>, tensor) -> (tensor<*xi32>, !_tf.control) +// CHECK-NEXT: %[[ADD2:[0-9]*]]:2 = "_tf.Add"(%arg0, %arg1, %[[ADD1]]#1) {T = "tfdtype$DT_INT32", device = "", name = "while/Add2"} : (tensor<*xi32>, tensor, !_tf.control) -> (tensor<*xi32>, !_tf.control) +// CHECK-NEXT: %[[ADD3:[0-9]*]]:2 = "_tf.Add"(%arg0, %arg1, %[[ADD2]]#1) {T = "tfdtype$DT_INT32", device = "", name = "while/Add3"} : (tensor<*xi32>, tensor, !_tf.control) -> (tensor<*xi32>, !_tf.control) +// CHECK-NEXT: %[[ADD4:[0-9]*]]:2 = "_tf.Add"(%arg0, %arg1, %[[ADD3]]#1) {T = "tfdtype$DT_INT32", device = "", name = "while/Add4"} : (tensor<*xi32>, tensor, !_tf.control) -> (tensor<*xi32>, !_tf.control) + +// CHECK-LABEL: func @switchN( +func @switchN(%arg0: tensor, %arg1: tensor<*xf32>) -> tensor<*xf32> { + %fetches = tf_executor.graph { + +// CHECK: [[S1:%.*]]:6 = "_tf._SwitchN"(%arg1, %arg0) {num_outs = 5 : i64} + %1:6 = tf_executor.SwitchN %arg1, %arg0 of 5 : tensor<*xf32> + +// CHECK: "_tf._SwitchN"(%arg1, %arg0, [[S1]]#5) {num_outs = 12 : i64} + %2:13 = tf_executor.SwitchN %arg1, %arg0 of 12 (%1#5) : tensor<*xf32> + + tf_executor.fetch %2#0 : tensor<*xf32> + } + return %fetches : tensor<*xf32> +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/functional-control-flow-to-cfg.mlir b/tensorflow/compiler/mlir/tensorflow/tests/functional-control-flow-to-cfg.mlir index 82fc0171fa6..2a0434b69e0 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/functional-control-flow-to-cfg.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/functional-control-flow-to-cfg.mlir @@ -7,7 +7,7 @@ func 
@testIf1Else(tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> func @testIf1Result(tensor, tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> { ^bb0(%arg0: tensor, %arg1: tensor<*xf32>, %arg2: tensor<*xf32>): %1 = "tf.If"(%arg0, %arg1, %arg2) { - then_branch = @testIf1Then, else_branch = @testIf1Else + then_branch = @testIf1Then, else_branch = @testIf1Else, is_stateless = false } : (tensor, tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> // CHECK: %0 = extract_element %arg0[] : tensor @@ -31,7 +31,7 @@ func @testIf3Else(tensor<*xf32>) -> (tensor<*xf32>, tensor<*xi8>, tensor<*xbf16> func @testIf3Result(tensor, tensor<*xf32>) -> (tensor<*xf32>, tensor<*xi8>, tensor<*xbf16>) { ^bb0(%arg0: tensor, %arg1: tensor<*xf32>): %1:3 = "tf.If"(%arg0, %arg1) { - then_branch = @testIf3Then, else_branch = @testIf3Else + then_branch = @testIf3Then, else_branch = @testIf3Else, is_stateless = false } : (tensor, tensor<*xf32>) -> (tensor<*xf32>, tensor<*xi8>, tensor<*xbf16>) // CHECK: %0 = extract_element %arg0[] : tensor @@ -57,7 +57,7 @@ func @testIf1Casts(tensor, tensor<2x2xf32>, tensor<*xf32>) -> tensor<2x?xf32 ^bb0(%arg0: tensor, %arg1: tensor<2x2xf32>, %arg2: tensor<*xf32>): %1 = "tf.If"(%arg0, %arg1, %arg2) { - then_branch = @testIf1Then, else_branch = @testIf1Else + then_branch = @testIf1Then, else_branch = @testIf1Else, is_stateless = false } : (tensor, tensor<2x2xf32>, tensor<*xf32>) -> tensor<2x?xf32> // CHECK: %0 = extract_element %arg0[] : tensor @@ -97,7 +97,7 @@ func @testIf1x4(tensor<4xi1>, tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> { // expected-error @+1 {{only supports zero-D bool tensors now}} %1 = "tf.If"(%arg0, %arg1, %arg2) { - then_branch = @testIf1Then, else_branch = @testIf1Else + then_branch = @testIf1Then, else_branch = @testIf1Else, is_stateless = false } : (tensor<4xi1>, tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> return %1 : tensor<*xf32> @@ -113,7 +113,7 @@ func @testWhile2Body(tensor<*xf32>, tensor<*xf32>) -> (tensor<*xf32>, tensor<*xf func @testWhile2Result(tensor<*xf32>, tensor<*xf32>) -> (tensor<*xf32>, tensor<*xf32>) { ^bb0(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>): %1:2 = "tf.While"(%arg0, %arg1) { - cond = @testWhile2Cond, body = @testWhile2Body + cond = @testWhile2Cond, body = @testWhile2Body, is_stateless = false } : (tensor<*xf32>, tensor<*xf32>) -> (tensor<*xf32>, tensor<*xf32>) // CHECK: br ^bb1(%arg0, %arg1 : tensor<*xf32>, tensor<*xf32>) @@ -138,7 +138,7 @@ func @testWhile0Body() -> () func @testWhile0Result() { ^bb0: - "tf.While"() { cond = @testWhile0Cond, body = @testWhile0Body } : () -> () + "tf.While"() { cond = @testWhile0Cond, body = @testWhile0Body, is_stateless = false } : () -> () // CHECK: br ^bb1 // CHECK: ^bb1: // CHECK: %0 = call @testWhile0Cond() : () -> tensor @@ -162,7 +162,7 @@ func @testComplexWhile1Result(tensor<*xf32>) -> (tensor<*xf32>) { ^bb1(%0: tensor<*xf32>, %1: tensor<*xf32>): %2 = addf %0, %1 : tensor<*xf32> %3:2 = "tf.While"(%0, %2) { - cond = @testWhile2Cond, body = @testWhile2Body + cond = @testWhile2Cond, body = @testWhile2Body, is_stateless = false } : (tensor<*xf32>, tensor<*xf32>) -> (tensor<*xf32>, tensor<*xf32>) // CHECK: br ^bb2(%0, %2 : tensor<*xf32>, tensor<*xf32>) @@ -194,7 +194,7 @@ func @testWhileBody(tensor<*xf32>) -> (tensor) // CHECK-LABEL: func @testWhileCasts(%arg0: tensor<1x3xf32>) func @testWhileCasts(%arg0: tensor<1x3xf32>) -> (tensor) { %0 = "tf.While"(%arg0) { - cond = @testWhileCond, body = @testWhileBody + cond = @testWhileCond, body = @testWhileBody, is_stateless = false } : (tensor<1x3xf32>) -> (tensor) // 
CHECK: %0 = tensor_cast %arg0 : tensor<1x3xf32> to tensor diff --git a/tensorflow/compiler/mlir/tensorflow/tests/functionalize-if-fail.mlir b/tensorflow/compiler/mlir/tensorflow/tests/functionalize-if-fail.mlir index 779fe9011ff..e13d5584c7f 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/functionalize-if-fail.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/functionalize-if-fail.mlir @@ -1,4 +1,4 @@ -// RUN: tf-opt %s --run-tf-graph-optimization --graph-passes=FunctionalizeControlFlowPass 2>&1 | FileCheck %s; test ${PIPESTATUS[1]} -eq 0 +// RUN: tf-opt %s --run-tf-graph-optimization --graph-passes=FunctionalizeControlFlowPass 2>&1 | FileCheck %s; test ${PIPESTATUS[0]} -ne 0 // CHECK: FunctionalizeControlFlowPass: Graph contains node with inputs predicated on incompatible predicates: {s(Cond:0,then)} and {s(Cond:0,else)} // CHECK-NEXT: for node {{[{][{]node Add[}][}]}} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/functionalize-if.mlir b/tensorflow/compiler/mlir/tensorflow/tests/functionalize-if.mlir index d3b2d835c27..0d40a4d383c 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/functionalize-if.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/functionalize-if.mlir @@ -1,4 +1,4 @@ -// RUN: tf-opt %s --run-tf-graph-optimization --graph-passes=FunctionalizeControlFlowPass | FileCheck %s +// RUN: tf-opt %s --run-tf-graph-optimization --graph-passes=FunctionalizeControlFlowPass | FileCheck %s --dump-input-on-failure func @main() { %0 = "_tf._TPUReplicate"() {computation = @foo, Tinputs = [], Tbroadcast_inputs = [], NumVariables = 0, Tguaranteed_constants = [], output_types = []} : () -> !_tf.control loc("_TPUReplicate") @@ -17,18 +17,18 @@ func @foo() { // Match the name of the cloned function with functionalized control-flow at call site // CHECK: func @main() -// CHECK-NEXT: computation = @[[FUNCTIONALIZE_FUNC:[A-Za-z0-9_]*]] +// CHECK: computation = @[[FUNCTIONALIZE_FUNC:[A-Za-z0-9_]*]] // In the newly cloned function, check that we have a _tf.If operation and capture the then and else branch. // CHECK: func @[[FUNCTIONALIZE_FUNC]] -// CHECK: "_tf.If" +// CHECK: "tf.If" // CHECK-SAME: else_branch = @[[ELSE_FUNC:[A-Za-z0-9_]*]] // CHECK-SAME: then_branch = @[[THEN_FUNC:[A-Za-z0-9_]*]] // We expect the _tf.Add in the else func and the _tf.Mul in the then func // CHECK: func @[[ELSE_FUNC]] -// CHECK: "_tf.Add" +// CHECK: "tf.Add" // CHECK: func @[[THEN_FUNC]] -// CHECK: "_tf.Mul" +// CHECK: "tf.Mul" diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graph_pruning.mlir b/tensorflow/compiler/mlir/tensorflow/tests/graph_pruning.mlir new file mode 100644 index 00000000000..bd10512ff72 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/graph_pruning.mlir @@ -0,0 +1,86 @@ +// RUN: tf-opt %s -tf-executor-graph-pruning | FileCheck %s --dump-input=fail + +// Two islands chained by data-flow contributing to the graph return are +// preserved. +// CHECK-LABEL: func @chained_islands( +func @chained_islands(%arg0 : i32) -> i32 { +// CHECK: island +// CHECK: island + %0 = tf_executor.graph { + %1:2 = tf_executor.island { + tf_executor.yield %arg0 : i32 + } + %2:2 = tf_executor.island { + tf_executor.yield %1#0 : i32 + } + tf_executor.fetch %2#0 : i32 + } + return %0 : i32 +} + +// Check that empty islands that don't contribute to the fetch are removed. 
+// CHECK-LABEL: func @empty_islands( +func @empty_islands() { +// CHECK-NOT: tf_executor.island + tf_executor.graph { + %0 = tf_executor.island { + tf_executor.yield + } + %1 = tf_executor.island { + tf_executor.yield + } + tf_executor.fetch + } + return +} + +// Check that an unused island that doesn't contribute to the fetch is removed. +// CHECK-LABEL: func @dead_island( +func @dead_island(%arg0 : i32) -> i32 { +// CHECK: tf_executor.island +// CHECK-NOT: tf_executor.island + %0 = tf_executor.graph { + %1:2 = tf_executor.island { + %a = "op.A"(%arg0) : (i32) -> i32 + %b = "op.B"(%a) : (i32) -> i32 + tf_executor.yield %b : i32 + } + %2:2 = tf_executor.island { + %a = "op.A"(%1#0) : (i32) -> i32 + tf_executor.yield %a : i32 + } + tf_executor.fetch %1#0 : i32 + } + return %0 : i32 +} + + +// Check that NextIteration.sink node isn't deleted when the source is still +// used, even though it does not have any result. +// CHECK-LABEL: func @nextiteration_sink_preserved( +func @nextiteration_sink_preserved(%arg0 : i32) -> i32 { +// CHECK: tf_executor.NextIteration.Source +// CHECK: tf_executor.NextIteration.Sink + %0 = tf_executor.graph { + %1:3 = tf_executor.NextIteration.Source : i32 + tf_executor.NextIteration.Sink[%1#1] %1#0 : i32 + tf_executor.fetch %1#0 : i32 + } + return %0 : i32 +} + +// Check that NextIteration.sink node is deleted when the source does not have +// any user other than the sink. +// CHECK-LABEL: func @nextiteration_deleted( +func @nextiteration_deleted(%arg0 : i32) -> i32 { +// CHECK-NOT: tf_executor.NextIteration.Source +// CHECK-NOT: tf_executor.NextIteration.Sink + %0 = tf_executor.graph { + %1:3 = tf_executor.NextIteration.Source : i32 + // intentionally take an output dependency on the source here. + tf_executor.NextIteration.Sink[%1#1] %1#0 : i32 + tf_executor.fetch %arg0 : i32 + } + return %0 : i32 +} + diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/add.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/add.pbtxt index ffbd84c7ee7..a2b9efff36b 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/add.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/add.pbtxt @@ -38,8 +38,14 @@ versions { # CHECK: func @main(%arg0: tensor<10xi32>, %arg1: tensor<10xi32>) -> tensor<10xi32> # CHECK: attributes {tf.entry_function = {inputs = "input0, input1", outputs = "Add"}} { -# CHECK: %0:2 = "_tf.Placeholder.input"(%arg0) {device = "", dtype = "tfdtype$DT_INT32", name = "input0", shape = "tfshape$dim {\0A size: 10\0A}\0A"} : (tensor<10xi32>) -> (tensor<10xi32>, !_tf.control) -# CHECK: %1:2 = "_tf.Placeholder.input"(%arg1) {device = "", dtype = "tfdtype$DT_INT32", name = "input1", shape = "tfshape$dim {\0A size: 10\0A}\0A"} : (tensor<10xi32>) -> (tensor<10xi32>, !_tf.control) -# CHECK: %2:2 = "_tf.Add"(%0#0, %1#0) {T = "tfdtype$DT_INT32", device = "", name = "Add"} : (tensor<10xi32>, tensor<10xi32>) -> (tensor<10xi32>, !_tf.control) -# CHECK: return %2#0 : tensor<10xi32> -# CHECK: } + +# CHECK: %[[INPUT0:[0-9]+]]:2 = tf_executor.island +# CHECK-NEXT: "tf.Placeholder.input"(%arg0) + +# CHECK: %[[INPUT1:[0-9]+]]:2 = tf_executor.island +# CHECK-NEXT: "tf.Placeholder.input"(%arg1) + +# CHECK: %[[add:[0-9]+]]:2 = tf_executor.island +# CHECK-NEXT: "tf.Add"(%[[INPUT0]]#0, %[[INPUT1]]#0) + +# CHECK: fetch %[[add]]#0 diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/arg-control-dep.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/arg-control-dep.pbtxt index da77c16ca64..74adc38d87d 100644 
--- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/arg-control-dep.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/arg-control-dep.pbtxt @@ -40,7 +40,9 @@ library { } } # Drop the control dependency on arg for the node "test" - # CHECK: %0:2 = "_tf.Const"() {device = "", dtype = "tfdtype$DT_INT32", name = "test", value = dense<0> : tensor} : () -> (tensor, !_tf.control) + # CHECK-LABEL: func @foo + # CHECK: tf_executor.island { + # CHECK-NEXT: "tf.Const"() node_def { name: "test" op: "Const" diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/const-values.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/const-values.pbtxt new file mode 100644 index 00000000000..019deaf4df4 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/const-values.pbtxt @@ -0,0 +1,90 @@ +# RUN: tf-mlir-translate -graphdef-to-mlir %s -o - | FileCheck %s + +node { + name: "x" + op: "Const" + device: "/device:CPU:0" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + dim { + size: 2 + } + dim { + size: 3 + } + } + tensor_content: "\x00\x00\x80\x3F\x00\x00\x00\x40\x00\x00\x40\x40\x00\x00\x80\x40\x00\x00\xA0\x40\x00\x00\xC0\x40" + # CHECK: value = dense<{{\[\[}}1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32> + } + } + } +} +node { + name: "y" + op: "Const" + device: "/device:CPU:0" + attr { + key: "dtype" + value { + type: DT_INT64 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT64 + tensor_shape { + dim { + size: 2 + } + dim { + size: 3 + } + } + tensor_content: "\x01\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00\x00\x00\x00\x00\x04\x00\x00\x00\x00\x00\x00\x00\x07\x00\x00\x00\x00\x00\x00\x00" + # CHECK: value = dense<{{\[\[}}1, 3, 2], [5, 4, 7]]> : tensor<2x3xi64> + } + } + } +} +node { + name: "z" + op: "Const" + device: "/device:CPU:0" + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + dim { + size: 2 + } + dim { + size: 3 + } + } + tensor_content: "\x01\x00\x00\x00\x03\x00\x00\x00\x02\x00\x00\x00\x05\x00\x00\x00\x04\x00\x00\x00\x07\x00\x00\x00" + # CHECK: value = dense<{{\[\[}}1, 3, 2], [5, 4, 7]]> : tensor<2x3xi32> + } + } + } +} + diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/empty-value-attr.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/empty-value-attr.pbtxt index 81466e6d937..93a2f602c65 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/empty-value-attr.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/empty-value-attr.pbtxt @@ -75,8 +75,8 @@ versions { } # Match partitioned call in main and capture the callee name. -# CHECK: func @main -# CHECK-NEXT: _tf.PartitionedCall +# CHECK-LABEL: func @main +# CHECK: tf.PartitionedCall # CHECK-SAME: f = @[[FUNCTION:[a-zA-Z0-9_]*]] # Verify that callee has the unit attribute tf._input_shapes. 
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/functional-if-ops.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/functional-if-ops.pbtxt new file mode 100644 index 00000000000..cbfa973fd64 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/functional-if-ops.pbtxt @@ -0,0 +1,256 @@ +# RUN: tf-mlir-translate -graphdef-to-mlir %s -tf-input-arrays=a,b -tf-input-data-types=DT_FLOAT,DT_FLOAT -tf-input-shapes=':' -tf-output-arrays=StatefulIf,StatelessIf -o - | FileCheck %s + +# Verify that TensorFlow If and StatelessIf ops are mapped to the +# composite If op in MLIR with is_stateless attribute set accordingly to +# distinguish between them. + +# CHECK-DAG: "tf.If"{{.*}} is_stateless = false, name = "StatefulIf" +# CHECK-DAG: "tf.If"{{.*}} is_stateless = true, name = "StatelessIf" + +node { + name: "tf.Less" + op: "Less" + input: "a" + input: "b" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + experimental_debug_info { + } +} +node { + name: "StatefulIf" + op: "If" + input: "tf.Less" + input: "a" + input: "b" + attr { + key: "Tcond" + value { + type: DT_BOOL + } + } + attr { + key: "Tin" + value { + list { + type: DT_FLOAT + type: DT_FLOAT + } + } + } + attr { + key: "Tout" + value { + list { + type: DT_FLOAT + } + } + } + attr { + key: "else_branch" + value { + func { + name: "cond_false" + } + } + } + attr { + key: "then_branch" + value { + func { + name: "cond_true" + } + } + } + experimental_debug_info { + } +} +node { + name: "StatelessIf" + op: "StatelessIf" + input: "tf.Less" + input: "a" + input: "b" + attr { + key: "Tcond" + value { + type: DT_BOOL + } + } + attr { + key: "Tin" + value { + list { + type: DT_FLOAT + type: DT_FLOAT + } + } + } + attr { + key: "Tout" + value { + list { + type: DT_FLOAT + } + } + } + attr { + key: "else_branch" + value { + func { + name: "cond_false" + } + } + } + attr { + key: "then_branch" + value { + func { + name: "cond_true" + } + } + } + experimental_debug_info { + } +} +node { + name: "main" + op: "_Retval" + input: "StatefulIf" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "index" + value { + i: 0 + } + } +} +node { + name: "main1" + op: "_Retval" + input: "StatelessIf" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "index" + value { + i: 1 + } + } +} +node { + name: "a" + op: "Placeholder" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + experimental_debug_info { + } +} +node { + name: "b" + op: "Placeholder" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + experimental_debug_info { + } +} +library { + function { + signature { + name: "cond_true" + input_arg { + name: "cond_true" + type: DT_FLOAT + } + input_arg { + name: "cond_true1" + type: DT_FLOAT + } + output_arg { + name: "cond_true2" + type: DT_FLOAT + } + } + node_def { + name: "tf.Add" + op: "Add" + input: "cond_true" + input: "cond_true1" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + experimental_debug_info { + original_node_names: "tf.Add" + } + } + ret { + key: "cond_true2" + value: "tf.Add:z:0" + } + } + function { + signature { + name: "cond_false" + input_arg { + name: "cond_false" + type: DT_FLOAT + } + input_arg { + name: "cond_false1" + type: DT_FLOAT + } + output_arg { + name: "cond_false2" + type: DT_FLOAT + } + } + node_def { + name: "tf.Mul" + op: "Mul" + input: "cond_false" + input: "cond_false1" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + experimental_debug_info { + original_node_names: "tf.Mul" + } + } + ret 
{ + key: "cond_false2" + value: "tf.Mul:z:0" + } + } +} +versions { + producer: 115 + min_consumer: 12 +} + diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/functional-while-ops.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/functional-while-ops.pbtxt new file mode 100644 index 00000000000..953f83a9f68 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/functional-while-ops.pbtxt @@ -0,0 +1,283 @@ +# RUN: tf-mlir-translate -graphdef-to-mlir %s -tf-input-arrays=iter,val -tf-input-data-types=DT_INT32,DT_FLOAT -tf-input-shapes=':' -tf-output-arrays=StatefulWhile:1,StatelessWhile:1 -o - | FileCheck %s + +# Verify that TensorFlow While and StatelessWhile ops are mapped to the +# composite While op in MLIR with is_stateless attribute set accordingly to +# distinguish between them. + +# CHECK-DAG: "tf.While"{{.*}} is_stateless = false, name = "StatefulWhile" +# CHECK-DAG: "tf.While"{{.*}} is_stateless = true, name = "StatelessWhile" + +node { + name: "StatefulWhile" + op: "While" + input: "iter" + input: "val" + attr { + key: "T" + value { + list { + type: DT_INT32 + type: DT_FLOAT + } + } + } + attr { + key: "body" + value { + func { + name: "body" + } + } + } + attr { + key: "cond" + value { + func { + name: "cond" + } + } + } + experimental_debug_info { + } +} +node { + name: "StatelessWhile" + op: "StatelessWhile" + input: "iter" + input: "val" + attr { + key: "T" + value { + list { + type: DT_INT32 + type: DT_FLOAT + } + } + } + attr { + key: "body" + value { + func { + name: "body" + } + } + } + attr { + key: "cond" + value { + func { + name: "cond" + } + } + } + experimental_debug_info { + } +} +node { + name: "main" + op: "_Retval" + input: "StatefulWhile:1" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "index" + value { + i: 0 + } + } +} +node { + name: "main1" + op: "_Retval" + input: "StatelessWhile:1" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "index" + value { + i: 1 + } + } +} +node { + name: "iter" + op: "Placeholder" + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + experimental_debug_info { + } +} +node { + name: "val" + op: "Placeholder" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + experimental_debug_info { + } +} +library { + function { + signature { + name: "cond" + input_arg { + name: "cond" + type: DT_INT32 + } + input_arg { + name: "cond1" + type: DT_FLOAT + } + output_arg { + name: "cond2" + type: DT_BOOL + } + } + node_def { + name: "Const" + op: "Const" + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + } + int_val: 0 + } + } + } + experimental_debug_info { + original_node_names: "Const" + } + } + node_def { + name: "tf.Greater" + op: "Greater" + input: "cond" + input: "Const:output:0" + attr { + key: "T" + value { + type: DT_INT32 + } + } + experimental_debug_info { + original_node_names: "tf.Greater" + } + } + ret { + key: "cond2" + value: "tf.Greater:z:0" + } + } + function { + signature { + name: "body" + input_arg { + name: "body" + type: DT_INT32 + } + input_arg { + name: "body1" + type: DT_FLOAT + } + output_arg { + name: "body2" + type: DT_INT32 + } + output_arg { + name: "body3" + type: DT_FLOAT + } + } + node_def { + name: "Const" + op: "Const" + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + } + int_val: 1 + } + } + } + experimental_debug_info 
{ + original_node_names: "Const" + } + } + node_def { + name: "tf.Sub" + op: "Sub" + input: "body" + input: "Const:output:0" + attr { + key: "T" + value { + type: DT_INT32 + } + } + experimental_debug_info { + original_node_names: "tf.Sub" + } + } + node_def { + name: "tf.Add" + op: "Add" + input: "body1" + input: "body1" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + experimental_debug_info { + original_node_names: "tf.Add" + } + } + ret { + key: "body2" + value: "tf.Sub:z:0" + } + ret { + key: "body3" + value: "tf.Add:z:0" + } + } +} +versions { + producer: 115 + min_consumer: 12 +} + diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-11c8752c150e5643.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-11c8752c150e5643.pbtxt deleted file mode 100644 index ae252ef83dd..00000000000 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-11c8752c150e5643.pbtxt +++ /dev/null @@ -1,99 +0,0 @@ -# RUN: tf-mlir-translate -graphdef-to-mlir %s -o - | FileCheck %s - -node { - name: "Empty/shape" - op: "Const" - device: "/job:localhost/replica:0/task:0/device:TPU:0" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - dim { - size: 2 - } - } - tensor_content: "\200\000\000\000\200\000\000\000" - } - } - } -} -node { - name: "Empty" - op: "Empty" - input: "Empty/shape" - device: "/job:localhost/replica:0/task:0/device:TPU:0" - attr { - key: "dtype" - value { - type: DT_BFLOAT16 - } - } - attr { - key: "init" - value { - b: false - } - } -} -node { - name: "Empty/_0" - op: "_Send" - input: "Empty" - device: "/job:localhost/replica:0/task:0/device:TPU:0" - attr { - key: "T" - value { - type: DT_BFLOAT16 - } - } - attr { - key: "client_terminated" - value { - b: false - } - } - attr { - key: "recv_device" - value { - s: "/job:localhost/replica:0/task:0/device:CPU:0" - } - } - attr { - key: "send_device" - value { - s: "/job:localhost/replica:0/task:0/device:TPU:0" - } - } - attr { - key: "send_device_incarnation" - value { - i: 1 - } - } - attr { - key: "tensor_name" - value { - s: "edge_5_Empty" - } - } -} -library { -} -versions { - producer: 26 -} - -# CHECK: func @main() { -# CHECK-NEXT: %0:2 = "_tf.Const"() {device = "/job:localhost/replica:0/task:0/device:TPU:0", dtype = "tfdtype$DT_INT32", name = "Empty/shape", value = opaque<"tf", "0x746674656E736F722464747970653A2044545F494E5433320A74656E736F725F7368617065207B0A202064696D207B0A2020202073697A653A20320A20207D0A7D0A74656E736F725F636F6E74656E743A20225C3230305C3030305C3030305C3030305C3230305C3030305C3030305C303030220A"> : tensor<2xi32>} : () -> (tensor<2xi32>, !_tf.control) -# CHECK-NEXT: %1:2 = "_tf.Empty"(%0#0) {device = "/job:localhost/replica:0/task:0/device:TPU:0", dtype = "tfdtype$DT_BFLOAT16", init = false, name = "Empty"} : (tensor<2xi32>) -> (tensor<128x128xbf16>, !_tf.control) -# CHECK-NEXT: %2 = "_tf._Send"(%1#0) {T = "tfdtype$DT_BFLOAT16", client_terminated = false, device = "/job:localhost/replica:0/task:0/device:TPU:0", name = "Empty/_0", recv_device = "/job:localhost/replica:0/task:0/device:CPU:0", send_device = "/job:localhost/replica:0/task:0/device:TPU:0", send_device_incarnation = 1 : i64, tensor_name = "edge_5_Empty"} : (tensor<128x128xbf16>) -> !_tf.control -# CHECK-NEXT: return -# CHECK-NEXT: } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-1383300d74bd0b22.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-1383300d74bd0b22.pbtxt 
deleted file mode 100644 index 0333193be8d..00000000000 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-1383300d74bd0b22.pbtxt +++ /dev/null @@ -1,1550 +0,0 @@ -# RUN: tf-mlir-translate -graphdef-to-mlir %s -o - | FileCheck %s - -node { - name: "placeholder_0_arg" - op: "_Arg" - device: "/device:TPU_REPLICATED_CORE:0" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "index" - value { - i: 0 - } - } -} -node { - name: "tpu/tpu/Shape" - op: "Shape" - input: "placeholder_0_arg" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "out_type" - value { - type: DT_INT32 - } - } -} -node { - name: "tpu/tpu/strided_slice/stack" - op: "Const" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - dim { - size: 1 - } - } - int_val: 0 - } - } - } -} -node { - name: "tpu/tpu/strided_slice/stack_1" - op: "Const" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - dim { - size: 1 - } - } - int_val: 1 - } - } - } -} -node { - name: "tpu/tpu/strided_slice/stack_2" - op: "Const" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - dim { - size: 1 - } - } - int_val: 1 - } - } - } -} -node { - name: "tpu/tpu/strided_slice" - op: "StridedSlice" - input: "tpu/tpu/Shape" - input: "tpu/tpu/strided_slice/stack" - input: "tpu/tpu/strided_slice/stack_1" - input: "tpu/tpu/strided_slice/stack_2" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "Index" - value { - type: DT_INT32 - } - } - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "begin_mask" - value { - i: 0 - } - } - attr { - key: "ellipsis_mask" - value { - i: 0 - } - } - attr { - key: "end_mask" - value { - i: 0 - } - } - attr { - key: "new_axis_mask" - value { - i: 0 - } - } - attr { - key: "shrink_axis_mask" - value { - i: 1 - } - } -} -node { - name: "tpu/tpu/Plus1RNNCellZeroState/ExpandDims/dim" - op: "Const" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 0 - } - } - } -} -node { - name: "tpu/tpu/Plus1RNNCellZeroState/ExpandDims" - op: "ExpandDims" - input: "tpu/tpu/strided_slice" - input: "tpu/tpu/Plus1RNNCellZeroState/ExpandDims/dim" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "Tdim" - value { - type: DT_INT32 - } - } -} -node { - name: "tpu/tpu/Plus1RNNCellZeroState/Const" - op: "Const" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - dim { - size: 1 - } - } - int_val: 1 - } - } - } -} -node { - name: "tpu/tpu/Plus1RNNCellZeroState/concat/axis" - op: "Const" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 0 - } - } - } -} -node { - name: "tpu/tpu/Plus1RNNCellZeroState/concat" - op: "ConcatV2" - input: "tpu/tpu/Plus1RNNCellZeroState/ExpandDims" - input: 
"tpu/tpu/Plus1RNNCellZeroState/Const" - input: "tpu/tpu/Plus1RNNCellZeroState/concat/axis" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "Tidx" - value { - type: DT_INT32 - } - } -} -node { - name: "tpu/tpu/Plus1RNNCellZeroState/zeros/Const" - op: "Const" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "dtype" - value { - type: DT_FLOAT - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_FLOAT - tensor_shape { - } - float_val: 0 - } - } - } -} -node { - name: "tpu/tpu/Plus1RNNCellZeroState/zeros" - op: "Fill" - input: "tpu/tpu/Plus1RNNCellZeroState/concat" - input: "tpu/tpu/Plus1RNNCellZeroState/zeros/Const" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "index_type" - value { - type: DT_INT32 - } - } -} -node { - name: "tpu/tpu/Plus1RNNCellZeroState/ExpandDims_1/dim" - op: "Const" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 0 - } - } - } -} -node { - name: "tpu/tpu/Plus1RNNCellZeroState/ExpandDims_1" - op: "ExpandDims" - input: "tpu/tpu/strided_slice" - input: "tpu/tpu/Plus1RNNCellZeroState/ExpandDims_1/dim" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "Tdim" - value { - type: DT_INT32 - } - } -} -node { - name: "tpu/tpu/Plus1RNNCellZeroState/Const_1" - op: "Const" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - dim { - size: 1 - } - } - int_val: 1 - } - } - } -} -node { - name: "tpu/tpu/sequence_length" - op: "Const" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - dim { - size: 1 - } - } - int_val: 1 - } - } - } -} -node { - name: "tpu/tpu/ExpandDims/dim" - op: "Const" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 0 - } - } - } -} -node { - name: "tpu/tpu/ExpandDims" - op: "ExpandDims" - input: "tpu/tpu/strided_slice" - input: "tpu/tpu/ExpandDims/dim" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "Tdim" - value { - type: DT_INT32 - } - } -} -node { - name: "tpu/tpu/Const" - op: "Const" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - dim { - size: 1 - } - } - int_val: 1 - } - } - } -} -node { - name: "tpu/tpu/concat/axis" - op: "Const" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 0 - } - } - } -} -node { - name: "tpu/tpu/concat" - op: "ConcatV2" - input: "tpu/tpu/ExpandDims" - input: "tpu/tpu/Const" - input: "tpu/tpu/concat/axis" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "Tidx" - value { - type: DT_INT32 - } - } -} -node { - 
name: "tpu/tpu/zeros/Const" - op: "Const" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "dtype" - value { - type: DT_FLOAT - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_FLOAT - tensor_shape { - } - float_val: 0 - } - } - } -} -node { - name: "tpu/tpu/zeros" - op: "Fill" - input: "tpu/tpu/concat" - input: "tpu/tpu/zeros/Const" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "index_type" - value { - type: DT_INT32 - } - } -} -node { - name: "tpu/tpu/Const_1" - op: "Const" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - dim { - size: 1 - } - } - int_val: 1 - } - } - } -} -node { - name: "tpu/tpu/Const_2" - op: "Const" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - dim { - size: 1 - } - } - int_val: 0 - } - } - } -} -node { - name: "tpu/tpu/Min" - op: "Min" - input: "tpu/tpu/sequence_length" - input: "tpu/tpu/Const_2" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "Tidx" - value { - type: DT_INT32 - } - } - attr { - key: "keep_dims" - value { - b: false - } - } -} -node { - name: "tpu/tpu/Const_3" - op: "Const" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - dim { - size: 1 - } - } - int_val: 0 - } - } - } -} -node { - name: "tpu/tpu/Max" - op: "Max" - input: "tpu/tpu/sequence_length" - input: "tpu/tpu/Const_3" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "Tidx" - value { - type: DT_INT32 - } - } - attr { - key: "keep_dims" - value { - b: false - } - } -} -node { - name: "tpu/tpu/LessEqual/y" - op: "Const" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 0 - } - } - } -} -node { - name: "tpu/tpu/LessEqual" - op: "LessEqual" - input: "tpu/tpu/sequence_length" - input: "tpu/tpu/LessEqual/y" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_INT32 - } - } -} -node { - name: "tpu/tpu/LessEqual_1/y" - op: "Const" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 0 - } - } - } -} -node { - name: "tpu/tpu/LessEqual_1" - op: "LessEqual" - input: "tpu/tpu/Max" - input: "tpu/tpu/LessEqual_1/y" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_INT32 - } - } -} -node { - name: "tpu/tpu/cond/Switch" - op: "Switch" - input: "tpu/tpu/LessEqual_1" - input: "tpu/tpu/LessEqual_1" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_BOOL - } - } -} -node { - name: "tpu/tpu/cond/switch_t" - op: "Identity" - input: "tpu/tpu/cond/Switch:1" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_BOOL - } - } -} -node { - name: "tpu/tpu/cond/switch_f" - op: "Identity" - input: "tpu/tpu/cond/Switch" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_BOOL - } - } -} -node { - name: 
"tpu/tpu/cond/pred_id" - op: "Identity" - input: "tpu/tpu/LessEqual_1" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_BOOL - } - } -} -node { - name: "tpu/tpu/cond/Switch_1" - op: "Switch" - input: "tpu/tpu/zeros" - input: "tpu/tpu/cond/pred_id" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@tpu/tpu/zeros" - } - } - } -} -node { - name: "tpu/tpu/cond/Switch_2" - op: "Switch" - input: "tpu/tpu/Plus1RNNCellZeroState/zeros" - input: "tpu/tpu/cond/pred_id" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@tpu/tpu/Plus1RNNCellZeroState/zeros" - } - } - } -} -node { - name: "tpu/tpu/cond/add/y" - op: "Const" - input: "^tpu/tpu/cond/switch_f" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "dtype" - value { - type: DT_FLOAT - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_FLOAT - tensor_shape { - } - float_val: 1 - } - } - } -} -node { - name: "tpu/tpu/cond/add/Switch" - op: "Switch" - input: "placeholder_0_arg" - input: "tpu/tpu/cond/pred_id" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@Placeholder" - } - } - } -} -node { - name: "tpu/tpu/cond/add" - op: "Add" - input: "tpu/tpu/cond/add/Switch" - input: "tpu/tpu/cond/add/y" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "tpu/tpu/cond/add_1/y" - op: "Const" - input: "^tpu/tpu/cond/switch_f" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "dtype" - value { - type: DT_FLOAT - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_FLOAT - tensor_shape { - } - float_val: 1 - } - } - } -} -node { - name: "tpu/tpu/cond/add_1/Switch" - op: "Switch" - input: "tpu/tpu/Plus1RNNCellZeroState/zeros" - input: "tpu/tpu/cond/pred_id" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@tpu/tpu/Plus1RNNCellZeroState/zeros" - } - } - } -} -node { - name: "tpu/tpu/cond/add_1" - op: "Add" - input: "tpu/tpu/cond/add_1/Switch" - input: "tpu/tpu/cond/add_1/y" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "tpu/tpu/cond/Greater/y" - op: "Const" - input: "^tpu/tpu/cond/switch_f" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 0 - } - } - } -} -node { - name: "tpu/tpu/cond/Greater/Switch" - op: "Switch" - input: "tpu/tpu/Min" - input: "tpu/tpu/cond/pred_id" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_class" - value { - list { - s: "loc:@tpu/tpu/Min" - } - } - } -} -node { - name: "tpu/tpu/cond/Greater" - op: "Greater" - input: "tpu/tpu/cond/Greater/Switch" - input: "tpu/tpu/cond/Greater/y" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_INT32 - } - } -} -node { - name: "tpu/tpu/cond/cond/Switch" - op: "Switch" - input: "tpu/tpu/cond/Greater" - input: "tpu/tpu/cond/Greater" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_BOOL - } - } -} -node { - name: "tpu/tpu/cond/cond/switch_t" - op: 
"Identity" - input: "tpu/tpu/cond/cond/Switch:1" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_BOOL - } - } -} -node { - name: "tpu/tpu/cond/cond/switch_f" - op: "Identity" - input: "tpu/tpu/cond/cond/Switch" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_BOOL - } - } -} -node { - name: "tpu/tpu/cond/cond/pred_id" - op: "Identity" - input: "tpu/tpu/cond/Greater" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_BOOL - } - } -} -node { - name: "tpu/tpu/cond/cond/Switch_1" - op: "Switch" - input: "tpu/tpu/cond/add" - input: "tpu/tpu/cond/cond/pred_id" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@tpu/tpu/cond/add" - } - } - } -} -node { - name: "tpu/tpu/cond/cond/Switch_2" - op: "Switch" - input: "tpu/tpu/cond/add_1" - input: "tpu/tpu/cond/cond/pred_id" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@tpu/tpu/cond/add_1" - } - } - } -} -node { - name: "tpu/tpu/cond/cond/Select/Switch" - op: "Switch" - input: "tpu/tpu/LessEqual" - input: "tpu/tpu/cond/pred_id" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_BOOL - } - } - attr { - key: "_class" - value { - list { - s: "loc:@tpu/tpu/LessEqual" - } - } - } -} -node { - name: "tpu/tpu/cond/cond/Select/Switch_1" - op: "Switch" - input: "tpu/tpu/cond/cond/Select/Switch" - input: "tpu/tpu/cond/cond/pred_id" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_BOOL - } - } - attr { - key: "_class" - value { - list { - s: "loc:@tpu/tpu/LessEqual" - } - } - } -} -node { - name: "tpu/tpu/cond/cond/Select/Switch_2" - op: "Switch" - input: "tpu/tpu/zeros" - input: "tpu/tpu/cond/pred_id" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@tpu/tpu/zeros" - } - } - } -} -node { - name: "tpu/tpu/cond/cond/Select/Switch_3" - op: "Switch" - input: "tpu/tpu/cond/cond/Select/Switch_2" - input: "tpu/tpu/cond/cond/pred_id" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@tpu/tpu/zeros" - } - } - } -} -node { - name: "tpu/tpu/cond/cond/Select/Switch_4" - op: "Switch" - input: "tpu/tpu/cond/add" - input: "tpu/tpu/cond/cond/pred_id" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@tpu/tpu/cond/add" - } - } - } -} -node { - name: "tpu/tpu/cond/cond/Select" - op: "Select" - input: "tpu/tpu/cond/cond/Select/Switch_1" - input: "tpu/tpu/cond/cond/Select/Switch_3" - input: "tpu/tpu/cond/cond/Select/Switch_4" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@tpu/tpu/cond/add" - } - } - } -} -node { - name: "tpu/tpu/cond/cond/Select_1/Switch" - op: "Switch" - input: "tpu/tpu/cond/add_1/Switch" - input: "tpu/tpu/cond/cond/pred_id" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@tpu/tpu/Plus1RNNCellZeroState/zeros" - } - } - } -} -node { - name: "tpu/tpu/cond/cond/Select_1/Switch_1" - op: "Switch" - input: 
"tpu/tpu/cond/add_1" - input: "tpu/tpu/cond/cond/pred_id" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@tpu/tpu/cond/add_1" - } - } - } -} -node { - name: "tpu/tpu/cond/cond/Select_1" - op: "Select" - input: "tpu/tpu/cond/cond/Select/Switch_1" - input: "tpu/tpu/cond/cond/Select_1/Switch" - input: "tpu/tpu/cond/cond/Select_1/Switch_1" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@tpu/tpu/cond/add_1" - } - } - } -} -node { - name: "tpu/tpu/cond/cond/Merge" - op: "Merge" - input: "tpu/tpu/cond/cond/Select" - input: "tpu/tpu/cond/cond/Switch_1:1" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "tpu/tpu/cond/cond/Merge_1" - op: "Merge" - input: "tpu/tpu/cond/cond/Select_1" - input: "tpu/tpu/cond/cond/Switch_2:1" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "tpu/tpu/cond/Merge" - op: "Merge" - input: "tpu/tpu/cond/cond/Merge" - input: "tpu/tpu/cond/Switch_1:1" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "tpu/tpu/cond/Merge_1" - op: "Merge" - input: "tpu/tpu/cond/cond/Merge_1" - input: "tpu/tpu/cond/Switch_2:1" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "tpu/NoOp" - op: "NoOp" - device: "/device:TPU_REPLICATED_CORE" -} -node { - name: "tpu/packed" - op: "Pack" - input: "tpu/tpu/cond/Merge" - device: "/device:TPU_REPLICATED_CORE:0" - attr { - key: "N" - value { - i: 1 - } - } - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "axis" - value { - i: 0 - } - } -} -node { - name: "tpu/Identity" - op: "Identity" - input: "tpu/packed" - device: "/device:TPU_REPLICATED_CORE:0" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "tpu/Identity_1" - op: "Identity" - input: "tpu/tpu/cond/Merge_1" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "tpu_identity_0_retval_RetVal" - op: "_Retval" - input: "tpu/Identity" - device: "/device:TPU_REPLICATED_CORE:0" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "index" - value { - i: 0 - } - } -} -node { - name: "tpu_identity_1_0_retval_RetVal" - op: "_Retval" - input: "tpu/Identity_1" - device: "/device:TPU_REPLICATED_CORE:0" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "index" - value { - i: 1 - } - } -} -library { -} -versions { - producer: 26 -} - -# CHECK: func @main() { -# CHECK-NEXT: %0:2 = "_tf._Arg"() {T = "tfdtype$DT_FLOAT", device = "/device:TPU_REPLICATED_CORE:0", index = 0 : i64, name = "placeholder_0_arg"} : () -> (tensor<*xf32>, !_tf.control) -# CHECK-NEXT: %1:2 = "_tf.Shape"(%0#0) {T = "tfdtype$DT_FLOAT", device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/Shape", out_type = "tfdtype$DT_INT32"} : (tensor<*xf32>) -> (tensor, !_tf.control) -# CHECK-NEXT: %2 = "_tf.NoOp"() {device = "/device:TPU_REPLICATED_CORE", name = "tpu/NoOp"} : () -> !_tf.control -# CHECK-NEXT: %3:2 = "_tf.Const"() {device = "/device:TPU_REPLICATED_CORE", dtype = 
"tfdtype$DT_INT32", name = "tpu/tpu/Const", value = dense<1> : tensor<1xi32>} : () -> (tensor<1xi32>, !_tf.control) -# CHECK-NEXT: %4:2 = "_tf.Const"() {device = "/device:TPU_REPLICATED_CORE", dtype = "tfdtype$DT_INT32", name = "tpu/tpu/Const_1", value = dense<1> : tensor<1xi32>} : () -> (tensor<1xi32>, !_tf.control) -# CHECK-NEXT: %5:2 = "_tf.Const"() {device = "/device:TPU_REPLICATED_CORE", dtype = "tfdtype$DT_INT32", name = "tpu/tpu/Const_2", value = dense<0> : tensor<1xi32>} : () -> (tensor<1xi32>, !_tf.control) -# CHECK-NEXT: %6:2 = "_tf.Const"() {device = "/device:TPU_REPLICATED_CORE", dtype = "tfdtype$DT_INT32", name = "tpu/tpu/Const_3", value = dense<0> : tensor<1xi32>} : () -> (tensor<1xi32>, !_tf.control) -# CHECK-NEXT: %7:2 = "_tf.Const"() {device = "/device:TPU_REPLICATED_CORE", dtype = "tfdtype$DT_INT32", name = "tpu/tpu/ExpandDims/dim", value = dense<0> : tensor} : () -> (tensor, !_tf.control) -# CHECK-NEXT: %8:2 = "_tf.Const"() {device = "/device:TPU_REPLICATED_CORE", dtype = "tfdtype$DT_INT32", name = "tpu/tpu/LessEqual/y", value = dense<0> : tensor} : () -> (tensor, !_tf.control) -# CHECK-NEXT: %9:2 = "_tf.Const"() {device = "/device:TPU_REPLICATED_CORE", dtype = "tfdtype$DT_INT32", name = "tpu/tpu/LessEqual_1/y", value = dense<0> : tensor} : () -> (tensor, !_tf.control) -# CHECK-NEXT: %10:2 = "_tf.Const"() {device = "/device:TPU_REPLICATED_CORE", dtype = "tfdtype$DT_INT32", name = "tpu/tpu/Plus1RNNCellZeroState/Const", value = dense<1> : tensor<1xi32>} : () -> (tensor<1xi32>, !_tf.control) -# CHECK-NEXT: %11:2 = "_tf.Const"() {device = "/device:TPU_REPLICATED_CORE", dtype = "tfdtype$DT_INT32", name = "tpu/tpu/Plus1RNNCellZeroState/Const_1", value = dense<1> : tensor<1xi32>} : () -> (tensor<1xi32>, !_tf.control) -# CHECK-NEXT: %12:2 = "_tf.Const"() {device = "/device:TPU_REPLICATED_CORE", dtype = "tfdtype$DT_INT32", name = "tpu/tpu/Plus1RNNCellZeroState/ExpandDims/dim", value = dense<0> : tensor} : () -> (tensor, !_tf.control) -# CHECK-NEXT: %13:2 = "_tf.Const"() {device = "/device:TPU_REPLICATED_CORE", dtype = "tfdtype$DT_INT32", name = "tpu/tpu/Plus1RNNCellZeroState/ExpandDims_1/dim", value = dense<0> : tensor} : () -> (tensor, !_tf.control) -# CHECK-NEXT: %14:2 = "_tf.Const"() {device = "/device:TPU_REPLICATED_CORE", dtype = "tfdtype$DT_INT32", name = "tpu/tpu/Plus1RNNCellZeroState/concat/axis", value = dense<0> : tensor} : () -> (tensor, !_tf.control) -# CHECK-NEXT: %15:2 = "_tf.Const"() {device = "/device:TPU_REPLICATED_CORE", dtype = "tfdtype$DT_FLOAT", name = "tpu/tpu/Plus1RNNCellZeroState/zeros/Const", value = dense<0.000000e+00> : tensor} : () -> (tensor, !_tf.control) -# CHECK-NEXT: %16:2 = "_tf.Const"() {device = "/device:TPU_REPLICATED_CORE", dtype = "tfdtype$DT_INT32", name = "tpu/tpu/concat/axis", value = dense<0> : tensor} : () -> (tensor, !_tf.control) -# CHECK-NEXT: %17:2 = "_tf.Const"() {device = "/device:TPU_REPLICATED_CORE", dtype = "tfdtype$DT_INT32", name = "tpu/tpu/sequence_length", value = dense<1> : tensor<1xi32>} : () -> (tensor<1xi32>, !_tf.control) -# CHECK-NEXT: %18:2 = "_tf.LessEqual"(%17#0, %8#0) {T = "tfdtype$DT_INT32", device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/LessEqual"} : (tensor<1xi32>, tensor) -> (tensor<1xi1>, !_tf.control) -# CHECK-NEXT: %19:2 = "_tf.Max"(%17#0, %6#0) {T = "tfdtype$DT_INT32", Tidx = "tfdtype$DT_INT32", device = "/device:TPU_REPLICATED_CORE", keep_dims = false, name = "tpu/tpu/Max"} : (tensor<1xi32>, tensor<1xi32>) -> (tensor, !_tf.control) -# CHECK-NEXT: %20:2 = "_tf.LessEqual"(%19#0, %9#0) {T = 
"tfdtype$DT_INT32", device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/LessEqual_1"} : (tensor, tensor) -> (tensor, !_tf.control) -# CHECK-NEXT: %21:3 = "_tf.Switch"(%20#0, %20#0) {T = "tfdtype$DT_BOOL", device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/Switch"} : (tensor, tensor) -> (tensor, tensor, !_tf.control) -# CHECK-NEXT: %22:2 = "_tf.Identity"(%21#0) {T = "tfdtype$DT_BOOL", device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/switch_f"} : (tensor) -> (tensor, !_tf.control) -# CHECK-NEXT: %23:2 = "_tf.Const"(%22#1) {device = "/device:TPU_REPLICATED_CORE", dtype = "tfdtype$DT_INT32", name = "tpu/tpu/cond/Greater/y", value = dense<0> : tensor} : (!_tf.control) -> (tensor, !_tf.control) -# CHECK-NEXT: %24:2 = "_tf.Const"(%22#1) {device = "/device:TPU_REPLICATED_CORE", dtype = "tfdtype$DT_FLOAT", name = "tpu/tpu/cond/add/y", value = dense<1.000000e+00> : tensor} : (!_tf.control) -> (tensor, !_tf.control) -# CHECK-NEXT: %25:2 = "_tf.Const"(%22#1) {device = "/device:TPU_REPLICATED_CORE", dtype = "tfdtype$DT_FLOAT", name = "tpu/tpu/cond/add_1/y", value = dense<1.000000e+00> : tensor} : (!_tf.control) -> (tensor, !_tf.control) -# CHECK-NEXT: %26:2 = "_tf.Identity"(%21#1) {T = "tfdtype$DT_BOOL", device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/switch_t"} : (tensor) -> (tensor, !_tf.control) -# CHECK-NEXT: %27:2 = "_tf.Identity"(%20#0) {T = "tfdtype$DT_BOOL", device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/pred_id"} : (tensor) -> (tensor, !_tf.control) -# CHECK-NEXT: %28:3 = "_tf.Switch"(%0#0, %27#0) {T = "tfdtype$DT_FLOAT", _class = ["loc:@Placeholder"], device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/add/Switch"} : (tensor<*xf32>, tensor) -> (tensor<*xf32>, tensor<*xf32>, !_tf.control) -# CHECK-NEXT: %29:2 = "_tf.Add"(%28#0, %24#0) {T = "tfdtype$DT_FLOAT", device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/add"} : (tensor<*xf32>, tensor) -> (tensor<*xf32>, !_tf.control) -# CHECK-NEXT: %30:3 = "_tf.Switch"(%18#0, %27#0) {T = "tfdtype$DT_BOOL", _class = ["loc:@tpu/tpu/LessEqual"], device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/cond/Select/Switch"} : (tensor<1xi1>, tensor) -> (tensor<1xi1>, tensor<1xi1>, !_tf.control) -# CHECK-NEXT: %31:2 = "_tf.Min"(%17#0, %5#0) {T = "tfdtype$DT_INT32", Tidx = "tfdtype$DT_INT32", device = "/device:TPU_REPLICATED_CORE", keep_dims = false, name = "tpu/tpu/Min"} : (tensor<1xi32>, tensor<1xi32>) -> (tensor, !_tf.control) -# CHECK-NEXT: %32:3 = "_tf.Switch"(%31#0, %27#0) {T = "tfdtype$DT_INT32", _class = ["loc:@tpu/tpu/Min"], device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/Greater/Switch"} : (tensor, tensor) -> (tensor, tensor, !_tf.control) -# CHECK-NEXT: %33:2 = "_tf.Greater"(%32#0, %23#0) {T = "tfdtype$DT_INT32", device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/Greater"} : (tensor, tensor) -> (tensor, !_tf.control) -# CHECK-NEXT: %34:3 = "_tf.Switch"(%33#0, %33#0) {T = "tfdtype$DT_BOOL", device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/cond/Switch"} : (tensor, tensor) -> (tensor, tensor, !_tf.control) -# CHECK-NEXT: %35:2 = "_tf.Identity"(%34#0) {T = "tfdtype$DT_BOOL", device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/cond/switch_f"} : (tensor) -> (tensor, !_tf.control) -# CHECK-NEXT: %36:2 = "_tf.Identity"(%34#1) {T = "tfdtype$DT_BOOL", device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/cond/switch_t"} : (tensor) -> (tensor, !_tf.control) -# CHECK-NEXT: %37:2 = "_tf.Identity"(%33#0) {T = "tfdtype$DT_BOOL", device = 
"/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/cond/pred_id"} : (tensor) -> (tensor, !_tf.control) -# CHECK-NEXT: %38:3 = "_tf.Switch"(%30#0, %37#0) {T = "tfdtype$DT_BOOL", _class = ["loc:@tpu/tpu/LessEqual"], device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/cond/Select/Switch_1"} : (tensor<1xi1>, tensor) -> (tensor<1xi1>, tensor<1xi1>, !_tf.control) -# CHECK-NEXT: %39:3 = "_tf.Switch"(%29#0, %37#0) {T = "tfdtype$DT_FLOAT", _class = ["loc:@tpu/tpu/cond/add"], device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/cond/Select/Switch_4"} : (tensor<*xf32>, tensor) -> (tensor<*xf32>, tensor<*xf32>, !_tf.control) -# CHECK-NEXT: %40:3 = "_tf.Switch"(%29#0, %37#0) {T = "tfdtype$DT_FLOAT", _class = ["loc:@tpu/tpu/cond/add"], device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/cond/Switch_1"} : (tensor<*xf32>, tensor) -> (tensor<*xf32>, tensor<*xf32>, !_tf.control) -# CHECK-NEXT: %41:2 = "_tf.Const"() {device = "/device:TPU_REPLICATED_CORE", dtype = "tfdtype$DT_INT32", name = "tpu/tpu/strided_slice/stack", value = dense<0> : tensor<1xi32>} : () -> (tensor<1xi32>, !_tf.control) -# CHECK-NEXT: %42:2 = "_tf.Const"() {device = "/device:TPU_REPLICATED_CORE", dtype = "tfdtype$DT_INT32", name = "tpu/tpu/strided_slice/stack_1", value = dense<1> : tensor<1xi32>} : () -> (tensor<1xi32>, !_tf.control) -# CHECK-NEXT: %43:2 = "_tf.Const"() {device = "/device:TPU_REPLICATED_CORE", dtype = "tfdtype$DT_INT32", name = "tpu/tpu/strided_slice/stack_2", value = dense<1> : tensor<1xi32>} : () -> (tensor<1xi32>, !_tf.control) -# CHECK-NEXT: %44:2 = "_tf.StridedSlice"(%1#0, %41#0, %42#0, %43#0) {Index = "tfdtype$DT_INT32", T = "tfdtype$DT_INT32", begin_mask = 0 : i64, device = "/device:TPU_REPLICATED_CORE", ellipsis_mask = 0 : i64, end_mask = 0 : i64, name = "tpu/tpu/strided_slice", new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> (tensor, !_tf.control) -# CHECK-NEXT: %45:2 = "_tf.ExpandDims"(%44#0, %7#0) {T = "tfdtype$DT_INT32", Tdim = "tfdtype$DT_INT32", device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/ExpandDims"} : (tensor, tensor) -> (tensor<1xi32>, !_tf.control) -# CHECK-NEXT: %46:2 = "_tf.ConcatV2"(%45#0, %3#0, %16#0) {N = 2 : i64, T = "tfdtype$DT_INT32", Tidx = "tfdtype$DT_INT32", device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/concat"} : (tensor<1xi32>, tensor<1xi32>, tensor) -> (tensor<2xi32>, !_tf.control) -# CHECK-NEXT: %47:2 = "_tf.ExpandDims"(%44#0, %12#0) {T = "tfdtype$DT_INT32", Tdim = "tfdtype$DT_INT32", device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/Plus1RNNCellZeroState/ExpandDims"} : (tensor, tensor) -> (tensor<1xi32>, !_tf.control) -# CHECK-NEXT: %48:2 = "_tf.ConcatV2"(%47#0, %10#0, %14#0) {N = 2 : i64, T = "tfdtype$DT_INT32", Tidx = "tfdtype$DT_INT32", device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/Plus1RNNCellZeroState/concat"} : (tensor<1xi32>, tensor<1xi32>, tensor) -> (tensor<2xi32>, !_tf.control) -# CHECK-NEXT: %49:2 = "_tf.Fill"(%48#0, %15#0) {T = "tfdtype$DT_FLOAT", device = "/device:TPU_REPLICATED_CORE", index_type = "tfdtype$DT_INT32", name = "tpu/tpu/Plus1RNNCellZeroState/zeros"} : (tensor<2xi32>, tensor) -> (tensor, !_tf.control) -# CHECK-NEXT: %50:3 = "_tf.Switch"(%49#0, %27#0) {T = "tfdtype$DT_FLOAT", _class = ["loc:@tpu/tpu/Plus1RNNCellZeroState/zeros"], device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/Switch_2"} : (tensor, tensor) -> (tensor, tensor, !_tf.control) -# CHECK-NEXT: %51:3 = "_tf.Switch"(%49#0, %27#0) {T = "tfdtype$DT_FLOAT", _class = 
["loc:@tpu/tpu/Plus1RNNCellZeroState/zeros"], device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/add_1/Switch"} : (tensor, tensor) -> (tensor, tensor, !_tf.control) -# CHECK-NEXT: %52:2 = "_tf.Add"(%51#0, %25#0) {T = "tfdtype$DT_FLOAT", device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/add_1"} : (tensor, tensor) -> (tensor, !_tf.control) -# CHECK-NEXT: %53:3 = "_tf.Switch"(%52#0, %37#0) {T = "tfdtype$DT_FLOAT", _class = ["loc:@tpu/tpu/cond/add_1"], device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/cond/Select_1/Switch_1"} : (tensor, tensor) -> (tensor, tensor, !_tf.control) -# CHECK-NEXT: %54:3 = "_tf.Switch"(%52#0, %37#0) {T = "tfdtype$DT_FLOAT", _class = ["loc:@tpu/tpu/cond/add_1"], device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/cond/Switch_2"} : (tensor, tensor) -> (tensor, tensor, !_tf.control) -# CHECK-NEXT: %55:3 = "_tf.Switch"(%51#0, %37#0) {T = "tfdtype$DT_FLOAT", _class = ["loc:@tpu/tpu/Plus1RNNCellZeroState/zeros"], device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/cond/Select_1/Switch"} : (tensor, tensor) -> (tensor, tensor, !_tf.control) -# CHECK-NEXT: %56:2 = "_tf.Select"(%38#0, %55#0, %53#0) {T = "tfdtype$DT_FLOAT", _class = ["loc:@tpu/tpu/cond/add_1"], device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/cond/Select_1"} : (tensor<1xi1>, tensor, tensor) -> (tensor, !_tf.control) -# CHECK-NEXT: %57:3 = "_tf.Merge"(%56#0, %54#1) {N = 2 : i64, T = "tfdtype$DT_FLOAT", device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/cond/Merge_1"} : (tensor, tensor) -> (tensor, tensor, !_tf.control) -# CHECK-NEXT: %58:3 = "_tf.Merge"(%57#0, %50#1) {N = 2 : i64, T = "tfdtype$DT_FLOAT", device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/Merge_1"} : (tensor, tensor) -> (tensor, tensor, !_tf.control) -# CHECK-NEXT: %59:2 = "_tf.Identity"(%58#0) {T = "tfdtype$DT_FLOAT", device = "/device:TPU_REPLICATED_CORE", name = "tpu/Identity_1"} : (tensor) -> (tensor, !_tf.control) -# CHECK-NEXT: %60 = "_tf._Retval"(%59#0) {T = "tfdtype$DT_FLOAT", device = "/device:TPU_REPLICATED_CORE:0", index = 1 : i64, name = "tpu_identity_1_0_retval_RetVal"} : (tensor) -> !_tf.control -# CHECK-NEXT: %61:2 = "_tf.ExpandDims"(%44#0, %13#0) {T = "tfdtype$DT_INT32", Tdim = "tfdtype$DT_INT32", device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/Plus1RNNCellZeroState/ExpandDims_1"} : (tensor, tensor) -> (tensor<1xi32>, !_tf.control) -# CHECK-NEXT: %62:2 = "_tf.Const"() {device = "/device:TPU_REPLICATED_CORE", dtype = "tfdtype$DT_FLOAT", name = "tpu/tpu/zeros/Const", value = dense<0.000000e+00> : tensor} : () -> (tensor, !_tf.control) -# CHECK-NEXT: %63:2 = "_tf.Fill"(%46#0, %62#0) {T = "tfdtype$DT_FLOAT", device = "/device:TPU_REPLICATED_CORE", index_type = "tfdtype$DT_INT32", name = "tpu/tpu/zeros"} : (tensor<2xi32>, tensor) -> (tensor, !_tf.control) -# CHECK-NEXT: %64:3 = "_tf.Switch"(%63#0, %27#0) {T = "tfdtype$DT_FLOAT", _class = ["loc:@tpu/tpu/zeros"], device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/Switch_1"} : (tensor, tensor) -> (tensor, tensor, !_tf.control) -# CHECK-NEXT: %65:3 = "_tf.Switch"(%63#0, %27#0) {T = "tfdtype$DT_FLOAT", _class = ["loc:@tpu/tpu/zeros"], device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/cond/Select/Switch_2"} : (tensor, tensor) -> (tensor, tensor, !_tf.control) -# CHECK-NEXT: %66:3 = "_tf.Switch"(%65#0, %37#0) {T = "tfdtype$DT_FLOAT", _class = ["loc:@tpu/tpu/zeros"], device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/cond/Select/Switch_3"} : (tensor, tensor) -> (tensor, 
tensor, !_tf.control) -# CHECK-NEXT: %67:2 = "_tf.Select"(%38#0, %66#0, %39#0) {T = "tfdtype$DT_FLOAT", _class = ["loc:@tpu/tpu/cond/add"], device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/cond/Select"} : (tensor<1xi1>, tensor, tensor<*xf32>) -> (tensor, !_tf.control) -# CHECK-NEXT: %68:3 = "_tf.Merge"(%67#0, %40#1) {N = 2 : i64, T = "tfdtype$DT_FLOAT", device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/cond/Merge"} : (tensor, tensor<*xf32>) -> (tensor<*xf32>, tensor, !_tf.control) -# CHECK-NEXT: %69:3 = "_tf.Merge"(%68#0, %64#1) {N = 2 : i64, T = "tfdtype$DT_FLOAT", device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/Merge"} : (tensor<*xf32>, tensor) -> (tensor<*xf32>, tensor, !_tf.control) -# CHECK-NEXT: %70:2 = "_tf.Pack"(%69#0) {N = 1 : i64, T = "tfdtype$DT_FLOAT", axis = 0 : i64, device = "/device:TPU_REPLICATED_CORE:0", name = "tpu/packed"} : (tensor<*xf32>) -> (tensor<*xf32>, !_tf.control) -# CHECK-NEXT: %71:2 = "_tf.Identity"(%70#0) {T = "tfdtype$DT_FLOAT", device = "/device:TPU_REPLICATED_CORE:0", name = "tpu/Identity"} : (tensor<*xf32>) -> (tensor<*xf32>, !_tf.control) -# CHECK-NEXT: %72 = "_tf._Retval"(%71#0) {T = "tfdtype$DT_FLOAT", device = "/device:TPU_REPLICATED_CORE:0", index = 0 : i64, name = "tpu_identity_0_retval_RetVal"} : (tensor<*xf32>) -> !_tf.control -# CHECK-NEXT: return -# CHECK-NEXT: } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-as-function.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-as-function.pbtxt new file mode 100644 index 00000000000..1bf5037a75f --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-as-function.pbtxt @@ -0,0 +1,254 @@ +# RUN: tf-mlir-translate -graphdef-to-mlir %s -tf-graph-as-function -o - | FileCheck %s --dump-input=fail + +# Verify main graph was converted to a function, args/rets are mapped correctly, +# and ops in the main graph are retained. In addition, check if subsequent +# functions are converted. 
+ +# CHECK: func @main(%arg0: tensor<*x!tf.resource>, %arg1: tensor<*x!tf.resource>) -> (tensor, tensor) +# CHECK: attributes {tf.entry_function = {inputs = "args_0, args_1", outputs = "rets_0_RetVal, rets_1_RetVal"}} { +# CHECK: %[[ISLAND_0:[0-9]]]:2 = tf_executor.island { +# CHECK: "tf.Const" +# CHECK: %[[ISLAND_1:[0-9]]]:2 = tf_executor.island { +# CHECK: "tf.Identity"(%[[ISLAND_0]]#0) +# CHECK: %[[ISLAND_2:[0-9]]]:2 = tf_executor.island { +# CHECK: "tf.StatefulPartitionedCall" +# CHECK-SAME: f = @[[FUNC:[a-z0-9]*]] +# CHECK: tf_executor.fetch %[[ISLAND_1]]#0, %[[ISLAND_2]]#0 : tensor, tensor +# CHECK: func @[[FUNC]](%arg0: tensor<*xf32>, %arg1: tensor<*x!tf.resource>) -> tensor<*xf32> + +node { + name: "args_0" + op: "_Arg" + attr { + key: "T" + value { + type: DT_RESOURCE + } + } + attr { + key: "index" + value { + i: 0 + } + } +} +node { + name: "args_1" + op: "_Arg" + attr { + key: "T" + value { + type: DT_RESOURCE + } + } + attr { + key: "_handle_dtypes" + value { + list { + type: DT_FLOAT + } + } + } + attr { + key: "_handle_shapes" + value { + list { + shape { + dim { + size: 3 + } + dim { + size: 3 + } + dim { + size: 1 + } + dim { + size: 32 + } + } + } + } + } + attr { + key: "index" + value { + i: 1 + } + } +} +node { + name: "const" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + } + float_val: 0.0 + } + } + } +} +node { + name: "statefulpartitionedcall" + op: "StatefulPartitionedCall" + input: "const" + input: "args_1" + attr { + key: "Tin" + value { + list { + type: DT_FLOAT + type: DT_RESOURCE + } + } + } + attr { + key: "Tout" + value { + list { + type: DT_FLOAT + } + } + } + attr { + key: "_gradient_op_type" + value { + s: "PartitionedCall-1205" + } + } + attr { + key: "config" + value { + s: "" + } + } + attr { + key: "config_proto" + value { + s: "\n\007\n\003GPU\020\000\n\007\n\003CPU\020\0012\002J\0008\001" + } + } + attr { + key: "executor_type" + value { + s: "" + } + } + attr { + key: "f" + value { + func { + name: "function" + } + } + } +} +node { + name: "identity" + op: "Identity" + input: "const" + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +node { + name: "rets_0" + op: "_Retval" + input: "identity" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "index" + value { + i: 0 + } + } +} +node { + name: "rets_1" + op: "_Retval" + input: "statefulpartitionedcall" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "index" + value { + i: 1 + } + } +} +library { + function { + signature { + name: "function" + input_arg { + name: "inputs" + type: DT_FLOAT + } + input_arg { + name: "statefulpartitionedcall_args_1" + type: DT_RESOURCE + } + output_arg { + name: "identity" + type: DT_FLOAT + } + is_stateful: true + } + node_def { + name: "Identity" + op: "Identity" + input: "inputs" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + } + ret { + key: "identity" + value: "Identity:output:0" + } + arg_attr { + key: 0 + value { + attr { + key: "_user_specified_name" + value { + s: "inputs" + } + } + } + } + arg_attr { + key: 1 + value { + } + } + } +} +versions { + producer: 121 +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-custom-operation.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-custom-operation.pbtxt index 82146716fff..9ce15315832 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-custom-operation.pbtxt +++ 
b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-custom-operation.pbtxt @@ -1,209 +1,8 @@ # RUN: tf-mlir-translate -graphdef-to-mlir %s -o - | FileCheck %s node { - name: "Placeholder" - op: "Placeholder" - attr { - key: "dtype" - value { - type: DT_FLOAT - } - } - attr { - key: "shape" - value { - shape { - unknown_rank: true - } - } - } -} -node { - name: "Placeholder_1" - op: "Placeholder" - attr { - key: "dtype" - value { - type: DT_FLOAT - } - } - attr { - key: "shape" - value { - shape { - unknown_rank: true - } - } - } -} -node { - name: "input0" - op: "TPUReplicatedInput" - input: "Placeholder" - attr { - key: "N" - value { - i: 1 - } - } - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "input1" - op: "TPUReplicatedInput" - input: "Placeholder_1" - attr { - key: "N" - value { - i: 1 - } - } - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "cluster/pivot" - op: "NoOp" -} -node { - name: "TPUReplicateMetadata" - op: "TPUReplicateMetadata" - input: "^cluster/pivot" - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "computation_shape" - value { - list { - } - } - } - attr { - key: "device_assignment" - value { - list { - } - } - } - attr { - key: "host_compute_core" - value { - list { - } - } - } - attr { - key: "num_cores_per_replica" - value { - i: 1 - } - } - attr { - key: "num_replicas" - value { - i: 1 - } - } - attr { - key: "topology" - value { - s: "" - } - } - attr { - key: "use_tpu" - value { - b: true - } - } -} -node { - name: "replicated_input_0" - op: "Identity" - input: "input0" - input: "^TPUReplicateMetadata" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "replicated_input_1" - op: "Identity" - input: "input1" - input: "^TPUReplicateMetadata" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/maximum_iterations" + name: "Constant" op: "Const" - input: "^TPUReplicateMetadata" - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 10 - } - } - } -} -node { - name: "while/iteration_counter" - op: "Const" - input: "^TPUReplicateMetadata" - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } attr { key: "dtype" value { @@ -223,1968 +22,37 @@ node { } } node { - name: "while/Enter" - op: "Enter" - input: "while/iteration_counter" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "frame_name" - value { - s: "while/while_context" - } - } - attr { - key: "is_constant" - value { - b: false - } - } - attr { - key: "parallel_iterations" - value { - i: 10 - } - } -} -node { - name: "while/Enter_1" - op: "Enter" - input: "replicated_input_0" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "frame_name" - value { - s: "while/while_context" - } - } - attr { - key: "is_constant" - value { - b: false - } - } - attr { - key: "parallel_iterations" - value { - i: 10 - } - } -} -node { - name: "while/Enter_2" - op: "Enter" - input: "replicated_input_1" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - 
} - } - attr { - key: "frame_name" - value { - s: "while/while_context" - } - } - attr { - key: "is_constant" - value { - b: false - } - } - attr { - key: "parallel_iterations" - value { - i: 10 - } - } -} -node { - name: "while/Merge" - op: "Merge" - input: "while/Enter" - input: "while/NextIteration" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/Merge_1" - op: "Merge" - input: "while/Enter_1" - input: "while/NextIteration_1" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/Merge_2" - op: "Merge" - input: "while/Enter_2" - input: "while/NextIteration_2" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/Less/Enter" - op: "Enter" - input: "while/maximum_iterations" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "frame_name" - value { - s: "while/while_context" - } - } - attr { - key: "is_constant" - value { - b: true - } - } - attr { - key: "parallel_iterations" - value { - i: 10 - } - } -} -node { - name: "while/Less" - op: "Less" - input: "while/Merge" - input: "while/Less/Enter" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/less_than_5_If8q4vKg9jA" - op: "less_than_5_If8q4vKg9jA" - input: "while/Merge_1" - input: "^while/Merge" - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/LogicalAnd" - op: "LogicalAnd" - input: "while/Less" - input: "while/less_than_5_If8q4vKg9jA" - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/LoopCond" - op: "LoopCond" - input: "while/LogicalAnd" - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/Switch" - op: "Switch" - input: "while/Merge" - input: "while/LoopCond" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_class" - value { - list { - s: "loc:@while/Merge" - } - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/Switch_1" - op: "Switch" - input: "while/Merge_1" - input: "while/LoopCond" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@while/Merge_1" - } - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/Switch_2" - op: "Switch" - input: "while/Merge_2" - input: "while/LoopCond" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@while/Merge_2" - } - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/Identity" - op: "Identity" - input: "while/Switch:1" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/Identity_1" - op: "Identity" - input: "while/Switch_1:1" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/Identity_2" - op: "Identity" - input: 
"while/Switch_2:1" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/add/y" - op: "Const" - input: "^while/Identity" - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 1 - } - } - } -} -node { - name: "while/add" - op: "Add" - input: "while/Identity" - input: "while/add/y" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/add_1/y" - op: "Const" - input: "^while/Identity" - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "dtype" - value { - type: DT_FLOAT - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_FLOAT - tensor_shape { - } - float_val: 1 - } - } - } -} -node { - name: "while/add_1" - op: "Add" - input: "while/Identity_1" - input: "while/add_1/y" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/mul_2_Da30D05wlPU" - op: "mul_2_Da30D05wlPU" - input: "while/Identity_1" - input: "while/Identity_2" - input: "^while/Identity" - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/NextIteration" - op: "NextIteration" - input: "while/add" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/NextIteration_1" - op: "NextIteration" - input: "while/add_1" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/NextIteration_2" - op: "NextIteration" - input: "while/mul_2_Da30D05wlPU" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/Exit" - op: "Exit" - input: "while/Switch" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/Exit_1" - op: "Exit" - input: "while/Switch_1" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/Exit_2" - op: "Exit" - input: "while/Switch_2" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "gradients/Shape" - op: "Shape" - input: "while/Exit_2" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "out_type" - value { - type: DT_INT32 - } - } -} -node { - name: "gradients/grad_ys_0" - op: "Const" - input: "^TPUReplicateMetadata" - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "dtype" - value { - type: DT_FLOAT - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_FLOAT - tensor_shape { - } - float_val: 1 - } - } - } -} -node { - name: "gradients/Fill" - op: "Fill" - input: "gradients/Shape" - input: "gradients/grad_ys_0" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "index_type" - value { - type: DT_INT32 - } - } -} -node { - name: "gradients/f_count" - op: "Const" - 
input: "^TPUReplicateMetadata" - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 0 - } - } - } -} -node { - name: "gradients/f_count_1" - op: "Enter" - input: "gradients/f_count" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "frame_name" - value { - s: "while/while_context" - } - } - attr { - key: "is_constant" - value { - b: false - } - } - attr { - key: "parallel_iterations" - value { - i: 10 - } - } -} -node { - name: "gradients/Merge" - op: "Merge" - input: "gradients/f_count_1" - input: "gradients/NextIteration" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "gradients/Switch" - op: "Switch" - input: "gradients/Merge" - input: "while/LoopCond" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "gradients/Add/y" - op: "Const" - input: "^while/Identity" - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 1 - } - } - } -} -node { - name: "gradients/Add" - op: "Add" - input: "gradients/Switch:1" - input: "gradients/Add/y" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "gradients/f_count_2" - op: "Exit" - input: "gradients/Switch" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "gradients/b_count" - op: "Const" - input: "^TPUReplicateMetadata" - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 1 - } - } - } -} -node { - name: "gradients/b_count_1" - op: "Enter" - input: "gradients/f_count_2" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "frame_name" - value { - s: "gradients/while/while_context" - } - } - attr { - key: "is_constant" - value { - b: false - } - } - attr { - key: "parallel_iterations" - value { - i: 10 - } - } -} -node { - name: "gradients/Merge_1" - op: "Merge" - input: "gradients/b_count_1" - input: "gradients/NextIteration_1" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "gradients/GreaterEqual/Enter" - op: "Enter" - input: "gradients/b_count" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "frame_name" - value { - s: "gradients/while/while_context" - } - } - attr { - key: "is_constant" - value { - b: true - } - } - attr { - key: "parallel_iterations" - value { - i: 10 - } - } -} -node { - name: "gradients/GreaterEqual" - op: "GreaterEqual" - input: "gradients/Merge_1" - input: "gradients/GreaterEqual/Enter" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr 
{ - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "gradients/b_count_2" - op: "LoopCond" - input: "gradients/GreaterEqual" - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "gradients/Switch_1" - op: "Switch" - input: "gradients/Merge_1" - input: "gradients/b_count_2" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "gradients/Sub" - op: "Sub" - input: "gradients/Switch_1:1" - input: "gradients/GreaterEqual/Enter" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "gradients/b_count_3" - op: "Exit" - input: "gradients/Switch_1" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "gradients/zeros_like" - op: "ZerosLike" - input: "while/Exit_1" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "gradients/while/Exit_2_grad/b_exit" - op: "Enter" - input: "gradients/Fill" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "frame_name" - value { - s: "gradients/while/while_context" - } - } - attr { - key: "is_constant" - value { - b: false - } - } - attr { - key: "parallel_iterations" - value { - i: 10 - } - } -} -node { - name: "gradients/while/Exit_1_grad/b_exit" - op: "Enter" - input: "gradients/zeros_like" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "frame_name" - value { - s: "gradients/while/while_context" - } - } - attr { - key: "is_constant" - value { - b: false - } - } - attr { - key: "parallel_iterations" - value { - i: 10 - } - } -} -node { - name: "gradients/while/Switch_2_grad/b_switch" - op: "Merge" - input: "gradients/while/Exit_2_grad/b_exit" - input: "gradients/while/Switch_2_grad_1/NextIteration" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "gradients/while/Merge_2_grad/Switch" - op: "Switch" - input: "gradients/while/Switch_2_grad/b_switch" - input: "gradients/b_count_2" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@gradients/while/Switch_2_grad/b_switch" - } - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "gradients/while/Enter_2_grad/Exit" - op: "Exit" - input: "gradients/while/Merge_2_grad/Switch" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/Const" - op: "Const" - input: "^cluster/pivot" - attr { - key: "_class" - value { - list { - s: "loc:@while/Identity_1" - } - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 1 - } - } - } -} -node { - name: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/mul" - op: "Mul" - input: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/Const" - input: 
"while/maximum_iterations" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_class" - value { - list { - s: "loc:@while/Identity_1" - } - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/f_acc" - op: "StackV2" - input: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/mul" - attr { - key: "_class" - value { - list { - s: "loc:@while/Identity_1" - } - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "elem_type" - value { - type: DT_FLOAT - } - } - attr { - key: "stack_name" - value { - s: "" - } - } -} -node { - name: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/Enter" - op: "Enter" - input: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/f_acc" - attr { - key: "T" - value { - type: DT_RESOURCE - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "frame_name" - value { - s: "while/while_context" - } - } - attr { - key: "is_constant" - value { - b: true - } - } - attr { - key: "parallel_iterations" - value { - i: 10 - } - } -} -node { - name: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/StackPushV2" - op: "StackPushV2" - input: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/Enter" - input: "while/Identity_1" - input: "^gradients/Add" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "swap_memory" - value { - b: false - } - } -} -node { - name: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/StackPopV2/Enter" - op: "Enter" - input: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/f_acc" - attr { - key: "T" - value { - type: DT_RESOURCE - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "frame_name" - value { - s: "gradients/while/while_context" - } - } - attr { - key: "is_constant" - value { - b: true - } - } - attr { - key: "parallel_iterations" - value { - i: 10 - } - } -} -node { - name: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/StackPopV2" - op: "StackPopV2" - input: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/StackPopV2/Enter" - input: "^gradients/Sub" - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "elem_type" - value { - type: DT_FLOAT - } - } -} -node { - name: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/Const_1" - op: "Const" - input: "^cluster/pivot" - attr { - key: "_class" - value { - list { - s: "loc:@while/Identity_2" - } - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 1 - } - } - } -} -node { - name: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/mul_1" - op: "Mul" - input: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/Const_1" - input: "while/maximum_iterations" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_class" - value { - list { - s: "loc:@while/Identity_2" - } - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/f_acc_1" - op: "StackV2" - input: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/mul_1" - attr { - key: "_class" - value { - list { - s: "loc:@while/Identity_2" - } - } - } - 
attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "elem_type" - value { - type: DT_FLOAT - } - } - attr { - key: "stack_name" - value { - s: "" - } - } -} -node { - name: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/Enter_1" - op: "Enter" - input: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/f_acc_1" - attr { - key: "T" - value { - type: DT_RESOURCE - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "frame_name" - value { - s: "while/while_context" - } - } - attr { - key: "is_constant" - value { - b: true - } - } - attr { - key: "parallel_iterations" - value { - i: 10 - } - } -} -node { - name: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/StackPushV2_1" - op: "StackPushV2" - input: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/Enter_1" - input: "while/Identity_2" - input: "^gradients/Add" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "swap_memory" - value { - b: false - } - } -} -node { - name: "gradients/NextIteration" - op: "NextIteration" - input: "gradients/Add" - input: "^gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/StackPushV2" - input: "^gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/StackPushV2_1" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/StackPopV2_1/Enter" - op: "Enter" - input: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/f_acc_1" - attr { - key: "T" - value { - type: DT_RESOURCE - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "frame_name" - value { - s: "gradients/while/while_context" - } - } - attr { - key: "is_constant" - value { - b: true - } - } - attr { - key: "parallel_iterations" - value { - i: 10 - } - } -} -node { - name: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/StackPopV2_1" - op: "StackPopV2" - input: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/StackPopV2_1/Enter" - input: "^gradients/Sub" - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "elem_type" - value { - type: DT_FLOAT - } - } -} -node { - name: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient" - op: "SymbolicGradient" - input: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/StackPopV2" - input: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/StackPopV2_1" - input: "gradients/while/Merge_2_grad/Switch:1" - input: "^gradients/Sub" - attr { - key: "Tin" - value { - list { - type: DT_FLOAT - type: DT_FLOAT - type: DT_FLOAT - } - } - } - attr { - key: "Tout" - value { - list { - type: DT_FLOAT - type: DT_FLOAT - } - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "f" - value { - func { - name: "mul_2_Da30D05wlPU" - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - } - } - } -} -node { - name: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/b_sync" - op: "ControlTrigger" - input: "^cluster/pivot" - input: "^gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/StackPopV2" - input: "^gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/StackPopV2_1" - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "gradients/NextIteration_1" - op: "NextIteration" - input: "gradients/Sub" - input: 
"^gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/b_sync" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "gradients/while/Switch_2_grad_1/NextIteration" - op: "NextIteration" - input: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient:1" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "NoOp" - op: "NoOp" - input: "^cluster/pivot" - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "Identity" - op: "Identity" - input: "gradients/while/Enter_2_grad/Exit" - device: "/device:TPU_REPLICATED_CORE:0" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "output0" - op: "TPUReplicatedOutput" - input: "Identity" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "num_replicas" - value { - i: 1 - } - } -} -node { - name: "TPUCompilationResult" - op: "TPUCompilationResult" - input: "^TPUReplicateMetadata" - attr { - key: "_tpu_compilation_status" - value { - s: "cluster" - } - } -} -node { - name: "output_0_shard_0" - op: "Identity" - input: "output0" - input: "^NoOp" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "ConfigureDistributedTPU" - op: "ConfigureDistributedTPU" - device: "/device:TPU_SYSTEM:0" - attr { - key: "embedding_config" - value { - s: "" - } - } - attr { - key: "is_global_init" - value { - b: false - } - } - attr { - key: "tpu_embedding_config" - value { - s: "" - } - } + name: "_tf.foo" + op: "foo" + input: "Constant" } library { function { signature { - name: "mul_2_Da30D05wlPU" + name: "foo" input_arg { - name: "mul_2_da30d05wlpu" - type: DT_FLOAT - } - input_arg { - name: "mul_2_da30d05wlpu1" - type: DT_FLOAT + name: "arg" + type: DT_INT32 } output_arg { - name: "mul_2_da30d05wlpu2" - type: DT_FLOAT - } - } - node_def { - name: "mul/y" - op: "Const" - attr { - key: "dtype" - value { - type: DT_FLOAT - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_FLOAT - tensor_shape { - dim { - size: 1 - } - dim { - size: 1 - } - } - float_val: 2 - } - } - } - } - node_def { - name: "mul_0" - op: "Mul" - input: "mul_2_da30d05wlpu1" - input: "mul/y:output:0" - attr { - key: "T" - value { - type: DT_FLOAT - } + name: "return_value" + type: DT_INT32 } } ret { - key: "mul_2_da30d05wlpu2" - value: "mul_0:z:0" - } - attr { - key: "_noinline" - value { - b: true - } - } - } - function { - signature { - name: "less_than_5_If8q4vKg9jA" - input_arg { - name: "less_than_5_if8q4vkg9ja" - type: DT_FLOAT - } - output_arg { - name: "less_than_5_if8q4vkg9ja1" - type: DT_BOOL - } - } - node_def { - name: "Less/y" - op: "Const" - attr { - key: "dtype" - value { - type: DT_FLOAT - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_FLOAT - tensor_shape { - } - float_val: 5 - } - } - } - } - node_def { - name: "Less" - op: "Less" - input: "less_than_5_if8q4vkg9ja" - input: "Less/y:output:0" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - } - ret { - key: "less_than_5_if8q4vkg9ja1" - value: "Less:z:0" - } - attr { - key: "_noinline" - value { - b: true - } + key: "return_value" + value: "arg" } } } versions { - producer: 27 + producer: 62 min_consumer: 12 } -# CHECK: func @main() { -# CHECK: %30:2 = "_tf.less_than_5_If8q4vKg9jA0"(%23#0, %29#2) {_tpu_replicate = "cluster", device = "", name = 
"while/less_than_5_If8q4vKg9jA"} : (tensor<*xf32>, !_tf.control) -> (tensor<*xi1>, !_tf.control) -# CHECK: %73:2 = "_tf.mul_2_Da30D05wlPU0"(%58#0, %72#0, %47#1) {_tpu_replicate = "cluster", device = "", name = "while/mul_2_Da30D05wlPU"} : (tensor<*xf32>, tensor<*xf32>, !_tf.control) -> (tensor<*xf32>, !_tf.control) -# CHECK: return -# CHECK-NEXT: } -# CHECK: func @less_than_5_If8q4vKg9jA0(%arg0: tensor<*xf32>) -> tensor<*xi1> -# CHECK-NEXT: attributes {tf._noinline = true} { -# CHECK-NEXT: %0:2 = "_tf.Const"() {device = "", dtype = "tfdtype$DT_FLOAT", name = "Less/y", value = dense<5.000000e+00> : tensor} : () -> (tensor, !_tf.control) -# CHECK-NEXT: %1:2 = "_tf.Less"(%arg0, %0#0) {T = "tfdtype$DT_FLOAT", device = "", name = "Less"} : (tensor<*xf32>, tensor) -> (tensor<*xi1>, !_tf.control) -# CHECK-NEXT: return %1#0 : tensor<*xi1> -# CHECK-NEXT: } -# CHECK: func @mul_2_Da30D05wlPU0(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>) -> tensor<*xf32> -# CHECK-NEXT: attributes {tf._noinline = true} { -# CHECK-NEXT: %0:2 = "_tf.Const"() {device = "", dtype = "tfdtype$DT_FLOAT", name = "mul/y", value = dense<2.000000e+00> : tensor<1x1xf32>} : () -> (tensor<1x1xf32>, !_tf.control) -# CHECK-NEXT: %1:2 = "_tf.Mul"(%arg1, %0#0) {T = "tfdtype$DT_FLOAT", device = "", name = "mul_0"} : (tensor<*xf32>, tensor<1x1xf32>) -> (tensor<*xf32>, !_tf.control) -# CHECK-NEXT: return %1#0 : tensor<*xf32> -# CHECK-NEXT: } +# Verify that we can import a custom operation that maps to a function and that +# the names are matching between the function definition and the uses / call +# site (a numerical suffix may be appended). + +# CHECK: "tf.foo0"( +# CHECK: func @foo0 diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-default-attr.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-default-attr.pbtxt index 46682ab866e..b26d7e7f2ba 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-default-attr.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-default-attr.pbtxt @@ -1,7 +1,15 @@ # RUN: tf-mlir-translate -graphdef-to-mlir %s -o - | FileCheck %s -# CHECK: %3:2 = "_tf.Conv2D"(%2#0, %1#0) {T = "tfdtype$DT_FLOAT", data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], name = "MobilenetV1/MobilenetV1/Conv2d_0/Conv2D", padding = "SAME", strides = [1, 2, 2, 1], use_cudnn_on_gpu = true} -# CHECK-NEXT: %4:2 = "_tf.MaxPool"(%3#0) {T = "tfdtype$DT_FLOAT", data_format = "NHWC", device = "", ksize = [1, 2, 2, 1], name = "MaxPool", padding = "SAME", strides = [1, 2, 2, 1]} +# Verify that the data_format attributes is pulled from the default value in the +# registry when not present in the GraphDef +# CHECK: tf.Conv2D +# CHECK-SAME: data_format = "NHWC" + +# Verify that we can also pull some attributes that are needed to be able to +# create a Graph in memory, like `T`. 
+# CHECK: tf.MaxPool +# CHECK-SAME: T = "tfdtype$DT_FLOAT" + node { name: "input" op: "Placeholder" diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-device-retval.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-device-retval.pbtxt index fcd0e62ab63..157db7d5331 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-device-retval.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-device-retval.pbtxt @@ -74,6 +74,9 @@ library { } # The attribute "experimental_ints_on_device" and the return type INT32 # ensure that kDeviceRetOp is used instead of kRetOp + # CHECK-LABEL: func @foo + # CHECK: tf.experimental_ints_on_device = true + # CHECK: return %{{.*}} tensor attr { key: "experimental_ints_on_device" value { @@ -87,13 +90,3 @@ versions { min_consumer: 12 } -# CHECK: func @main() { -# CHECK-NEXT: %0:2 = "_tf.PartitionedCall"() {Tin = [], Tout = ["tfdtype$DT_INT32"], config = "", config_proto = "", device = "", executor_type = "", f = @foo0, name = "PartitionedCall"} : () -> (tensor<*xi32>, !_tf.control) -# CHECK-NEXT: return -# CHECK-NEXT: } -# CHECK: func @foo0() -> tensor -# CHECK-NEXT: attributes {tf.experimental_ints_on_device = true} { -# CHECK-NEXT: %0:2 = "_tf.Const"() {device = "", dtype = "tfdtype$DT_INT32", name = "Const", value = dense<5> : tensor} : () -> (tensor, !_tf.control) -# CHECK-NEXT: %1:2 = "_tf.Identity"(%0#0) {T = "tfdtype$DT_INT32", device = "", name = "Identity"} : (tensor) -> (tensor, !_tf.control) -# CHECK-NEXT: return %1#0 : tensor -# CHECK-NEXT: } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-empty-tensor-content.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-empty-tensor-content.pbtxt index 441eca84e7e..12d05c1195f 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-empty-tensor-content.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-empty-tensor-content.pbtxt @@ -1,6 +1,9 @@ # RUN: tf-mlir-translate -graphdef-to-mlir %s -o - | FileCheck %s -# CHECK: %0:2 = "_tf.Const"() {device = "", dtype = "tfdtype$DT_FLOAT", name = "Const", value = opaque<"tf", "0x746674656E736F722464747970653A2044545F464C4F41540A74656E736F725F7368617065207B0A202064696D207B0A2020202073697A653A20310A20207D0A7D0A"> : tensor<1xf32>} : () -> (tensor<1xf32>, !_tf.control) +# This test is intended to verify the tensor_content field on import of an empty +# tensor. 
+# CHECK: tf.Const +# CHECK-SAME: value = dense<0.000000e+00> node { name: "Const" diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-func-attr.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-func-attr.pbtxt index e8b9ce86ddb..0176edb4b21 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-func-attr.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-func-attr.pbtxt @@ -1,5 +1,13 @@ # RUN: tf-mlir-translate -graphdef-to-mlir %s -o - | FileCheck %s +# CHECK-LABEL: func @main() { + +# Verify that the NameAttrList is properly turned into reference to functions on import +# CHECK: tf.Case +# CHECK-SAME: branches = [@[[FOO:[a-z0-9]+]], @[[BAR:[a-z0-9]+]]] +# CHECK-DAG: func @[[FOO]]() +# CHECK-DAG: func @[[BAR]]() + node { name: "predicate" op: "Const" @@ -152,16 +160,3 @@ versions { min_consumer: 12 } -# CHECK: func @main() { -# CHECK-NEXT: %0:2 = "_tf.Const"() {device = "", dtype = "tfdtype$DT_INT32", name = "predicate", value = dense<0> : tensor} : () -> (tensor, !_tf.control) -# CHECK-NEXT: %1:2 = "_tf.Case"(%0#0) {Tin = [], Tout = ["tfdtype$DT_FLOAT"], branches = [@foo0, @bar0], device = "", name = "Case", output_shapes = []} : (tensor) -> (tensor<*xf32>, !_tf.control) -# CHECK-NEXT: return -# CHECK-NEXT: } -# CHECK: func @foo0() -> tensor<10xf32> { -# CHECK-NEXT: %0:2 = "_tf.Const"() {device = "", dtype = "tfdtype$DT_FLOAT", name = "const_1", value = dense<1.000000e+00> : tensor<10xf32>} : () -> (tensor<10xf32>, !_tf.control) -# CHECK-NEXT: return %0#0 : tensor<10xf32> -# CHECK-NEXT: } -# CHECK: func @bar0() -> tensor<10xf32> { -# CHECK-NEXT: %0:2 = "_tf.Const"() {device = "", dtype = "tfdtype$DT_FLOAT", name = "const_2", value = dense<2.000000e+00> : tensor<10xf32>} : () -> (tensor<10xf32>, !_tf.control) -# CHECK-NEXT: return %0#0 : tensor<10xf32> -# CHECK-NEXT: } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-control-ret-diff-island.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-control-ret-diff-island.pbtxt new file mode 100644 index 00000000000..9238ea92a20 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-control-ret-diff-island.pbtxt @@ -0,0 +1,111 @@ +# RUN: tf-mlir-translate -graphdef-to-mlir %s -o - | FileCheck %s --dump-input=fail + +# Verify for functions with control return values, the island with only a +# consumed control return value has its control output added to the GraphOps +# FetchOp. 
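The control_output / control_ret entries exercised by these new tests typically come from automatic control dependencies: a stateful op traced inside a tf.function is recorded as a control output of the function so it cannot be pruned. The Python sketch below is illustrative only (it assumes TF 2.x tf.function; the variable and function names are made up and do not come from this patch).

# Illustrative only: the stateful assign becomes a control return of the
# serialized FunctionDef, analogous to "must_execute" in the test above.
import tensorflow as tf

v = tf.Variable(1.0)

@tf.function
def test_fn(a):
    v.assign_add(a)        # stateful: must execute even though its result is unused
    return tf.identity(a)  # ordinary data output

cf = test_fn.get_concrete_function(tf.TensorSpec([], tf.float32))
# The traced graph contains the stateful AssignAddVariableOp; when the function
# is exported, that op is listed under control_ret so importers (like the MLIR
# importer tested here) know it has to be fetched.
print(sorted(op.type for op in cf.graph.get_operations()))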
+ +# Match the island containing the "tf.Neg", capture the output +# CHECK: %[[ISLAND_0:[0-9]*]]:2 = tf_executor.island {{.*[[:space:]].*}} "tf.Neg" + +# Check that the tf.Neg control is passed to the fetch +# CHECK: tf_executor.fetch {{.*}} %[[ISLAND_0]]#1 : tensor<*xf32>, !tf_executor.control + +node { + name: "const" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + } + float_val: 0.0 + } + } + } +} +node { + name: "test_fn_call" + op: "StatefulPartitionedCall" + input: "const" + attr { + key: "Tin" + value { + list { + type: DT_FLOAT + } + } + } + attr { + key: "Tout" + value { + list { + type: DT_FLOAT + } + } + } + attr { + key: "f" + value { + func { + name: "test_fn" + } + } + } +} +library { + function { + signature { + name: "test_fn" + input_arg { + name: "a" + type: DT_FLOAT + } + output_arg { + name: "d" + type: DT_FLOAT + } + control_output: "must_execute" + } + node_def { + name: "b" + op: "Neg" + input: "a" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + } + node_def { + name: "c" + op: "Identity" + input: "a" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + } + ret { + key: "d" + value: "c:output:0" + } + control_ret { + key: "must_execute" + value: "b" + } + } +} +versions { + producer: 121 +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-control-ret-same-island.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-control-ret-same-island.pbtxt new file mode 100644 index 00000000000..adad8b109b6 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-control-ret-same-island.pbtxt @@ -0,0 +1,100 @@ +# RUN: tf-mlir-translate -graphdef-to-mlir %s -o - | FileCheck %s --dump-input=fail + +# Verify for functions with control return values, the island with a consumed +# data output and a consumed control has both its outputs added to the GraphOps +# FetchOp. 
+ +# Match the island containing the "tf.Neg", capture the output +# CHECK: %[[ISLAND:[0-9]*]]:2 = tf_executor.island {{.*[[:space:]].*}} "tf.Neg" + +# Check that the tf.Neg data output and control are passed to the fetch +# CHECK: tf_executor.fetch %[[ISLAND]]#0, %[[ISLAND]]#1 : tensor<*xf32>, !tf_executor.control + +node { + name: "const" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + } + float_val: 0.0 + } + } + } +} +node { + name: "test_fn_call" + op: "StatefulPartitionedCall" + input: "const" + attr { + key: "Tin" + value { + list { + type: DT_FLOAT + } + } + } + attr { + key: "Tout" + value { + list { + type: DT_FLOAT + } + } + } + attr { + key: "f" + value { + func { + name: "test_fn" + } + } + } +} +library { + function { + signature { + name: "test_fn" + input_arg { + name: "a" + type: DT_FLOAT + } + output_arg { + name: "c" + type: DT_FLOAT + } + control_output: "must_execute" + } + node_def { + name: "b" + op: "Neg" + input: "a" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + } + ret { + key: "c" + value: "b:y:0" + } + control_ret { + key: "must_execute" + value: "b" + } + } +} +versions { + producer: 121 +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-defs.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-defs.pbtxt index 40392a6954a..6a2a411d115 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-defs.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-defs.pbtxt @@ -1,5 +1,11 @@ # RUN: tf-mlir-translate -graphdef-to-mlir %s -o - | FileCheck %s +# Verify that we properly import call site function attributes. 
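The call-site attributes checked below (then_branch, plus the hand-written then_branch.how_many / then_branch.ping extras, which have no public Python API) hang off the NameAttrList that a functional If node uses to reference its branch functions. A hypothetical sketch of where such an If/StatelessIf node comes from is shown here; it assumes TF 2.x control flow v2 and is not taken from this patch.

# Illustrative only: tf.cond inside tf.function produces an If/StatelessIf node
# whose then_branch / else_branch attributes reference the branch functions.
import tensorflow as tf

@tf.function
def f(pred, x, y):
    return tf.cond(pred, lambda: x + y, lambda: x * y)

graph = f.get_concrete_function(
    tf.TensorSpec([], tf.bool),
    tf.TensorSpec([], tf.float32),
    tf.TensorSpec([], tf.float32)).graph
if_op = next(op for op in graph.get_operations() if op.type in ("If", "StatelessIf"))
print(if_op.type, if_op.get_attr("then_branch").name)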
+# CHECK: tf.If +# CHECK-SAME: then_branch = @ +# CHECK-SAME: then_branch.how_many = 32 +# CHECK-SAME: then_branch.ping = "ack" + node { name: "Placeholder" op: "Placeholder" @@ -503,36 +509,3 @@ versions { producer: 27 min_consumer: 12 } - -# CHECK: func @main() { -# CHECK-NEXT: %0:2 = "_tf.ConfigureDistributedTPU"() {device = "/device:TPU_SYSTEM:0", embedding_config = "", is_global_init = false, name = "ConfigureDistributedTPU", tpu_embedding_config = ""} : () -> (tensor<*x!tf.string>, !_tf.control) -# CHECK-NEXT: %1:2 = "_tf.Placeholder"() {device = "", dtype = "tfdtype$DT_INT32", name = "Placeholder", shape = "tfshape$unknown_rank: true\0A"} : () -> (tensor<*xi32>, !_tf.control) -# CHECK-NEXT: %2:2 = "_tf.TPUReplicatedInput"(%1#0) {N = 1 : i64, T = "tfdtype$DT_INT32", device = "", name = "input0"} : (tensor<*xi32>) -> (tensor<*xi32>, !_tf.control) -# CHECK-NEXT: %3:2 = "_tf.Placeholder"() {device = "", dtype = "tfdtype$DT_INT32", name = "Placeholder_1", shape = "tfshape$unknown_rank: true\0A"} : () -> (tensor<*xi32>, !_tf.control) -# CHECK-NEXT: %4:2 = "_tf.TPUReplicatedInput"(%3#0) {N = 1 : i64, T = "tfdtype$DT_INT32", device = "", name = "input1"} : (tensor<*xi32>) -> (tensor<*xi32>, !_tf.control) -# CHECK-NEXT: %5 = "_tf.NoOp"() {device = "", name = "cluster/pivot"} : () -> !_tf.control -# CHECK-NEXT: %6 = "_tf.NoOp"(%5) {_tpu_replicate = "cluster", device = "", name = "NoOp"} : (!_tf.control) -> !_tf.control -# CHECK-NEXT: %7 = "_tf.TPUReplicateMetadata"(%5) {_tpu_replicate = "cluster", computation_shape = [], device = "", device_assignment = [], host_compute_core = [], name = "TPUReplicateMetadata", num_cores_per_replica = 1 : i64, num_replicas = 1 : i64, padding_map = [], step_marker_location = "STEP_MARK_AT_ENTRY", topology = "", use_tpu = true} : (!_tf.control) -> !_tf.control -# CHECK-NEXT: %8:2 = "_tf.TPUCompilationResult"(%7) {_tpu_compilation_status = "cluster", device = "", name = "TPUCompilationResult"} : (!_tf.control) -> (tensor, !_tf.control) -# CHECK-NEXT: %9:2 = "_tf.Identity"(%2#0, %7) {T = "tfdtype$DT_INT32", _tpu_replicate = "cluster", device = "", name = "replicated_input_0"} : (tensor<*xi32>, !_tf.control) -> (tensor<*xi32>, !_tf.control) -# CHECK-NEXT: %10:2 = "_tf.Identity"(%4#0, %7) {T = "tfdtype$DT_INT32", _tpu_replicate = "cluster", device = "", name = "replicated_input_1"} : (tensor<*xi32>, !_tf.control) -> (tensor<*xi32>, !_tf.control) -# CHECK-NEXT: %11:2 = "_tf.Less"(%9#0, %10#0) {T = "tfdtype$DT_INT32", _tpu_replicate = "cluster", device = "", name = "Less"} : (tensor<*xi32>, tensor<*xi32>) -> (tensor<*xi1>, !_tf.control) -# CHECK-NEXT: %12:3 = "_tf.If"(%11#0, %10#0, %9#0) {Tcond = "tfdtype$DT_BOOL", Tin = ["tfdtype$DT_INT32", "tfdtype$DT_INT32"], Tout = ["tfdtype$DT_INT32", "tfdtype$DT_INT32"], _tpu_replicate = "cluster", device = "", else_branch = @cond_false0, name = "cond", output_shapes = ["tfshape$unknown_rank: true\0A", "tfshape$unknown_rank: true\0A"], then_branch = @cond_true0, then_branch.how_many = 32 : i64, then_branch.ping = "ack"} : (tensor<*xi1>, tensor<*xi32>, tensor<*xi32>) -> (tensor<*xi32>, tensor<*xi32>, !_tf.control) -# CHECK-NEXT: %13:2 = "_tf.Identity"(%12#0) {T = "tfdtype$DT_INT32", _tpu_replicate = "cluster", device = "/device:TPU_REPLICATED_CORE:0", name = "Identity"} : (tensor<*xi32>) -> (tensor<*xi32>, !_tf.control) -# CHECK-NEXT: %14:2 = "_tf.TPUReplicatedOutput"(%13#0) {T = "tfdtype$DT_INT32", device = "", name = "output0", num_replicas = 1 : i64} : (tensor<*xi32>) -> (tensor<*xi32>, !_tf.control) -# CHECK-NEXT: %15:2 = 
"_tf.Identity"(%14#0, %6) {T = "tfdtype$DT_INT32", device = "", name = "output_0_shard_0"} : (tensor<*xi32>, !_tf.control) -> (tensor<*xi32>, !_tf.control) -# CHECK-NEXT: %16:2 = "_tf.Identity"(%12#1) {T = "tfdtype$DT_INT32", _tpu_replicate = "cluster", device = "/device:TPU_REPLICATED_CORE:0", name = "Identity_1"} : (tensor<*xi32>) -> (tensor<*xi32>, !_tf.control) -# CHECK-NEXT: %17:2 = "_tf.TPUReplicatedOutput"(%16#0) {T = "tfdtype$DT_INT32", device = "", name = "output1", num_replicas = 1 : i64} : (tensor<*xi32>) -> (tensor<*xi32>, !_tf.control) -# CHECK-NEXT: %18:2 = "_tf.Identity"(%17#0, %6) {T = "tfdtype$DT_INT32", device = "", name = "output_1_shard_0"} : (tensor<*xi32>, !_tf.control) -> (tensor<*xi32>, !_tf.control) -# CHECK-NEXT: return -# CHECK-NEXT: } -# CHECK: func @cond_false0(%arg0: tensor<*xi32>, %arg1: tensor<*xi32>) -> (tensor<*xi32>, tensor<*xi32>) { -# CHECK-NEXT: %0:2 = "_tf.Identity"(%arg0) {T = "tfdtype$DT_INT32", device = "", name = "Identity"} : (tensor<*xi32>) -> (tensor<*xi32>, !_tf.control) -# CHECK-NEXT: %1:2 = "_tf.Identity"(%arg1) {T = "tfdtype$DT_INT32", device = "", name = "Identity_1"} : (tensor<*xi32>) -> (tensor<*xi32>, !_tf.control) -# CHECK-NEXT: return %1#0, %0#0 : tensor<*xi32>, tensor<*xi32> -# CHECK-NEXT: } -# CHECK: func @cond_true0(%arg0: tensor<*xi32>, %arg1: tensor<*xi32>) -> (tensor<*xi32>, tensor<*xi32>) { -# CHECK-NEXT: %0:2 = "_tf.Identity"(%arg0) {T = "tfdtype$DT_INT32", device = "", name = "Identity"} : (tensor<*xi32>) -> (tensor<*xi32>, !_tf.control) -# CHECK-NEXT: %1:2 = "_tf.Identity"(%arg1) {T = "tfdtype$DT_INT32", device = "", name = "Identity_1"} : (tensor<*xi32>) -> (tensor<*xi32>, !_tf.control) -# CHECK-NEXT: return %0#0, %1#0 : tensor<*xi32>, tensor<*xi32> -# CHECK-NEXT: } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-static-output.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-static-output.pbtxt index 41107cfbff4..e0e60c04865 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-static-output.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-static-output.pbtxt @@ -1,5 +1,9 @@ # RUN: tf-mlir-translate -graphdef-to-mlir %s -o - | FileCheck %s +# Verify that the return type of the functions is properly inferred +#CHECK: func @get_zeros0(%arg0: tensor<*xi32>) -> tensor<2xi32> +#CHECK: func @identity0(%arg0: tensor<*xi32>) -> tensor<*xi32> + node { name: "Placeholder" op: "Placeholder" @@ -139,16 +143,3 @@ versions { min_consumer: 12 } -#CHECK: func @main() { -#CHECK-NEXT: %0:2 = "_tf.Placeholder"() {device = "", dtype = "tfdtype$DT_BOOL", name = "Placeholder", shape = "tfshape$unknown_rank: true\0A"} : () -> (tensor<*xi1>, !_tf.control) -#CHECK-NEXT: %1:2 = "_tf.Placeholder"() {device = "", dtype = "tfdtype$DT_INT32", name = "Placeholder_1", shape = "tfshape$unknown_rank: true\0A"} : () -> (tensor<*xi32>, !_tf.control) -#CHECK-NEXT: %2:2 = "_tf.If"(%0#0, %1#0) {Tcond = "tfdtype$DT_BOOL", Tin = ["tfdtype$DT_INT32"], Tout = ["tfdtype$DT_INT32"], device = "", else_branch = @get_zeros0, name = "If", output_shapes = [], then_branch = @identity0} : (tensor<*xi1>, tensor<*xi32>) -> (tensor<*xi32>, !_tf.control) -#CHECK-NEXT: return -#CHECK-NEXT: } -#CHECK: func @get_zeros0(%arg0: tensor<*xi32>) -> tensor<2xi32> { -#CHECK-NEXT: %0:2 = "_tf.Const"() {device = "", dtype = "tfdtype$DT_INT32", name = "const", value = dense<[1, 2]> : tensor<2xi32>} : () -> (tensor<2xi32>, !_tf.control) -#CHECK-NEXT: return %0#0 : 
tensor<2xi32> -#CHECK-NEXT: } -#CHECK: func @identity0(%arg0: tensor<*xi32>) -> tensor<*xi32> { -#CHECK-NEXT: return %arg0 : tensor<*xi32> -#CHECK-NEXT: } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-gradient-def.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-gradient-def.pbtxt index c1045bf19af..b7179ae1dcc 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-gradient-def.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-gradient-def.pbtxt @@ -1,5 +1,12 @@ # RUN: tf-mlir-translate -graphdef-to-mlir %s -o - | FileCheck %s +# In GraphDef custom gradient functions are modeled using GradientDef which +# links the function and its gradient. In MLIR a TF ops gradient function is +# added to its list of function attributes. + +# CHECK: func @foo0( +# CHECK-NEXT: tf.gradient = @foo_grad + node { name: "Const" op: "Const" @@ -269,26 +276,3 @@ versions { producer: 29 min_consumer: 12 } - -# CHECK: func @main() { -# CHECK-NEXT: %0:2 = "_tf.Const"() {device = "", dtype = "tfdtype$DT_FLOAT", name = "Const", value = dense<2.500000e-01> : tensor} : () -> (tensor, !_tf.control) -# CHECK-NEXT: %1:2 = "_tf.foo0"(%0#0) {_disable_call_shape_inference = true, device = "", name = "foo"} : (tensor) -> (tensor<*xf32>, !_tf.control) -# CHECK-NEXT: %2:2 = "_tf.Shape"(%1#0) {T = "tfdtype$DT_FLOAT", device = "", name = "gradients/Shape", out_type = "tfdtype$DT_INT32"} : (tensor<*xf32>) -> (tensor, !_tf.control) -# CHECK-NEXT: %3:2 = "_tf.Const"() {device = "", dtype = "tfdtype$DT_FLOAT", name = "gradients/grad_ys_0", value = dense<1.000000e+00> : tensor} : () -> (tensor, !_tf.control) -# CHECK-NEXT: %4:2 = "_tf.Fill"(%2#0, %3#0) {T = "tfdtype$DT_FLOAT", device = "", index_type = "tfdtype$DT_INT32", name = "gradients/Fill"} : (tensor, tensor) -> (tensor<*xf32>, !_tf.control) -# CHECK-NEXT: %5:2 = "_tf.SymbolicGradient"(%0#0, %4#0) {Tin = ["tfdtype$DT_FLOAT", "tfdtype$DT_FLOAT"], Tout = ["tfdtype$DT_FLOAT"], device = "", f = @foo0, f._disable_call_shape_inference = true, name = "gradients/foo_grad/SymbolicGradient"} : (tensor, tensor<*xf32>) -> (tensor, !_tf.control) -# CHECK-NEXT: return -# CHECK-NEXT: } -# CHECK: func @foo_grad0(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>) -> tensor<*xf32> -# CHECK-NEXT: attributes {tf._disable_call_shape_inference = true} { -# CHECK-NEXT: %0:2 = "_tf.Mul"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", device = "", name = "mul_0"} : (tensor<*xf32>, tensor<*xf32>) -> (tensor<*xf32>, !_tf.control) -# CHECK-NEXT: return %0#0 : tensor<*xf32> -# CHECK-NEXT: } -# CHECK: func @foo0(%arg0: tensor<*xf32>) -> tensor<*xf32> -# CHECK-NEXT: attributes {tf._disable_call_shape_inference = true, tf.gradient = @foo_grad0} { -# CHECK-NEXT: %0:2 = "_tf.Exp"(%arg0) {T = "tfdtype$DT_FLOAT", device = "", name = "Exp"} : (tensor<*xf32>) -> (tensor<*xf32>, !_tf.control) -# CHECK-NEXT: %1:2 = "_tf.Neg"(%arg0) {T = "tfdtype$DT_FLOAT", device = "", name = "Neg"} : (tensor<*xf32>) -> (tensor<*xf32>, !_tf.control) -# CHECK-NEXT: %2:2 = "_tf.Exp"(%1#0) {T = "tfdtype$DT_FLOAT", device = "", name = "Exp_1"} : (tensor<*xf32>) -> (tensor<*xf32>, !_tf.control) -# CHECK-NEXT: %3:2 = "_tf.Sub"(%0#0, %2#0) {T = "tfdtype$DT_FLOAT", device = "", name = "sub_0"} : (tensor<*xf32>, tensor<*xf32>) -> (tensor<*xf32>, !_tf.control) -# CHECK-NEXT: return %3#0 : tensor<*xf32> -# CHECK-NEXT: } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-functional-while-loop.pbtxt 
b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-input-func-arg-name-collision.pbtxt similarity index 64% rename from tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-functional-while-loop.pbtxt rename to tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-input-func-arg-name-collision.pbtxt index 456bf4951bd..ba94c600cf2 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-functional-while-loop.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-input-func-arg-name-collision.pbtxt @@ -1,5 +1,12 @@ # RUN: tf-mlir-translate -graphdef-to-mlir %s -tf-input-arrays=input -tf-input-data-types=DT_INT32 -tf-input-shapes='' -tf-output-arrays=while:2 -o - | FileCheck %s +# This check that we don't error out when importing GraphDef containing +# functions with arg name that are the same as the graph input name + +# CHECK: func @main(%arg0: tensor) -> tensor +# CHECK: func @while_body +# CHECK: func @while_cond + node { name: "input" op: "Placeholder" @@ -295,23 +302,3 @@ versions { min_consumer: 12 } -# CHECK: func @main(%arg0: tensor) -> tensor -# CHECK-NEXT: attributes {tf.entry_function = {inputs = "input", outputs = "while"}} { -# CHECK-NEXT: %0:2 = "_tf.Placeholder.input"(%arg0) {_user_specified_name = "input", device = "", dtype = "tfdtype$DT_INT32", name = "input", shape = "tfshape$"} : (tensor) -> (tensor, !_tf.control) -# CHECK-NEXT: %1:2 = "_tf.Const"() {device = "", dtype = "tfdtype$DT_INT32", name = "while/loop_counter", value = dense<0> : tensor} : () -> (tensor, !_tf.control) -# CHECK-NEXT: %2:2 = "_tf.Const"() {device = "", dtype = "tfdtype$DT_INT32", name = "while/maximum_iterations", value = dense<-1> : tensor} : () -> (tensor, !_tf.control) -# CHECK-NEXT: %3:4 = "_tf.While"(%1#0, %2#0, %0#0) {T = ["tfdtype$DT_INT32", "tfdtype$DT_INT32", "tfdtype$DT_INT32"], _lower_using_switch_merge = true, body = @while_body_60, cond = @while_cond_50, device = "", name = "while", output_shapes = ["tfshape$", "tfshape$", "tfshape$"], parallel_iterations = 10 : i64} : (tensor, tensor, tensor) -> (tensor, tensor, tensor, !_tf.control) -# CHECK-NEXT: return %3#2 : tensor -# CHECK-NEXT: } -# CHECK: func @while_body_60(%arg0: tensor<*xi32>, %arg1: tensor<*xi32>, %arg2: tensor<*xi32>) -> (tensor<*xi32>, tensor<*xi32>, tensor<*xi32>) { -# CHECK-NEXT: %0:2 = "_tf.Const"() {device = "", dtype = "tfdtype$DT_INT32", name = "Add/y", value = dense<1> : tensor} : () -> (tensor, !_tf.control) -# CHECK-NEXT: %1:2 = "_tf.Const"() {device = "", dtype = "tfdtype$DT_INT32", name = "add_1/y", value = dense<1> : tensor} : () -> (tensor, !_tf.control) -# CHECK-NEXT: %2:2 = "_tf.Add"(%arg2, %0#0) {T = "tfdtype$DT_INT32", device = "", name = "Add"} : (tensor<*xi32>, tensor) -> (tensor<*xi32>, !_tf.control) -# CHECK-NEXT: %3:2 = "_tf.Add"(%arg0, %1#0) {T = "tfdtype$DT_INT32", device = "", name = "add_1"} : (tensor<*xi32>, tensor) -> (tensor<*xi32>, !_tf.control) -# CHECK-NEXT: return %3#0, %arg1, %2#0 : tensor<*xi32>, tensor<*xi32>, tensor<*xi32> -# CHECK-NEXT: } -# CHECK: func @while_cond_50(%arg0: tensor<*xi32>, %arg1: tensor<*xi32>, %arg2: tensor<*xi32>) -> tensor<*xi1> { -# CHECK-NEXT: %0:2 = "_tf.Const"() {device = "", dtype = "tfdtype$DT_INT32", name = "Less/y", value = dense<10> : tensor} : () -> (tensor, !_tf.control) -# CHECK-NEXT: %1:2 = "_tf.Less"(%arg2, %0#0) {T = "tfdtype$DT_INT32", device = "", name = "Less"} : (tensor<*xi32>, tensor) -> (tensor<*xi1>, !_tf.control) -# CHECK-NEXT: return %1#0 : tensor<*xi1> -# CHECK-NEXT: } diff 
--git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-library.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-library.pbtxt index 83ca4466869..17b2655aa5d 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-library.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-library.pbtxt @@ -36,15 +36,13 @@ versions { min_consumer: 12 } -# CHECK: func @main() { -# CHECK-NEXT: %0 = "_tf.foo0"() {device = "", name = "unnamed"} : () -> !_tf.control -# CHECK-NEXT: %1 = "_tf.bar0"() {device = "", name = "unnamed1"} : () -> !_tf.control -# CHECK-NEXT: return -# CHECK-NEXT: } -# CHECK: func @foo0() { -# CHECK-NEXT: %0 = "_tf.bar0"() {device = "", name = "unnamed"} : () -> !_tf.control -# CHECK-NEXT: return -# CHECK-NEXT: } -# CHECK: func @bar0() { -# CHECK-NEXT: return -# CHECK-NEXT: } +# Verify that functions from the library are properly imported. + +# CHECK-LABEL: func @main() { +# CHECK: "tf.foo0"() +# CHECK: "tf.bar0"() + +# CHECK-LABEL: func @foo0() { +# CHECK: "tf.bar0"() + +# CHECK-LABEL: func @bar0() { diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-malformed.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-malformed.pbtxt index 97e22256495..0a5aba285dc 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-malformed.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-malformed.pbtxt @@ -1,4 +1,4 @@ -# RUN: tf-mlir-translate -graphdef-to-mlir %s -o - 2>&1 | FileCheck %s; test ${PIPESTATUS[1]} -eq 0 +# RUN: tf-mlir-translate -graphdef-to-mlir %s -o - 2>&1 | FileCheck %s; test ${PIPESTATUS[0]} -ne 0 this is not a valid graph def diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-scalar-input.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-scalar-input.pbtxt index daef0054fd6..37f7a876814 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-scalar-input.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-scalar-input.pbtxt @@ -1,5 +1,16 @@ # RUN: tf-mlir-translate -graphdef-to-mlir %s -tf-input-arrays=input -tf-input-data-types=DT_FLOAT -tf-input-shapes='' -tf-output-arrays=out:1,out -o - | FileCheck %s +# Verify that we match correctly the input / output when they are scalar. 
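The -tf-output-arrays=out:1,out flag in the RUN line uses TensorFlow's usual "node_name:output_index" convention. The sketch below is illustrative only and assumes the tf.compat.v1 API; it rebuilds the shape of this test's graph (scalar Placeholder -> Relu -> IdentityN named "out") so that both "out:0" and "out:1" are addressable outputs.

# Illustrative only: addressing individual outputs of a multi-output node by
# "name:index", the same convention the translator flags use.
import tensorflow.compat.v1 as tf

g = tf.Graph()
with g.as_default():
    x = tf.placeholder(tf.float32, shape=[], name="input")
    r = tf.nn.relu(x)
    tf.identity_n([r, r], name="out")  # two outputs: out:0 and out:1

print(g.get_tensor_by_name("out:0"), g.get_tensor_by_name("out:1"))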
+ +# CHECK: func @main(%arg0: tensor) -> (tensor, tensor) +# CHECK-NEXT: attributes {tf.entry_function = {inputs = "input", outputs = "out"}} { +# CHECK: "tf.Placeholder.input"(%arg0) + +# CHECK: tf.Relu +# CHECK: %[[IDENTITY:[0-9]+]]:3 = tf_executor.island +# CHECK-NEXT: tf.Identity +# CHECK: fetch %[[IDENTITY]]#1, %[[IDENTITY]]#0 : tensor, tensor + node { name: "input" op: "Placeholder" @@ -52,11 +63,3 @@ node { versions { producer: 27 } - -# CHECK: func @main(%arg0: tensor) -> (tensor, tensor) -# CHECK-NEXT: attributes {tf.entry_function = {inputs = "input", outputs = "out"}} { -# CHECK-NEXT: %0:2 = "_tf.Placeholder.input"(%arg0) {device = "/device:CPU:0", dtype = "tfdtype$DT_FLOAT", name = "input", shape = "tfshape$"} : (tensor) -> (tensor, !_tf.control) -# CHECK-NEXT: %1:2 = "_tf.Relu"(%0#0) {T = "tfdtype$DT_FLOAT", device = "/device:CPU:0", name = "Relu"} : (tensor) -> (tensor, !_tf.control) -# CHECK-NEXT: %2:3 = "_tf.IdentityN"(%1#0, %1#0) {T = ["tfdtype$DT_FLOAT", "tfdtype$DT_FLOAT"], device = "", name = "out"} : (tensor, tensor) -> (tensor, tensor, !_tf.control) -# CHECK-NEXT: return %2#1, %2#0 : tensor, tensor -# CHECK-NEXT: } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-uint8-return.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-uint8-return.pbtxt index 32b816f5e39..9ae5601fa57 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-uint8-return.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-uint8-return.pbtxt @@ -104,8 +104,8 @@ versions { } # CHECK: func @main -# CHECK: "_tf.PartitionedCall"() +# CHECK: "tf.PartitionedCall"() # CHECK-SAME: Tout = ["tfdtype$DT_UINT8"] # CHECK-SAME: f = @[[FUNCTION:[A-Za-z0-9_]*]] # CHECK: func @[[FUNCTION]]() -> tensor -# CHECK: return {{%[0-9]*#[0-9]*}} : tensor +# CHECK: return {{.*}} : tensor diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-undefined-output.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-undefined-output.pbtxt index 4fa8407c0dd..6816088322d 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-undefined-output.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-undefined-output.pbtxt @@ -1,4 +1,4 @@ -# RUN: tf-mlir-translate -graphdef-to-mlir %s -tf-input-arrays=input -tf-input-data-types=DT_FLOAT -tf-input-shapes='' -tf-output-arrays=NotANodeInTheGraph -o - 2>&1 | FileCheck %s; test ${PIPESTATUS[1]} -eq 0 +# RUN: tf-mlir-translate -graphdef-to-mlir %s -tf-input-arrays=input -tf-input-data-types=DT_FLOAT -tf-input-shapes='' -tf-output-arrays=NotANodeInTheGraph -o - 2>&1 | FileCheck %s; test ${PIPESTATUS[0]} -ne 0 # CHECK: Graph import failed: Invalid argument: Output NotANodeInTheGraph was not found in graph diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-version-info.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-version-info.pbtxt index 5f8e7854161..20bf33d7fb2 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-version-info.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-version-info.pbtxt @@ -29,7 +29,6 @@ node { size: 2 } } - tensor_content: "\350\251\242>\276\335r?" 
} } } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-while-loop.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-while-loop.pbtxt index ac84234e4ac..4ada2f6f71c 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-while-loop.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-while-loop.pbtxt @@ -1,5 +1,14 @@ # RUN: tf-mlir-translate -graphdef-to-mlir -mlir-print-debuginfo %s -o - | FileCheck %s +# Verify that importing a Graph with a backedge leads to two NextIteration nodes +# to break the cycle. + +# CHECK-LABEL: func @main() +# CHECK: %[[NEXTITERATION:[0-9]+]]:3 = tf_executor.NextIteration.Source +# CHECK: tf_executor.Merge {{.*}} %[[NEXTITERATION]]#0 + +# CHECK: tf_executor.NextIteration.Sink [%[[NEXTITERATION]]#1] + node { name: "Const" op: "Const" @@ -203,20 +212,3 @@ versions { producer: 27 } -# CHECK: func @main() { -# CHECK-NEXT: %0:2 = "_tf.NextIteration.source"() {T = "tfdtype$DT_INT32", device = "", name = "while/NextIteration"} : () -> (tensor<*xi32>, !_tf.control) loc("while/NextIteration") -# CHECK-NEXT: %1:2 = "_tf.Const"() {device = "", dtype = "tfdtype$DT_INT32", name = "Const", value = dense<0> : tensor} : () -> (tensor, !_tf.control) loc("Const") -# CHECK-NEXT: %2:2 = "_tf.Enter"(%1#0) {T = "tfdtype$DT_INT32", device = "", frame_name = "while/while_context", is_constant = false, name = "while/Enter", parallel_iterations = 10 : i64} : (tensor) -> (tensor<*xi32>, !_tf.control) loc("while/Enter") -# CHECK-NEXT: %3:3 = "_tf.Merge"(%2#0, %0#0) {N = 2 : i64, T = "tfdtype$DT_INT32", device = "", name = "while/Merge"} : (tensor<*xi32>, tensor<*xi32>) -> (tensor<*xi32>, tensor, !_tf.control) loc("while/Merge") -# CHECK-NEXT: %4:2 = "_tf.Const"(%3#2) {device = "", dtype = "tfdtype$DT_INT32", name = "while/Less/y", value = dense<10> : tensor} : (!_tf.control) -> (tensor, !_tf.control) loc("while/Less/y") -# CHECK-NEXT: %5:2 = "_tf.Less"(%3#0, %4#0) {T = "tfdtype$DT_INT32", device = "", name = "while/Less"} : (tensor<*xi32>, tensor) -> (tensor<*xi1>, !_tf.control) loc("while/Less") -# CHECK-NEXT: %6:2 = "_tf.LoopCond"(%5#0) {device = "", name = "while/LoopCond"} : (tensor<*xi1>) -> (tensor, !_tf.control) loc("while/LoopCond") -# CHECK-NEXT: %7:3 = "_tf.Switch"(%3#0, %6#0) {T = "tfdtype$DT_INT32", _class = ["loc:@while/Merge"], device = "", name = "while/Switch"} : (tensor<*xi32>, tensor) -> (tensor<*xi32>, tensor<*xi32>, !_tf.control) loc("while/Switch") -# CHECK-NEXT: %8:2 = "_tf.Exit"(%7#0) {T = "tfdtype$DT_INT32", device = "", name = "while/Exit"} : (tensor<*xi32>) -> (tensor<*xi32>, !_tf.control) loc("while/Exit") -# CHECK-NEXT: %9:2 = "_tf.Identity"(%7#1) {T = "tfdtype$DT_INT32", device = "", name = "while/Identity"} : (tensor<*xi32>) -> (tensor<*xi32>, !_tf.control) loc("while/Identity") -# CHECK-NEXT: %10:2 = "_tf.Const"(%9#1) {device = "", dtype = "tfdtype$DT_INT32", name = "while/Add/y", value = dense<1> : tensor} : (!_tf.control) -> (tensor, !_tf.control) loc("while/Add/y") -# CHECK-NEXT: %11:2 = "_tf.Add"(%9#0, %10#0) {T = "tfdtype$DT_INT32", device = "", name = "while/Add"} : (tensor<*xi32>, tensor) -> (tensor<*xi32>, !_tf.control) loc("while/Add") -# CHECK-NEXT: %12 = "_tf.NextIteration.sink"(%11#0) {T = "tfdtype$DT_INT32", device = "", name = "while/NextIteration"} : (tensor<*xi32>) -> !_tf.control loc("while/NextIteration") -# CHECK-NEXT: return loc(unknown) -# CHECK-NEXT: } - diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/invalid-output-index.pbtxt 
b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/invalid-output-index.pbtxt new file mode 100644 index 00000000000..6fec080be58 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/invalid-output-index.pbtxt @@ -0,0 +1,14 @@ +# RUN: tf-mlir-translate -graphdef-to-mlir %s -tf-input-arrays=input -tf-input-data-types=DT_FLOAT -tf-input-shapes='' -tf-output-arrays=input:1 -o - 2>&1 | FileCheck %s; test ${PIPESTATUS[0]} -ne 0 + +# CHECK: Graph import failed: Invalid argument: Invalid output index 1 specified for node: input + +node { + name: "input" + op: "Placeholder" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/legacy-fed-input-without-inputs.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/legacy-fed-input-without-inputs.pbtxt new file mode 100644 index 00000000000..c6d00a6f337 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/legacy-fed-input-without-inputs.pbtxt @@ -0,0 +1,30 @@ +# RUN: tf-mlir-translate -graphdef-to-mlir %s -tf-input-arrays=input -tf-input-data-types=DT_FLOAT -tf-input-shapes='' -tf-output-arrays=input -tf-convert-legacy-fed-inputs -o - | FileCheck %s + +# Verify that invalid LegacyFedInput ops without any inputs are replaced with +# Placeholder ops. + +# CHECK-NOT: LegacyFedInput +# CHECK: tf.Placeholder.input{{.*}}(tensor) -> tensor +# CHECK-NOT: LegacyFedInput + +node { + name: "input" + op: "LegacyFedInput" + attr { + key: "input_def" + value { + s: "name: \"batch_1\"\n[dist_belief.ImageInputDef.ext] {\n num_rows: 128\n num_cols: 128\n mean_value: 128\n std_value: 128\n colorspace: RGB\n}\n" + } + } + attr { + key: "output_types" + value { + list { + type: DT_FLOAT + } + } + } +} +versions { + producer: 27 +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/multiple-use-next-iteration.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/multiple-use-next-iteration.pbtxt index 6baa4973407..09a900e8917 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/multiple-use-next-iteration.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/multiple-use-next-iteration.pbtxt @@ -1,5 +1,13 @@ # RUN: tf-mlir-translate -graphdef-to-mlir %s -o - | FileCheck %s +# Verify that a NextIteration node feeding two different merge nodes is properly +# Imported. 
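The raw Enter/Merge/Switch/NextIteration/Exit nodes these import tests consume are what a v1-style while loop lowers to in graph mode. The sketch below is illustrative only; it assumes tf.compat.v1 with control flow v2 disabled and is not part of this patch.

# Illustrative only: building a v1 while loop and listing the low-level
# control-flow ops it produces, including the NextIteration backedge.
import tensorflow.compat.v1 as tf

tf.disable_control_flow_v2()  # keep the low-level control-flow ops
g = tf.Graph()
with g.as_default():
    i = tf.constant(0)
    tf.while_loop(lambda i: i < 10, lambda i: i + 1, [i])

print(sorted({op.type for op in g.get_operations()}))
# expected to include 'Enter', 'Merge', 'Switch', 'NextIteration', 'LoopCond', 'Exit'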
+ +# CHECK-LABEL: func @main() +# CHECK: %[[NEXTITERATION:[0-9]+]]:3 = tf_executor.NextIteration.Source +# CHECK: tf_executor.Merge {{.*}}, %[[NEXTITERATION]]#0 +# CHECK: tf_executor.Merge {{.*}}, %[[NEXTITERATION]]#0 + node { name: "Const" op: "Const" @@ -137,14 +145,3 @@ versions { producer: 62 } -# CHECK: func @main() { -# CHECK-NEXT: %0:2 = "_tf.NextIteration.source"() {T = "tfdtype$DT_INT32", device = "", name = "NextIteration"} : () -> (tensor<*xi32>, !_tf.control) -# CHECK-NEXT: %1:2 = "_tf.Const"() {device = "", dtype = "tfdtype$DT_INT32", name = "Add/y", value = dense<1> : tensor} : () -> (tensor, !_tf.control) -# CHECK-NEXT: %2:2 = "_tf.Const"() {device = "", dtype = "tfdtype$DT_INT32", name = "Const", value = dense<0> : tensor} : () -> (tensor, !_tf.control) -# CHECK-NEXT: %3:2 = "_tf.Enter"(%2#0) {T = "tfdtype$DT_INT32", device = "", frame_name = "while_context", is_constant = false, name = "Enter", parallel_iterations = 10 : i64} : (tensor) -> (tensor<*xi32>, !_tf.control) -# CHECK-NEXT: %4:3 = "_tf.Merge"(%3#0, %0#0) {N = 2 : i64, T = "tfdtype$DT_INT32", device = "", name = "Merge"} : (tensor<*xi32>, tensor<*xi32>) -> (tensor<*xi32>, tensor, !_tf.control) -# CHECK-NEXT: %5:2 = "_tf.Add"(%4#0, %1#0) {T = "tfdtype$DT_INT32", device = "", name = "Add"} : (tensor<*xi32>, tensor) -> (tensor<*xi32>, !_tf.control) -# CHECK-NEXT: %6 = "_tf.NextIteration.sink"(%5#0) {T = "tfdtype$DT_INT32", device = "", name = "NextIteration"} : (tensor<*xi32>) -> !_tf.control -# CHECK-NEXT: %7:3 = "_tf.Merge"(%3#0, %0#0) {N = 2 : i64, T = "tfdtype$DT_INT32", device = "", name = "Use_NextIteration_Again"} : (tensor<*xi32>, tensor<*xi32>) -> (tensor<*xi32>, tensor, !_tf.control) -# CHECK-NEXT: return -# CHECK-NEXT: } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/prune_unused_nodes.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/prune_unused_nodes.pbtxt index a745cf302e9..7715a0eb9df 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/prune_unused_nodes.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/prune_unused_nodes.pbtxt @@ -1,5 +1,10 @@ # RUN: tf-mlir-translate -graphdef-to-mlir %s -tf-prune-unused-nodes -tf-input-arrays=input0,input1,unused_input -tf-input-data-types=DT_INT32,DT_INT32,DT_INT32 -tf-input-shapes=10:10:10 -tf-output-arrays=Add -o - | FileCheck %s +# Verify that an unused Node (here named "Prune") isn't converted when we +# request pruning on import. 
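Pruning on import mirrors what tf.compat.v1.graph_util.extract_sub_graph does on a GraphDef: nodes unreachable from the requested outputs, like "Prune" in this test, are dropped. The sketch below is illustrative only; it builds a tiny GraphDef by hand and reuses the node names from the test.

# Illustrative only: extracting the subgraph needed for "Add" drops the
# unreachable "Prune" node, analogous to -tf-prune-unused-nodes.
import tensorflow.compat.v1 as tf

gd = tf.GraphDef()
for name in ("Prune", "input0", "input1"):
    n = gd.node.add()
    n.name = name
    n.op = "Placeholder"
    n.attr["dtype"].type = tf.int32.as_datatype_enum
add = gd.node.add()
add.name = "Add"
add.op = "Add"
add.input.extend(["input0", "input1"])
add.attr["T"].type = tf.int32.as_datatype_enum

pruned = tf.graph_util.extract_sub_graph(gd, ["Add"])
print([n.name for n in pruned.node])  # "Prune" does not appear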
+# CHECK-LABEL: func @main +# CHECK-NOT: Prune + node { name: "Prune" op: "Const" @@ -66,13 +71,3 @@ node { versions { producer: 27 } - -# CHECK: func @main(%arg0: tensor<10xi32>, %arg1: tensor<10xi32>, %arg2: tensor<10xi32>) -> tensor<10xi32> -# CHECK-NEXT: attributes {tf.entry_function = {inputs = "input0, input1, unused_input", outputs = "Add"}} { -# CHECK-NEXT: %0:2 = "_tf.Placeholder.input"(%arg0) {device = "", dtype = "tfdtype$DT_INT32", name = "input0", shape = "tfshape$dim {\0A size: 10\0A}\0A"} : (tensor<10xi32>) -> (tensor<10xi32>, !_tf.control) -# CHECK-NEXT: %1:2 = "_tf.Placeholder.input"(%arg1) {device = "", dtype = "tfdtype$DT_INT32", name = "input1", shape = "tfshape$dim {\0A size: 10\0A}\0A"} : (tensor<10xi32>) -> (tensor<10xi32>, !_tf.control) -# CHECK-NEXT: %2:2 = "_tf.Add"(%0#0, %1#0) {T = "tfdtype$DT_INT32", device = "", name = "Add"} : (tensor<10xi32>, tensor<10xi32>) -> (tensor<10xi32>, !_tf.control) -# CHECK-NEXT: %3:2 = "_tf.Placeholder.input"(%arg2) {device = "", dtype = "tfdtype$DT_INT32", name = "unused_input", shape = "tfshape$dim {\0A size: 10\0A}\0A"} : (tensor<10xi32>) -> (tensor<10xi32>, !_tf.control) -# CHECK-NEXT: return %2#0 : tensor<10xi32> -# CHECK-NEXT: } - diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/quint8-const.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/quint8-const.pbtxt index 096264737da..748bc996f36 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/quint8-const.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/quint8-const.pbtxt @@ -27,6 +27,6 @@ versions { producer: 70 } -# CHECK: "_tf.Const"() +# CHECK: tf.Const # CHECK-SAME: name = "Quantized_Constant" # CHECK-SAME: value = opaque<"tf", "{{0[xX][0-9a-fA-F]*}}"> : tensor diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/stateful-attribute.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/stateful-attribute.pbtxt index 32007150bcd..54877e873e3 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/stateful-attribute.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/stateful-attribute.pbtxt @@ -82,7 +82,7 @@ versions { # Find PartitionedCall ops in main and match the callee name. # CHECK: func @main -# CHECK: "_tf.PartitionedCall" +# CHECK: "tf.PartitionedCall" # CHECK-SAME: f = @[[FUNCTION_FOO:[a-zA-Z0-9_]*]] # Find callee and verify it has the stateful attribute set. 
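A hypothetical sketch of where a StatefulPartitionedCall (the call op the stateful-attribute test matches) comes from on the Python side: a tf.function callee that touches state is invoked through StatefulPartitionedCall, while a pure callee can use PartitionedCall. This assumes TF 2.x tracing and is not taken from this patch.

# Illustrative only: a stateful callee shows up as a StatefulPartitionedCall
# node in the caller's traced graph.
import tensorflow as tf

v = tf.Variable(1.0)

@tf.function
def callee(x):
    v.assign(x)      # stateful body
    return x + 1.0

@tf.function
def caller(x):
    return callee(x)

graph = caller.get_concrete_function(tf.TensorSpec([], tf.float32)).graph
print({op.type for op in graph.get_operations()})
# expected to include 'StatefulPartitionedCall'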
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/string-attr.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/string-attr.pbtxt index 790fb0c7334..707b04473f3 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/string-attr.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/string-attr.pbtxt @@ -1,4 +1,9 @@ # RUN: tf-mlir-translate -graphdef-to-splatted-mlir %s -o - | FileCheck %s + +# CHECK: tf.Const +# CHECK-SAME: _output_shapes = ["tfshape$dim { size: 3 }"] +# CHECK-SAME: value = opaque<"tf", "0x746674656E736F722464747970653A2044545F535452494E472074656E736F725F7368617065207B2064696D207B2073697A653A2033207D207D2074656E736F725F636F6E74656E743A20225C3030305C3030305C30303022"> : tensor<3x!tf.string> + node { name: "save/SaveV2/shape_and_slices" op: "Const" @@ -40,8 +45,3 @@ node { versions { producer: 74 } - -# CHECK: func @main() { -# CHECK-NEXT: %0:2 = "_tf.Const"() {_output_shapes = ["tfshape$dim {\0A size: 3\0A}\0A"], device = "", dtype = "tfdtype$DT_STRING", name = "save/SaveV2/shape_and_slices", value = opaque<"tf", "0x746674656E736F722464747970653A2044545F535452494E470A74656E736F725F7368617065207B0A202064696D207B0A2020202073697A653A20330A20207D0A7D0A737472696E675F76616C3A2022220A737472696E675F76616C3A2022220A737472696E675F76616C3A2022220A"> : tensor<3x!tf.string>} : () -> (tensor<3x!tf.string>, !_tf.control) -# CHECK-NEXT: return -# CHECK-NEXT: } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/switch_n.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/switch_n.pbtxt new file mode 100644 index 00000000000..ea3b143d63e --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/switch_n.pbtxt @@ -0,0 +1,270 @@ +# RUN: tf-mlir-translate -graphdef-to-splatted-mlir %s -o - | FileCheck %s --dump-input-on-failure + +# CHECK: tf_executor.SwitchN +# CHECK-SAME: of 3 : tensor +# CHECK-SAME: T = "tfdtype$DT_INT32" +# CHECK-SAME: name = "Case/branch_index/_3" + +node { + name: "Case/branch_index" + op: "Const" + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + } + int_val: 0 + } + } + } +} +node { + name: "Case/input_0" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + } + float_val: 1 + } + } + } +} +node { + name: "Case/branch_index/_3" + op: "_SwitchN" + input: "Case/branch_index" + input: "Case/branch_index" + attr { + key: "T" + value { + type: DT_INT32 + } + } + attr { + key: "num_outs" + value { + i: 3 + } + } +} +node { + name: "Case/Case/input_0/_7" + op: "_SwitchN" + input: "Case/input_0" + input: "Case/branch_index" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_class" + value { + list { + s: "Case/input_0" + } + } + } + attr { + key: "num_outs" + value { + i: 3 + } + } +} +node { + name: "Case/pivot_0/_4" + op: "Identity" + input: "Case/branch_index/_3" + attr { + key: "T" + value { + type: DT_INT32 + } + } +} +node { + name: "Case/pivot_1/_5" + op: "Identity" + input: "Case/branch_index/_3:1" + attr { + key: "T" + value { + type: DT_INT32 + } + } +} +node { + name: "Case/pivot_2/_6" + op: "Identity" + input: "Case/branch_index/_3:2" + attr { + key: "T" + value { + type: DT_INT32 + } + } +} +node { + name: "Case/branch0/_0/mul/y" + op: "Const" + input: "^Case/pivot_0/_4" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: 
"value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + } + float_val: 2 + } + } + } +} +node { + name: "Case/branch1/_1/mul/y" + op: "Const" + input: "^Case/pivot_1/_5" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + } + float_val: 3 + } + } + } +} +node { + name: "Case/branch2/_2/mul/y" + op: "Const" + input: "^Case/pivot_2/_6" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + } + float_val: 4 + } + } + } +} +node { + name: "Case/branch0/_0/mul_0" + op: "Mul" + input: "Case/Case/input_0/_7" + input: "Case/branch0/_0/mul/y" + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +node { + name: "Case/branch1/_1/mul_0" + op: "Mul" + input: "Case/Case/input_0/_7:1" + input: "Case/branch1/_1/mul/y" + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +node { + name: "Case/branch2/_2/mul_0" + op: "Mul" + input: "Case/Case/input_0/_7:2" + input: "Case/branch2/_2/mul/y" + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +node { + name: "Case/merge/_9" + op: "Merge" + input: "Case/branch0/_0/mul_0" + input: "Case/branch1/_1/mul_0" + input: "Case/branch2/_2/mul_0" + attr { + key: "N" + value { + i: 3 + } + } + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +node { + name: "__inference_run_240_RetVal" + op: "_Retval" + input: "Case/merge/_9" + device: "/job:localhost/replica:0/task:0/device:CPU:0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "index" + value { + i: 0 + } + } +} +library { +} +versions { + producer: 126 +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/tensor-list.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/tensor-list.pbtxt index a8802a99456..cc24caae6e8 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/tensor-list.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/tensor-list.pbtxt @@ -209,10 +209,10 @@ versions { } # Verify that list element shape and dtype are expected. -# CHECK: _tf.TensorListReserve{{.*}}(tensor<2xi32>, tensor) -> (tensor>>, !_tf.control) +# CHECK: tf.TensorListReserve{{.*}}(tensor<2xi32>, tensor) -> tensor>> # Nested variant type. -# CHECK: _tf.TensorListReserve{{.*}}(tensor<2xi32>, tensor) -> (tensor>>, !_tf.control) +# CHECK: tf.TensorListReserve{{.*}}(tensor<2xi32>, tensor) -> tensor>> -# CHECK: _tf.TensorListSetItem{{.*}}(tensor>>, tensor, tensor<2x2xf32>) -> (tensor>>, !_tf.control) -# CHECK: _tf.TensorListStack{{.*}}(tensor>>, tensor) -> (tensor, !_tf.control) +# CHECK: tf.TensorListSetItem{{.*}}(tensor>>, tensor, tensor<2x2xf32>) -> tensor>> +# CHECK: tf.TensorListStack{{.*}}(tensor>>, tensor) -> tensor diff --git a/tensorflow/compiler/mlir/tensorflow/tests/isolate-placer.mlir b/tensorflow/compiler/mlir/tensorflow/tests/isolate-placer.mlir index 2259d301dc8..4566ffb507c 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/isolate-placer.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/isolate-placer.mlir @@ -13,15 +13,19 @@ func @foo(%arg0: tensor) -> tensor { // The IsolatePlacerInspectionRequiredOpsPass adds Identities for each input/output of function-calling ops. // Capture the result of input to function call. 
-// CHECK: [[VARIABLE_REG:%[0-9]*]]:2 = "_tf.VarHandleOp"() +// CHECK: [[VARIABLE_REG:%[0-9]*]]:2 = tf_executor.island +// CHECK-NEXT: "tf.VarHandleOp"() // Test for the presence of Identity op between input and function call. -// CHECK-NEXT: [[IDENTITY_REG:%[0-9]*]]:2 = "_tf.Identity"([[VARIABLE_REG]]#0) -// CHECK-NEXT: [[CALL_RESULT_REG:%[0-9]*]]:2 = "_tf.StatefulPartitionedCall"([[IDENTITY_REG]]#0) +// CHECK: [[IDENTITY_REG:%[0-9]*]]:2 = tf_executor.island +// CHECK-NEXT: "tf.Identity"([[VARIABLE_REG]]#0) + +// CHECK: [[CALL_RESULT_REG:%[0-9]*]]:2 = tf_executor.island +// CHECK-NEXT: "tf.StatefulPartitionedCall"([[IDENTITY_REG]]#0) // CHECK-SAME: f = @[[FUNCTION:[a-zA-Z0-9_]*]] // Match the inserted Identity op for call output. -// CHECK-NEXT: "_tf.Identity"([[CALL_RESULT_REG]]#0) +// CHECK: "tf.Identity"([[CALL_RESULT_REG]]#0) // Match the function name // CHECK: func @[[FUNCTION]] diff --git a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/convert_tensor.mlir b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/convert_tensor.mlir new file mode 100644 index 00000000000..52e4c529815 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/convert_tensor.mlir @@ -0,0 +1,16 @@ +// RUN: tf-mlir-translate -mlir-to-graphdef %s -o - | FileCheck %s + +func @main() -> (tensor<1x2xf16>, tensor<2xf16>) { + %0:2 = "_tf.Const"() {device = "", name = "foo", dtype = "tfdtype$DT_HALF", value = dense<1.0> : tensor<1x2xf16>} : () -> (tensor<1x2xf16>, !_tf.control) + %1:2 = "_tf.Const"() {device = "", name = "bar", dtype = "tfdtype$DT_HALF", value = dense<[1.0, 2.0]> : tensor<2xf16>} : () -> (tensor<2xf16>, !_tf.control) + return %0#0, %1#0 : tensor<1x2xf16>, tensor<2xf16> + +// CHECK: node { +// CHECK-NEXT: name: "foo" +// CHECK-NEXT: op: "Const" +// CHECK: half_val: 15360 +// CHECK: name: "bar" +// CHECK-NEXT: op: "Const" +// CHECK: half_val: 15360 +// CHECK: half_val: 16384 +} \ No newline at end of file diff --git a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/functional-if-ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/functional-if-ops.mlir new file mode 100644 index 00000000000..ccd058842a9 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/functional-if-ops.mlir @@ -0,0 +1,34 @@ +// RUN: tf-mlir-translate -mlir-to-graphdef %s -o - | FileCheck %s + +func @main(%arg0: tensor, %arg1: tensor) -> (tensor, tensor) { + %0 = "tf.Placeholder.input"(%arg0) : (tensor) -> tensor + %1 = "tf.Placeholder.input"(%arg1) : (tensor) -> tensor + %2 = "tf.Less"(%0, %1) : (tensor, tensor) -> tensor + %3 = "tf.If"(%2, %0, %1) {else_branch = @cond_false, then_branch = @cond_true, is_stateless = false} : (tensor, tensor, tensor) -> tensor loc("StatefulIf") + %4 = "tf.If"(%2, %0, %1) {else_branch = @cond_false, then_branch = @cond_true, is_stateless = true} : (tensor, tensor, tensor) -> tensor loc("StatelessIf") + return %3, %4 : tensor, tensor +} + +func @cond_true(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>) -> tensor<*xf32> { + %0 = "tf.Add"(%arg0, %arg1): (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> + return %0 : tensor<*xf32> +} + +func @cond_false(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>) -> tensor<*xf32> { + %0 = "tf.Mul"(%arg0, %arg1) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> + return %0 : tensor<*xf32> +} + +// Verify that If op is mapped to TensorFlow StatelessIf op if the is_stateless +// attribute is present and otherwise it is mapped to TensorFlow If op. 
In both +// cases, the additional attribute should be dropped. + +// CHECK: name: "StatefulIf" +// CHECK-NOT: name: +// CHECK: op: "If" +// CHECK-NOT: is_stateless + +// CHECK: name: "StatelessIf" +// CHECK-NOT: name: +// CHECK: op: "StatelessIf" +// CHECK-NOT: is_stateless diff --git a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/functional-while-ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/functional-while-ops.mlir new file mode 100644 index 00000000000..0009c7a4dc4 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/functional-while-ops.mlir @@ -0,0 +1,43 @@ +// RUN: tf-mlir-translate -mlir-to-graphdef %s -o - | FileCheck %s + +func @main(%arg0: tensor, %arg1: tensor) -> (tensor, tensor) { + %iter = "tf.Placeholder.input"(%arg0) : (tensor) -> tensor loc("iter") + %val = "tf.Placeholder.input"(%arg1) : (tensor) -> tensor loc("val") + + // Element wise add `val` with itself for `iter` number of times. + %2:2 = "tf.While"(%iter, %val) { + cond = @cond, body = @body, is_stateless = false + } : (tensor, tensor) -> (tensor, tensor) loc("StatefulWhile") + %3:2 = "tf.While"(%iter, %val) { + cond = @cond, body = @body, is_stateless = true + } : (tensor, tensor) -> (tensor, tensor) loc("StatelessWhile") + + return %2#1, %3#1 : tensor, tensor +} + +func @cond(%arg0: tensor<*xi32>, %arg1: tensor<*xf32>) -> tensor { + %0 = "tf.Const" () {value = dense<0> : tensor} : () -> tensor loc("Const") + %1 = "tf.Greater"(%arg0, %0) : (tensor<*xi32>, tensor) -> tensor + return %1 : tensor +} + +func @body(%arg0: tensor<*xi32>, %arg1: tensor<*xf32>) -> (tensor<*xi32>, tensor<*xf32>) { + %0 = "tf.Const" () {value = dense<1> : tensor} : () -> tensor loc("Const") + %1 = "tf.Sub"(%arg0, %0) : (tensor<*xi32>, tensor) -> tensor<*xi32> + %2 = "tf.Add"(%arg1, %arg1) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> + return %1, %2 : tensor<*xi32>, tensor<*xf32> +} + +// Verify that While op is mapped to TensorFlow StatelessWhile op if the +// is_stateless attribute is present and otherwise it is mapped to TensorFlow +// While op. In both cases, the additional attribute should be dropped. 
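+// As a rough sketch of the exported GraphDef (attributes abbreviated, only the
+// fields relevant to the CHECK lines below are shown):
+//   node { name: "StatefulWhile"  op: "While"          attr { key: "body" ... } attr { key: "cond" ... } }
+//   node { name: "StatelessWhile" op: "StatelessWhile" attr { key: "body" ... } attr { key: "cond" ... } }
+// Neither node is expected to carry an `is_stateless` attribute.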
+ +// CHECK: name: "StatefulWhile" +// CHECK-NOT: name: +// CHECK: op: "While" +// CHECK-NOT: is_stateless + +// CHECK: name: "StatelessWhile" +// CHECK-NOT: name: +// CHECK: op: "StatelessWhile" +// CHECK-NOT: is_stateless diff --git a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/missing-main.mlir b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/missing-main.mlir index 041be4b9fe0..f73e93369d5 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/missing-main.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/missing-main.mlir @@ -1,4 +1,4 @@ -// RUN: tf-mlir-translate -mlir-to-graphdef %s -o - 2>&1 | FileCheck %s; test ${PIPESTATUS[1]} -eq 0 +// RUN: tf-mlir-translate -mlir-to-graphdef %s -o - 2>&1 | FileCheck %s; test ${PIPESTATUS[0]} -ne 0 // CHECK: Graph export failed: Failed precondition: entry function `main` must be present diff --git a/tensorflow/compiler/mlir/tensorflow/tests/optimize.mlir b/tensorflow/compiler/mlir/tensorflow/tests/optimize.mlir new file mode 100644 index 00000000000..a3c2d18c671 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/optimize.mlir @@ -0,0 +1,35 @@ +// RUN: tf-opt -tf-optimize %s | FileCheck %s + +// CHECK-LABEL: convbiasaddmul +func @convbiasaddmul(%arg: tensor<256x32x32x3xf32>) -> tensor<256x30x30x16xf32> { + %filter = constant dense<2.0> : tensor<3x3x3x16xf32> + %bias = constant dense<3.0> : tensor<16xf32> + %value = constant dense<4.0> : tensor<16xf32> + %0 = "tf.Conv2D"(%arg, %filter) {T = "tfdtype$DT_FLOAT", data_format = "NHWC", dilations = [1, 2, 3, 1], padding = "SAME", strides = [1, 4, 5, 1]} : (tensor<256x32x32x3xf32>, tensor<3x3x3x16xf32>) -> tensor<256x30x30x16xf32> + %1 = "tf.BiasAdd"(%0, %bias) {T = "tfdtype$DT_FLOAT", data_format = "NHWC"}: (tensor<256x30x30x16xf32>, tensor<16xf32>) -> tensor<256x30x30x16xf32> + %2 = "tf.Mul"(%1, %value) {T = "tfdtype$DT_FLOAT"} : (tensor<256x30x30x16xf32>, tensor<16xf32>) -> tensor<256x30x30x16xf32> + return %2 : tensor<256x30x30x16xf32> + +// CHECK-NEXT: %[[cst:.*]] = constant dense<8.000000e+00> : tensor<3x3x3x16xf32> +// CHECK-NEXT: %[[cst_0:.*]] = constant dense<1.200000e+01> : tensor<16xf32> +// CHECK-NEXT: %[[conv:.*]] = "tf.Conv2D"(%arg0, %[[cst]]) +// CHECK-NEXT: %[[bias:.*]] = "tf.BiasAdd"(%[[conv]], %[[cst_0]]) +// CHECK-NEXT: return %[[bias]] : tensor<256x30x30x16xf32> +} + +// CHECK-LABEL: convaddv2mul +func @convaddv2mul(%arg: tensor<256x32x32x3xf32>) -> tensor<256x30x30x16xf32> { + %filter = constant dense<2.0> : tensor<3x3x3x16xf32> + %bias = constant dense<3.0> : tensor<16xf32> + %value = constant dense<4.0> : tensor<16xf32> + %0 = "tf.Conv2D"(%arg, %filter) {T = "tfdtype$DT_FLOAT", data_format = "NHWC", dilations = [1, 2, 3, 1], padding = "SAME", strides = [1, 4, 5, 1]} : (tensor<256x32x32x3xf32>, tensor<3x3x3x16xf32>) -> tensor<256x30x30x16xf32> + %1 = "tf.AddV2"(%0, %bias) {T = "tfdtype$DT_FLOAT"}: (tensor<256x30x30x16xf32>, tensor<16xf32>) -> tensor<256x30x30x16xf32> + %2 = "tf.Mul"(%1, %value) {T = "tfdtype$DT_FLOAT"} : (tensor<256x30x30x16xf32>, tensor<16xf32>) -> tensor<256x30x30x16xf32> + return %2 : tensor<256x30x30x16xf32> + +// CHECK-NEXT: %[[cst:.*]] = constant dense<8.000000e+00> : tensor<3x3x3x16xf32> +// CHECK-NEXT: %[[cst_0:.*]] = constant dense<1.200000e+01> : tensor<16xf32> +// CHECK-NEXT: %[[conv:.*]] = "tf.Conv2D"(%arg0, %[[cst]]) +// CHECK-NEXT: %[[add:.*]] = "tf.AddV2"(%[[conv]], %[[cst_0]]) +// CHECK-NEXT: return %[[add]] : tensor<256x30x30x16xf32> +} \ No newline at end of file diff --git 
a/tensorflow/compiler/mlir/tensorflow/tests/roundtrip-tf-control.mlir b/tensorflow/compiler/mlir/tensorflow/tests/roundtrip-tf-control.mlir new file mode 100644 index 00000000000..271b6ec92f9 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/roundtrip-tf-control.mlir @@ -0,0 +1,12 @@ +// RUN: tf-opt %s --run-tf-graph-optimization --graph-passes=MlirRoundtripPass | FileCheck %s --dump-input-on-failure + +// The test uses the tf_graph_optimization_pass to run the MlirRoundtripPass. +// We convert mlir -> Graph -> mlir -> Graph -> mlir + +func @main() { + "_tf.NoOp"() {} : () -> () loc("X") + return +} + +// Check for the presence of tf.NoOp in the final output. +// CHECK: tf.NoOp \ No newline at end of file diff --git a/tensorflow/compiler/mlir/tensorflow/tests/roundtrip-tf-executor.mlir b/tensorflow/compiler/mlir/tensorflow/tests/roundtrip-tf-executor.mlir new file mode 100644 index 00000000000..6b245236d35 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/roundtrip-tf-executor.mlir @@ -0,0 +1,19 @@ +// RUN: tf-opt %s --run-tf-graph-optimization --graph-passes=MlirRoundtripPass | FileCheck %s --dump-input-on-failure + +module { + func @main() { + tf_executor.graph { + %0 = tf_executor.island { + "tf.NoOp"() {} : () -> () loc("X") + tf_executor.yield + } + tf_executor.fetch + } + return + } +} + +// The test uses the tf_graph_optimization_pass to run the MlirRoundtripPass. +// We convert mlir -> Graph -> mlir -> Graph -> mlir +// Check for the presence of tf.NoOp in the final output. +// CHECK: tf.NoOp diff --git a/tensorflow/compiler/mlir/tensorflow/tests/savedmodel2mlir/BUILD b/tensorflow/compiler/mlir/tensorflow/tests/savedmodel2mlir/BUILD new file mode 100644 index 00000000000..ff3b70a22c9 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/savedmodel2mlir/BUILD @@ -0,0 +1,19 @@ +load("//tensorflow:tensorflow.bzl", "tf_cc_test") + +package(licenses = ["notice"]) + +tf_cc_test( + name = "half_plus_two", + srcs = ["half_plus_two.cc"], + data = [ + "//tensorflow/cc/saved_model:saved_model_half_plus_two", + ], + deps = [ + "//tensorflow/cc/saved_model:tag_constants", + "//tensorflow/compiler/mlir/tensorflow:translate_lib", + "//tensorflow/core:lib", + "//tensorflow/core:tensorflow", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/savedmodel2mlir/half_plus_two.cc b/tensorflow/compiler/mlir/tensorflow/tests/savedmodel2mlir/half_plus_two.cc new file mode 100644 index 00000000000..b18e6c0b188 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/savedmodel2mlir/half_plus_two.cc @@ -0,0 +1,41 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include + +#include "tensorflow/cc/saved_model/tag_constants.h" +#include "tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h" +#include "tensorflow/core/lib/io/path.h" +#include "tensorflow/core/platform/test.h" + +// TODO(silvasean): Add a FileCheck based testing harness for SavedModel to +// replace the following. The source should be TensorFlow Python code. Then we +// can generate SavedModel directories on the fly and import them. Check +// directives can be embedded into the same file as the source. +TEST(SavedModel, HalfPlusTwo) { + const char kSavedModel[] = "cc/saved_model/testdata/half_plus_two/00000123"; + const std::string saved_model_dir = tensorflow::io::JoinPath( + tensorflow::testing::TensorFlowSrcRoot(), kSavedModel); + std::unordered_set tags{tensorflow::kSavedModelTagServe}; + + mlir::MLIRContext context; + auto module = tensorflow::SavedModelToMlirImport( + saved_model_dir, tags, /*debug_info_file=*/"", &context); + auto* block = module->getBody(); + + // testdata/half_plus_two does not use any functions. So we only have the + // mandatory module terminator op inside its block. + EXPECT_TRUE(std::next(block->begin()) == block->end()); +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir index 3b21c528c90..dd6d77f7816 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir @@ -1,4 +1,4 @@ -// RUN: tf-opt %s -split-input-file -verify-diagnostics | FileCheck %s +// RUN: tf-opt %s -split-input-file -verify-diagnostics | FileCheck %s --dump-input=fail //===--------------------------------------------------------------------===// // Test TF opaque attributes @@ -65,6 +65,32 @@ func @testTFComplex(tensor<*x!tf.complex64>, tensor<*x!tf.complex128>) -> (!tf.c // ----- +// CHECK-LABEL: func @testIdentity +func @testIdentity(%arg0: tensor<4x2x!tf.stringref>) -> tensor<4x2x!tf.string> { + // CHECK: tf.Identity + %0 = "tf.Identity"(%arg0) : (tensor<4x2x!tf.stringref>) -> tensor<4x2x!tf.string> + return %0 : tensor<4x2x!tf.string> +} + +// ----- + +// CHECK-LABEL: func @testBitcast +func @testBitcast(%arg0: tensor<3x4x!tf.uint16>) -> tensor<3x4x!tf.quint16> { + // CHECK: tf.Bitcast + %0 = "tf.Bitcast"(%arg0) : (tensor<3x4x!tf.uint16>) -> tensor<3x4x!tf.quint16> + return %0 : tensor<3x4x!tf.quint16> +} + +// ----- + +func @testIdentityWrongType(%arg0: tensor<4x2x!tf.string>) -> tensor<4x2x!tf.stringref> { + // expected-error @+1 {{requires all operands to be either same as or ref type of results}} + %0 = "tf.Identity"(%arg0) : (tensor<4x2x!tf.string>) -> tensor<4x2x!tf.stringref> + return %0 : tensor<4x2x!tf.stringref> +} + +// ----- + // TODO(hinsu): Move this to MLIR core once the test dialect have a custom type. 
// Check that broadcastable trait accepts TF specific element type @@ -133,9 +159,18 @@ func @testLeakyWrongAlphaType(tensor<16xf32>) -> tensor<16xf32> { } // ----- -// CHECK-LABEL: func @testReshape(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>, %arg2: tensor<10000xf32>) -func @testReshape(tensor<*xf32>, tensor<*xf32>, tensor<10000xf32>) -> (tensor<100x100xf32>, tensor<*xf32>, tensor<10000xf32>, tensor<100x100xf32>) { -^bb0(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>, %arg2: tensor<10000xf32>): + +// CHECK-LABEL: func @testMul +func @testMul(%arg0: tensor<2x!tf.uint16>) -> (tensor<2x!tf.uint16>) { + // CHECK: tf.Mul + %0 = "tf.Mul"(%arg0, %arg0) {T = "tfdtype$DT_UINT16", device = "/device:CPU:0", name = "Mul"} : (tensor<2x!tf.uint16>, tensor<2x!tf.uint16>) -> tensor<2x!tf.uint16> + return %0 : tensor<2x!tf.uint16> +} + +// ----- + +// CHECK-LABEL: func @testReshape(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>, %arg2: tensor<10000xf32>, %arg3: tensor<*xi32>) +func @testReshape(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>, %arg2: tensor<10000xf32>, %arg3: tensor<*xi32>) -> (tensor<100x100xf32>, tensor<*xf32>, tensor<10000xf32>, tensor<100x100xf32>, tensor<*xf32>, tensor<*xf32>) { // CHECK: %cst = constant dense<100> : tensor<2xi32> %shape1 = constant dense<100> : tensor<2xi32> // CHECK: %0 = "tf.Reshape"(%arg0, %cst) : (tensor<*xf32>, tensor<2xi32>) -> tensor<100x100xf32> @@ -150,7 +185,11 @@ func @testReshape(tensor<*xf32>, tensor<*xf32>, tensor<10000xf32>) -> (tensor<10 %shape3 = constant dense<[-1, 100]> : tensor<2xi32> // CHECK: %4 = "tf.Reshape"(%arg2, %cst_0) {T = "tfdtype$DT_FLOAT", Tshape = "tfdtype$DT_INT32", device = "", name = "Reshape_1"} : (tensor<10000xf32>, tensor<2xi32>) -> tensor<100x100xf32> %r4 = "tf.Reshape"(%arg2, %shape3) {device = "", name = "Reshape_1", T = "tfdtype$DT_FLOAT", Tshape = "tfdtype$DT_INT32"} : (tensor<10000xf32>, tensor<2xi32>) -> (tensor<100x100xf32>) - return %r1, %r2, %r3, %r4: tensor<100x100xf32>, tensor<*xf32>, tensor<10000xf32>, tensor<100x100xf32> + // CHECK: "tf.Reshape"(%arg0, %arg3) + %r5 = "tf.Reshape"(%arg0, %arg3) {T = "tfdtype$DT_FLOAT", Tshape = "tfdtype$DT_INT32"} : (tensor<*xf32>, tensor<*xi32>) -> (tensor<*xf32>) + // CHECK: "tf.Reshape"(%arg2, %arg3) + %r6 = "tf.Reshape"(%arg2, %arg3) {T = "tfdtype$DT_FLOAT", Tshape = "tfdtype$DT_INT32"} : (tensor<10000xf32>, tensor<*xi32>) -> (tensor<*xf32>) + return %r1, %r2, %r3, %r4, %r5, %r6: tensor<100x100xf32>, tensor<*xf32>, tensor<10000xf32>, tensor<100x100xf32>, tensor<*xf32>, tensor<*xf32> } // ----- @@ -190,6 +229,14 @@ func @testReshape(%arg0: tensor<10x10x10xf32>) -> tensor<100x100xf32> { return %r1 : tensor<100x100xf32> } +// ----- +// tf.Reshape with a first operand that has non-static shape. 
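+// With a dynamic dimension in the operand, the total element count is unknown
+// at verification time, so the verifier is expected to accept the op instead
+// of comparing element counts against the static result shape.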
+func @testReshape(%arg0: tensor<10x10x?xf32>) -> tensor<10x10xf32> { + %shape1 = constant dense<[10, 10]> : tensor<2xi32> + %r1 = "tf.Reshape" (%arg0, %shape1) : (tensor<10x10x?xf32>, tensor<2xi32>) -> (tensor<10x10xf32>) + return %r1 : tensor<10x10xf32> +} + // ----- // CHECK-LABEL: func @testValidAvgPool @@ -478,7 +525,7 @@ func @testIfElse(tensor<*xf32>) -> tensor<*xf32> func @testValidIfOp(tensor, tensor<2xf32>) -> tensor<2xf32> { ^bb0(%arg0: tensor, %arg1: tensor<2xf32>): %1 = "tf.If"(%arg0, %arg1) { - then_branch = @testIfThen, else_branch = @testIfElse + then_branch = @testIfThen, else_branch = @testIfElse, is_stateless = false } : (tensor, tensor<2xf32>) -> tensor<2xf32> return %1 : tensor<2xf32> @@ -492,10 +539,11 @@ func @testIfElse(f32) -> f32 // Test invalid tf.If operation func @testInvalidIfOp(tensor, f32) -> f32 { ^bb0(%arg0: tensor, %arg1: f32): - // expected-error @+1 {{requires operands to have a valid TensorFlow tensor type}} + // expected-error @+1 {{operand #1 must be tensor of tf.dtype values}} %1 = "tf.If"(%arg0, %arg1) { then_branch = @testIfThen, - else_branch = @testIfElse + else_branch = @testIfElse, + is_stateless = false } : (tensor, f32) -> f32 return %1 : f32 @@ -508,9 +556,9 @@ func @testIfElse(tensor<2xf32>) -> tensor<2xf32> // Test invalid tf.If operation func @testInvalidIfOp(tensor, tensor<2xf32>) -> tensor<2xf32> { ^bb0(%arg0: tensor, %arg1: tensor<2xf32>): - // expected-error @+1 {{requires then_branch attribute}} + // expected-error @+1 {{requires attribute 'then_branch'}} %1 = "tf.If"(%arg0, %arg1) { - else_branch = @testIfElse + else_branch = @testIfElse, is_stateless = false } : (tensor, tensor<2xf32>) -> tensor<2xf32> return %1 : tensor<2xf32> @@ -527,7 +575,8 @@ func @testInvalidIfOp(tensor, tensor<2xf32>) -> tensor<2xf32> { // expected-error @+1 {{branches should have 1 inputs}} %1 = "tf.If"(%arg0, %arg1) { then_branch = @testIfThen, - else_branch = @testIfElse + else_branch = @testIfElse, + is_stateless = false } : (tensor, tensor<2xf32>) -> tensor<2xf32> return %1 : tensor<2xf32> @@ -544,7 +593,8 @@ func @testInvalidIfOp(tensor, tensor<2xf32>) -> tensor<2xf32> { // expected-error @+1 {{then branch input type tensor<*xf16> is incompatible with operand type tensor<2xf32>}} %1 = "tf.If"(%arg0, %arg1) { then_branch = @testIfThen, - else_branch = @testIfElse + else_branch = @testIfElse, + is_stateless = false } : (tensor, tensor<2xf32>) -> tensor<2xf32> return %1 : tensor<2xf32> @@ -561,7 +611,8 @@ func @testInvalidIfOp(tensor, tensor<*xf32>) -> tensor<2xf32> { // expected-error @+1 {{branches inputs have incompatible types tensor<2xf32> and tensor<3xf32>}} %1 = "tf.If"(%arg0, %arg1) { then_branch = @testIfThen, - else_branch = @testIfElse + else_branch = @testIfElse, + is_stateless = false } : (tensor, tensor<*xf32>) -> tensor<2xf32> return %1 : tensor<2xf32> @@ -578,7 +629,8 @@ func @testInvalidIfOp(tensor, tensor<*xf32>) -> tensor<2xf32> { // expected-error @+1 {{else branch result type tensor<3xf32> is incompatible with op result type tensor<2xf32>}} %1 = "tf.If"(%arg0, %arg1) { then_branch = @testIfThen, - else_branch = @testIfElse + else_branch = @testIfElse, + is_stateless = false } : (tensor, tensor<*xf32>) -> tensor<2xf32> return %1 : tensor<2xf32> @@ -615,12 +667,31 @@ func @testWhileResult(tensor<*xf32>) -> (tensor<*xf32>) { ^bb0(%arg0: tensor<*xf32>): %1 = "tf.While"(%arg0) { cond = @testWhileCond, - body = @testWhileBody + body = @testWhileBody, + is_stateless = false } : (tensor<*xf32>) -> (tensor<*xf32>) return %1 : tensor<*xf32> } 
+// ----- +func @testWhileUndefinedCond(%arg0: tensor, %arg1: tensor) -> tensor { + // expected-error @+1 {{cond refers to an undefined function : undefined_func}} + %0 = "tf.While"(%arg0, %arg1) {cond = @undefined_func, body = @body, is_stateless = false} : (tensor, tensor) -> (tensor) + return %0 : tensor +} + +func @body(%arg0: tensor, %arg1: tensor) -> tensor + +// ----- +func @testWhileUndefinedBody(%arg0: tensor, %arg1: tensor) -> tensor { + // expected-error @+1 {{body refers to an undefined function : undefined_func}} + %0 = "tf.While"(%arg0, %arg1) {cond = @cond, body = @undefined_func, is_stateless = false} : (tensor, tensor) -> (tensor) + return %0 : tensor +} + +func @cond(%arg0: tensor, %arg1: tensor) -> tensor + // ----- func @testWhileCond(tensor<*xf32>) -> () @@ -632,7 +703,8 @@ func @testWhileResult(tensor<*xf32>) -> (tensor<*xf32>) { // expected-error @+1 {{requires cond function to have exactly one result}} %1 = "tf.While"(%arg0) { cond = @testWhileCond, - body = @testWhileBody + body = @testWhileBody, + is_stateless = false } : (tensor<*xf32>) -> (tensor<*xf32>) return %1 : tensor<*xf32> @@ -649,7 +721,8 @@ func @testWhileResult(tensor<*xf32>) -> (tensor<*xi32>) { // expected-error @+1 {{operand type tensor<*xf32> is incompatible with result type}} %1 = "tf.While"(%arg0) { cond = @testWhileCond, - body = @testWhileBody + body = @testWhileBody, + is_stateless = false } : (tensor<*xf32>) -> (tensor<*xi32>) return %1 : tensor<*xi32> @@ -666,7 +739,8 @@ func @testWhileResult(tensor<*xf32>) -> (tensor<*xf32>) { // expected-error @+1 {{operand type tensor<*xf32> is incompatible with cond function input type}} %1 = "tf.While"(%arg0) { cond = @testWhileCond, - body = @testWhileBody + body = @testWhileBody, + is_stateless = false } : (tensor<*xf32>) -> (tensor<*xf32>) return %1 : tensor<*xf32> @@ -683,7 +757,8 @@ func @testWhileResult(tensor<*xf32>) -> (tensor<*xf32>) { // expected-error @+1 {{requires the number of operands to be equal to the number of body function inputs. 
Found 1 and 2, respectively}} %1 = "tf.While"(%arg0) { cond = @testWhileCond, - body = @testWhileBody + body = @testWhileBody, + is_stateless = false } : (tensor<*xf32>) -> (tensor<*xf32>) return %1 : tensor<*xf32> @@ -700,7 +775,8 @@ func @testWhileResult(tensor<*xf32>) -> (tensor<*xf32>) { // expected-error @+1 {{body function result type tensor<*xi32> is incompatible with result type}} %1 = "tf.While"(%arg0) { cond = @testWhileCond, - body = @testWhileBody + body = @testWhileBody, + is_stateless = false } : (tensor<*xf32>) -> (tensor<*xf32>) return %1 : tensor<*xf32> @@ -717,7 +793,8 @@ func @testWhileResult(tensor<*xf32>) -> (tensor<*xf32>) { // expected-error @+1 {{cond function input type tensor<3xf32> is incompatible with body function input type}} %1 = "tf.While"(%arg0) { cond = @testWhileCond, - body = @testWhileBody + body = @testWhileBody, + is_stateless = false } : (tensor<*xf32>) -> (tensor<*xf32>) return %1 : tensor<*xf32> @@ -747,7 +824,7 @@ func @testShapeWrongResultElemType(%arg0: tensor<1x32x32x16xf32>) -> tensor<4xf3 func @testShapeWrongResultDim(tensor<1x32x32x16xf32>) -> tensor<*xi32> { ^bb0(%arg0: tensor<1x32x32x16xf32>): - // expected-error @+1 {{requires 1D result type}} + // expected-error @+1 {{requires 1D type for result}} %0 = "tf.Shape"(%arg0) {T = "tfdtype$DT_FLOAT", output = "tfdtype$DT_INT32"} : (tensor<1x32x32x16xf32>) -> tensor<*xi32> return %0 : tensor<*xi32> } @@ -763,15 +840,77 @@ func @testShapeMismatchDim(tensor<1x32x32x16xf32>) -> tensor<2xi32> { // ----- -func @testShapeWrongResultDim(tensor<*xf32>) -> tensor<2xi32> { +func @testShapeWrongResultDimDynamic(tensor<*xf32>) -> tensor<2xi32> { ^bb0(%arg0: tensor<*xf32>): - // expected-error @+1 {{requires dynamic shape result for unranked input}} + // expected-error @+1 {{requires dynamic shape result for unranked operand}} %0 = "tf.Shape"(%arg0) {T = "tfdtype$DT_FLOAT", output = "tfdtype$DT_INT32"} : (tensor<*xf32>) -> tensor<2xi32> return %0 : tensor<2xi32> } // ----- +// CHECK-LABEL: func @testValidShapeN +func @testValidShapeN(%arg0 : tensor<1x32x32x16xf32>, %arg1 : tensor<*xf32>) -> (tensor<4xi32>, tensor) { + // CHECK-NEXT: "tf.ShapeN" + %0:2 = "tf.ShapeN"(%arg0, %arg1) {N = 2 : i64} : (tensor<1x32x32x16xf32>, tensor<*xf32>) -> (tensor<4xi32>, tensor) + return %0#0, %0#1 : tensor<4xi32>, tensor +} + +// ----- + +func @testShapeNWrongResultElemType(%arg0: tensor<1x32x32x16xf32>) -> tensor<4xf32> { + // expected-error @+1 {{result #1 must be tensor of 32/64-bit integer values}} + %0:2 = "tf.ShapeN"(%arg0, %arg0) {N = 2 : i64} : (tensor<1x32x32x16xf32>, tensor<1x32x32x16xf32>) -> (tensor<4xi32>, tensor<4xf32>) + return %0#1 : tensor<4xf32> +} + +// ----- + +func @testShapeNWrongResultDim(tensor<1x32x32x16xf32>) -> tensor<*xi32> { +^bb0(%arg0: tensor<1x32x32x16xf32>): + // expected-error @+1 {{requires 1D type for result #1}} + %0:2 = "tf.ShapeN"(%arg0, %arg0) {N = 2 : i64} : (tensor<1x32x32x16xf32>, tensor<1x32x32x16xf32>) -> (tensor<4xi32>, tensor<*xi32>) + return %0#1 : tensor<*xi32> +} + +// ----- + +func @testShapeNMismatchDim(tensor<1x32x32x16xf32>) -> tensor<2xi32> { +^bb0(%arg0: tensor<1x32x32x16xf32>): + // expected-error @+1 {{requires dimension size of result #1 to match rank of operand #1}} + %0:2 = "tf.ShapeN"(%arg0, %arg0) {N = 2 : i64} : (tensor<1x32x32x16xf32>, tensor<1x32x32x16xf32>) -> (tensor<4xi32>, tensor<2xi32>) + return %0#1 : tensor<2xi32> +} + +// ----- + +func @testShapeNWrongResultDimDynamic(tensor<*xf32>) -> tensor<2xi32> { +^bb0(%arg0: tensor<*xf32>): + // expected-error 
@+1 {{requires dynamic shape result #1 for unranked operand #1}} + %0:2 = "tf.ShapeN"(%arg0, %arg0) {N = 2 : i64} : (tensor<*xf32>, tensor<*xf32>) -> (tensor, tensor<2xi32>) + return %0#1 : tensor<2xi32> +} + +// ----- + +func @testShapeNWrongNumOperands(tensor<*xf32>) { +^bb0(%arg0: tensor<*xf32>): + // expected-error @+1 {{requires 3 operand(s), got 2 operand(s)}} + %0:3 = "tf.ShapeN"(%arg0, %arg0) {N = 3 : i64} : (tensor<*xf32>, tensor<*xf32>) -> (tensor, tensor, tensor) + return +} + +// ----- + +func @testShapeNWrongNumResults(tensor<*xf32>) { +^bb0(%arg0: tensor<*xf32>): + // expected-error @+1 {{requires 3 result(s), got 2 result(s)}} + %0:2 = "tf.ShapeN"(%arg0, %arg0, %arg0) {N = 3 : i64} : (tensor<*xf32>, tensor<*xf32>, tensor<*xf32>) -> (tensor, tensor) + return +} + +// ----- + // Test invalid tf.Const func @testConst() -> tensor { // expected-error @+1 {{attribute 'value' failed to satisfy constraint: constant vector/tensor}} @@ -837,3 +976,4 @@ func @testConcatV2(%arg: tensor<8x16xf32>, %axis: tensor<1xi32>) -> tensor, tensor<1xi32>) -> tensor return %0 : tensor } + diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_executor_ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf_executor_ops.mlir index 510aaccb26a..2890656c013 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_executor_ops.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_executor_ops.mlir @@ -68,6 +68,30 @@ func @simpleIsland_with_attributes(%arg0: tensor<*xf32>) -> tensor<*xf32> { return %0 : tensor<*xf32> } +// CHECK-LABEL: func @simpleIsland_with_multiple_control_inputs(%arg0: tensor<*xf32>) +func @simpleIsland_with_multiple_control_inputs(%arg0: tensor<*xf32>) -> tensor<*xf32> { + %0 = tf_executor.graph { + %1 = tf_executor.island { + tf_executor.yield + } + %2 = tf_executor.island { + tf_executor.yield + } + %3:2 = tf_executor.island(%1, %2) { + tf_executor.yield %arg0 : tensor<*xf32> + } + tf_executor.fetch %3#0 : tensor<*xf32> + } +// CHECK: %[[ISLAND0:[0-9]*]] = tf_executor.island { +// CHECK-NEXT: tf_executor.yield +// CHECK: %[[ISLAND1:[0-9]*]] = tf_executor.island { +// CHECK-NEXT: tf_executor.yield +// CHECK: %[[ISLAND2:[0-9]*]]:2 = tf_executor.island(%[[ISLAND0]], %[[ISLAND1]]) { +// CHECK: tf_executor.fetch %[[ISLAND2]]#0 : tensor<*xf32> + + return %0 : tensor<*xf32> +} + // CHECK-LABEL: func @fetchWithControlDep(%arg0: tensor<*xf32>) func @fetchWithControlDep(%arg0: tensor<*xf32>) -> tensor<*xf32> { %result = tf_executor.graph { @@ -153,8 +177,8 @@ func @switch_with_attributes(%arg0: tensor<*xf32>, %arg1: tensor) -> tensor< return %result : tensor<*xf32> } -// CHECK-LABEL: func @switchN(%arg0: i32, %arg1: tensor<*xf32>) -> tensor<*xf32> { -func @switchN(%arg0: i32, %arg1: tensor<*xf32>) -> tensor<*xf32> { +// CHECK-LABEL: func @switchN( +func @switchN(%arg0: tensor, %arg1: tensor<*xf32>) -> tensor<*xf32> { %fetches = tf_executor.graph { // CHECK: %1:6 = tf_executor.SwitchN %arg1, %arg0 of 5 : tensor<*xf32> @@ -210,6 +234,43 @@ func @switch_merge_with_attributes(%arg0: tensor<*xf32>, %arg1: tensor) -> t return %result : tensor<*xf32> } +// Verify that long form printing is used when operand types do not match the +// result type and then it can be parsed again correctly. 
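+// For reference, the short form prints as, e.g.:
+//   %value, %idx, %ctl = tf_executor.Merge %a, %b : tensor<4xf32>
+// and is only used when there are at least two data operands that all share
+// the single result type; the long (functional) form below spells out every
+// operand and result type instead.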
+// CHECK-LABEL: func @merge_different_operand_types +func @merge_different_operand_types(%arg0: tensor<*xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { + %result = tf_executor.graph { + +// CHECK: tf_executor.Merge{{.*}}(tensor<*xf32>, tensor<4xf32>) -> (tensor<4xf32>, tensor, !tf_executor.control) + %value, %idx, %ctlMerge = tf_executor.Merge %arg0, %arg1 : (tensor<*xf32>, tensor<4xf32>) -> (tensor<4xf32>, tensor, !tf_executor.control) + tf_executor.fetch %value : tensor<4xf32> + } + return %result : tensor<4xf32> +} + +// Verify that long form printing is used when there is only one data operand +// and then it can be parsed again correctly. +// CHECK-LABEL: func @merge_one_data_operand +func @merge_one_data_operand(%arg0: tensor<*xf32>) -> tensor<*xf32> { + %result = tf_executor.graph { + +// CHECK: tf_executor.Merge{{.*}}(tensor<*xf32>) -> (tensor<*xf32>, tensor, !tf_executor.control) + %value, %idx, %ctlMerge = tf_executor.Merge %arg0 : (tensor<*xf32>) -> (tensor<*xf32>, tensor, !tf_executor.control) + tf_executor.fetch %value : tensor<*xf32> + } + return %result : tensor<*xf32> +} + +// CHECK-LABEL: func @merge_with_variant_type +func @merge_with_variant_type(%arg0: tensor, %arg1: tensor>>) -> tensor>> { + %result = tf_executor.graph { + +// CHECK: tf_executor.Merge{{.*}}(tensor, tensor>>) -> (tensor>>, tensor, !tf_executor.control) + %value, %idx, %ctlMerge = "tf_executor.Merge"(%arg0, %arg1) : (tensor, tensor>>) -> (tensor>>, tensor, !tf_executor.control) + tf_executor.fetch %value : tensor>> + } + return %result : tensor>> +} + // CHECK-LABEL: func @enter(%arg0: tensor<*xf32>, %arg1: i1) -> tensor<*xf32> { func @enter(%arg0: tensor<*xf32>, %arg1: i1) -> tensor<*xf32> { %result = tf_executor.graph { diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_executor_ops_invalid.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf_executor_ops_invalid.mlir index 366cd825f65..ee3d2b91732 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_executor_ops_invalid.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_executor_ops_invalid.mlir @@ -27,7 +27,7 @@ func @empty_graph() { // Check that an empty graph is invalid (it needs a region). func @empty_graph() { "tf_executor.graph" () ({ -// expected-error@-1 {{'tf_executor.graph' op expects a non-empty body}} +// expected-error@-1 {{'tf_executor.graph' op expects a non-empty block}} ^entry: }) : () -> () return @@ -47,6 +47,17 @@ func @graph_with_invalid_op(%arg0: tensor<*xf32>) -> tensor<*xf32> { // ----- +// Check that tf_executor.graph can't be nested directly in a tf_executor.graph. +func @nested_graph() { + tf_executor.graph { + tf_executor.graph {} +// expected-error@-1 {{'tf_executor.graph' op unallowed directly inside another tf_executor.graph}} + } + return +} + +// ----- + // Check that a tf_executor.fetch is terminating a tf_executor.graph (custom parser) func @graph_with_invalid_terminator(%arg0: tensor<*xf32>) -> tensor<*xf32> { tf_executor.graph { @@ -58,11 +69,23 @@ func @graph_with_invalid_terminator(%arg0: tensor<*xf32>) -> tensor<*xf32> { // ----- +// Check that a tf_executor.fetch parent is a graph. 
+func @parent_is_graph() { + "some.op"() ({ + tf_executor.fetch +// expected-error@-1 {{'tf_executor.fetch' op expects parent op 'tf_executor.graph'}} + }) : () -> () + return +} + +// ----- + // Check that a tf_executor.fetch is terminating a tf_executor.graph (verifier) func @graph_with_invalid_terminator(%arg0: tensor<*xf32>) -> tensor<*xf32> { +// expected-error@+2 {{'tf_executor.graph' op expects regions to end with 'tf_executor.fetch', found 'tf_executor.yield'}} +// expected-note@+1 {{in custom textual format, the absence of terminator implies 'tf_executor.fetch'}} "tf_executor.graph" () ({ tf_executor.yield -// expected-error@-1 {{'tf_executor.yield' op invalid tf_executor.graph terminator, fetch expected}} }) : () -> () return %arg0 : tensor<*xf32> } @@ -149,6 +172,17 @@ func @invalid_fetch(%arg0: tensor<*xf32>, %ctl: !tf_executor.control) -> tensor< // ----- +// Check that a tf_executor.island parent is a graph. +func @parent_is_graph() { + "some.op"() ({ + %ctl = tf_executor.island {} +// expected-error@-1 {{'tf_executor.island' op expects parent op 'tf_executor.graph'}} + }) : () -> () + return +} + +// ----- + // Check that an island can't have other operands than controls. func @invalid_island(%arg0: tensor<*xf32>, %ctl: !tf_executor.control) { tf_executor.graph { @@ -189,7 +223,7 @@ func @invalid_island(%arg0: tensor<*xf32>, %ctl: !tf_executor.control) { func @invalid_island(%arg0: tensor<*xf32>, %ctl: !tf_executor.control) { tf_executor.graph { "tf_executor.island"() ({ -// expected-error@-1 {{'tf_executor.island' op expects a non-empty body}} +// expected-error@-1 {{'tf_executor.island' op expects a non-empty block}} ^entry: }) : () -> (!tf_executor.control) } @@ -202,8 +236,9 @@ func @invalid_island(%arg0: tensor<*xf32>, %ctl: !tf_executor.control) { func @invalid_island(%arg0: tensor<*xf32>, %ctl: !tf_executor.control) { tf_executor.graph { "tf_executor.island"() ({ +// expected-error@-1 {{'tf_executor.island' op expects regions to end with 'tf_executor.yield', found 'std.return'}} +// expected-note@-2 {{in custom textual format, the absence of terminator implies 'tf_executor.yield'}} return -// expected-error@-1 {{'std.return' op invalid tf_executor.island terminator, yield expected}} }) : () -> (!tf_executor.control) } return @@ -211,6 +246,17 @@ func @invalid_island(%arg0: tensor<*xf32>, %ctl: !tf_executor.control) { // ----- +// Check that a tf_executor.yield parent is a tf_executor.island. +func @parent_is_island() { + "some.op"() ({ + tf_executor.yield +// expected-error@-1 {{'tf_executor.yield' op expects parent op 'tf_executor.island'}} + }) : () -> () + return +} + +// ----- + // Check that an island yield matches the island results. func @invalid_island(%arg0: tensor<*xf32>, %ctl: !tf_executor.control) { tf_executor.graph { @@ -276,6 +322,17 @@ func @invalid_yield(%arg0: tensor<*xf32>, %ctl: !tf_executor.control) { // ----- +// Check that a tf_executor.Switch parent is a graph. +func @parent_is_graph(%arg0: tensor<*xf32>, %arg1: tensor) { + "some.op"() ({ + %true, %false, %ctlSwitch = tf_executor.Switch %arg0, %arg1 : tensor<*xf32> +// expected-error@-1 {{'tf_executor.Switch' op expects parent op 'tf_executor.graph'}} + }) : () -> () + return +} + +// ----- + // Check that a switch always takes two arguments. func @invalid_switch(%arg0: tensor<*xf32>) { tf_executor.graph { @@ -335,11 +392,22 @@ func @invalid_switch(%arg0: tensor<*xf32>, %arg1: tensor) -> tensor<*xf32> { // ----- +// Check that a tf_executor.SwitchN parent is a graph. 
+func @parent_is_graph(%arg0: tensor<*xf32>, %arg1: tensor) { + "some.op"() ({ + %1:6 = tf_executor.SwitchN %arg0, %arg1 of 5 : tensor<*xf32> +// expected-error@-1 {{'tf_executor.SwitchN' op expects parent op 'tf_executor.graph'}} + }) : () -> () + return +} + +// ----- + // Check that switchN result numbers matches the num_out attribute. -func @invalid_switchN(%arg0: i32, %arg1: tensor<*xf32>) -> tensor<*xf32> { +func @invalid_switchN(%arg0: tensor, %arg1: tensor<*xf32>) -> tensor<*xf32> { %fetches = tf_executor.graph { - %1:3 = "tf_executor.SwitchN"(%arg1, %arg0) {num_outs = 5} : (tensor<*xf32>, i32) -> (tensor<*xf32>, tensor<*xf32>, !tf_executor.control) + %1:3 = "tf_executor.SwitchN"(%arg1, %arg0) {num_outs = 5} : (tensor<*xf32>, tensor) -> (tensor<*xf32>, tensor<*xf32>, !tf_executor.control) // expected-error@-1 {{'tf_executor.SwitchN' op expect `num_outs` (5) results but got 2}} tf_executor.fetch %1#0 : tensor<*xf32> @@ -350,10 +418,10 @@ func @invalid_switchN(%arg0: i32, %arg1: tensor<*xf32>) -> tensor<*xf32> { // ----- // Check that switchN result type matches the input type. -func @invalid_switchN(%arg0: i32, %arg1: tensor<*xf32>) -> tensor<*xf32> { +func @invalid_switchN(%arg0: tensor, %arg1: tensor<*xf32>) -> tensor<*xf32> { %fetches = tf_executor.graph { - %1:3 = "tf_executor.SwitchN"(%arg1, %arg0) {num_outs = 2} : (tensor<*xf32>, i32) -> (tensor<*xf32>, i32, !tf_executor.control) + %1:3 = "tf_executor.SwitchN"(%arg1, %arg0) {num_outs = 2} : (tensor<*xf32>, tensor) -> (tensor<*xf32>, i32, !tf_executor.control) // expected-error@-1 {{'tf_executor.SwitchN' op type mismatch between data operand and result: 'tensor<*xf32>' vs 'i32'}} tf_executor.fetch %1#0 : tensor<*xf32> @@ -364,7 +432,7 @@ func @invalid_switchN(%arg0: i32, %arg1: tensor<*xf32>) -> tensor<*xf32> { // ----- // Check that switchN custom type has a single entry. -func @invalid_switchN(%arg0: i32, %arg1: tensor<*xf32>) -> tensor<*xf32> { +func @invalid_switchN(%arg0: tensor, %arg1: tensor<*xf32>) -> tensor<*xf32> { %fetches = tf_executor.graph { %1:3 = tf_executor.SwitchN %arg1, %arg0 of 2 : tensor<*xf32>, i32 @@ -377,6 +445,17 @@ func @invalid_switchN(%arg0: i32, %arg1: tensor<*xf32>) -> tensor<*xf32> { // ----- +// Check that a tf_executor.Merge parent is a graph. +func @parent_is_graph(%arg0: tensor<*xf32>) { + "some.op"() ({ + %value, %idx, %ctlMerge = tf_executor.Merge %arg0, %arg0 : tensor<*xf32> +// expected-error@-1 {{'tf_executor.Merge' op expects parent op 'tf_executor.graph'}} + }) : () -> () + return +} + +// ----- + // Check that merge has at least one operand. func @invalid_merge(%arg0: tensor<*xf32>, %arg1: tensor) -> tensor<*xf32> { %result = tf_executor.graph { @@ -431,6 +510,18 @@ func @invalid_merge(%arg0: tensor<*xf32>, %arg1: tensor<4xf32>) -> tensor<8xf32> // ----- +// Check that merge data inputs of variant type are broadcastable to the output +func @invalid_merge(%arg0: tensor<*x!tf.variant>, %arg1: tensor<4x!tf.variant>) -> tensor<8x!tf.variant> { + %result = tf_executor.graph { + %value, %idx, %ctlMerge = "tf_executor.Merge"(%arg0, %arg1) : (tensor<*x!tf.variant>, tensor<4x!tf.variant>) -> (tensor<8x!tf.variant>, tensor, !tf_executor.control) +// expected-error@-1 {{'tf_executor.Merge' op expects all operands to be broadcastable but got 'tensor<8x!tf.variant>' vs 'tensor<4x!tf.variant>'}} + tf_executor.fetch %value : tensor<8x!tf.variant> + } + return %result : tensor<8x!tf.variant> +} + +// ----- + // Check that merge data inputs can't appear after control input. 
func @invalid_merge(%arg0: tensor<*xf32>, %arg1: tensor) -> tensor<*xf32> { %result = tf_executor.graph { @@ -446,6 +537,17 @@ func @invalid_merge(%arg0: tensor<*xf32>, %arg1: tensor) -> tensor<*xf32> { // ----- +// Check that a tf_executor.Enter parent is a graph. +func @parent_is_graph(%arg0: tensor<*xf32>) { + "some.op"() ({ + %res:2 = tf_executor.Enter %arg0 frame "some/fra\"me" : tensor<*xf32> +// expected-error@-1 {{'tf_executor.Enter' op expects parent op 'tf_executor.graph'}} + }) : () -> () + return +} + +// ----- + // Check that Enter return value is the same type as the input. func @invalid_enter(%arg0: tensor<*xf32>, %arg1: i1) -> tensor<*xf32> { %result = tf_executor.graph { @@ -458,6 +560,28 @@ func @invalid_enter(%arg0: tensor<*xf32>, %arg1: i1) -> tensor<*xf32> { // ----- +// Check that a tf_executor.NextIteration.Sink parent is a graph. +func @parent_is_graph(%arg0: tensor<*xf32>, %arg1: !tf_executor.token) { + "some.op"() ({ + tf_executor.NextIteration.Sink[%arg1] %arg0 : tensor<*xf32> +// expected-error@-1 {{'tf_executor.NextIteration.Sink' op expects parent op 'tf_executor.graph'}} + }) : () -> () + return +} + +// ----- + +// Check that a tf_executor.NextIteration.Source parent is a graph. +func @parent_is_graph() { + "some.op"() ({ + %1:3 = tf_executor.NextIteration.Source : tensor<*xf32> +// expected-error@-1 {{'tf_executor.NextIteration.Source' op expects parent op 'tf_executor.graph'}} + }) : () -> () + return +} + +// ----- + func @invalid_nextiteration(%arg0: tensor<*xf32>, %arg1: !tf_executor.token) -> tensor<*xf32> { %0 = tf_executor.graph { %1:3 = tf_executor.NextIteration.Source : tensor<*xf32> @@ -521,6 +645,17 @@ func @invalid_nextiteration(%arg0: tensor<*xf32>, %arg1: i1) -> tensor<*xf32> { // ----- +// Check that a tf_executor.Exit parent is a graph. +func @parent_is_graph(%arg0: tensor<*xf32>) { + "some.op"() ({ + %1:2 = tf_executor.Exit %arg0 : tensor<*xf32> +// expected-error@-1 {{'tf_executor.Exit' op expects parent op 'tf_executor.graph'}} + }) : () -> () + return +} + +// ----- + func @exit(%arg0: tensor<*xi32>) -> tensor<*xf32> { %0 = tf_executor.graph { %1:2 = "tf_executor.Exit"(%arg0) : (tensor<*xi32>) -> (tensor<*xf32>, !tf_executor.control) @@ -529,3 +664,25 @@ func @exit(%arg0: tensor<*xi32>) -> tensor<*xf32> { } return %0 : tensor<*xf32> } + +// ----- + +// Check that a tf_executor.ControlTrigger parent is a graph. +func @parent_is_graph(%arg0: !tf_executor.control, %arg1: !tf_executor.control) { + "some.op"() ({ + %0 = tf_executor.ControlTrigger %arg0, %arg1 +// expected-error@-1 {{'tf_executor.ControlTrigger' op expects parent op 'tf_executor.graph'}} + }) : () -> () + return +} + +// ----- + +// Check that a tf_executor.LoopCond parent is a graph. +func @parent_is_graph(%arg0: tensor, %arg1: !tf_executor.control) { + "some.op"() ({ + %1:2 = tf_executor.LoopCond %arg0, %arg1 : tensor +// expected-error@-1 {{'tf_executor.LoopCond' op expects parent op 'tf_executor.graph'}} + }) : () -> () + return +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir new file mode 100644 index 00000000000..dc2f60b6441 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir @@ -0,0 +1,406 @@ +// RUN: tf-opt %s -split-input-file -tf-tpu-rewrite | FileCheck %s + +// Tests simple case of `tf_device.launch_func` on TPU with single input and +// single output. 
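+// In rough terms the rewrite turns each `tf_device.launch_func` that carries a
+// `_tpu_replicate` attribute into a compile/execute pair: the operand shapes
+// feed compilation, and the compiled program plus the original operands feed
+// execution. Sketched (value names illustrative):
+//   %shape  = "tf.Shape"(%input)
+//   %prog:2 = "tf.MLIRCompileToTPU"(%shape)    with the serialized @tpu0_func module attached
+//   %result = "tf.TPUExecute"(%input, %prog#1)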
+ +module { + // CHECK-LABEL: func @single_tpu_launch_func + func @single_tpu_launch_func(%arg0: tensor) -> tensor { + %0 = "tf.A"(%arg0) : (tensor) -> tensor + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" + + %1 = "tf_device.launch_func"(%0) {_tpu_replicate = "cluster0", device = "tpu0", func = @tpu0_func} : (tensor) -> tensor + // CHECK: %[[A_SHAPE_OUTPUT:[0-9]*]] = "tf.Shape"(%[[A_OUTPUT]]) + // CHECK: %[[COMPILE_OUTPUT:[0-9]*]]:2 = "tf.MLIRCompileToTPU"(%[[A_SHAPE_OUTPUT]]) + // CHECK-SAME: _tpu_replicate = "cluster0" + // CHECK-SAME: module + // CHECK-SAME: func @main + // CHECK-SAME: tf.B + // CHECK-NOT: func = @tpu0_func + // CHECK: %[[EXECUTE_OUTPUT:[0-9]*]] = "tf.TPUExecute"(%[[A_OUTPUT]], %[[COMPILE_OUTPUT]]#1) + // CHECK-SAME: Targs = [tensor] + // CHECK-SAME: Tresults = [tensor] + + %2 = "tf.C"(%1) : (tensor) -> tensor + // CHECK: %[[C_OUTPUT:[0-9]*]] = "tf.C"(%[[EXECUTE_OUTPUT]]) + + return %2 : tensor + // CHECK: return %[[C_OUTPUT]] + } + + func @tpu0_func(%arg0: tensor) -> tensor { + %0 = "tf.B"(%arg0) : (tensor) -> tensor + return %0 : tensor + } +} + +// ----- + +// Tests that launch_func without _tpu_replicate attribute is ignored. + +module { + // CHECK-LABEL: func @single_gpu_launch_func + func @single_gpu_launch_func(%arg0: tensor) -> tensor { + %0 = "tf.A"(%arg0) : (tensor) -> tensor + + %1 = "tf_device.launch_func"(%0) {device = "gpu0", func = @gpu0_func} : (tensor) -> tensor + // CHECK: tf_device.launch_func + // CHECK-SAME: {device = "gpu0", func = @gpu0_func} + + return %1 : tensor + } + + func @gpu0_func(%arg0: tensor) -> tensor { + %0 = "tf.B"(%arg0) : (tensor) -> tensor + return %0 : tensor + } +} + +// ----- + +// Tests of `tf_device.launch_func` on TPU with nested function calls. + +module { + // CHECK-LABEL: func @with_nested_func + func @with_nested_func(%arg0: tensor) -> tensor { + %0 = "tf.A"(%arg0) : (tensor) -> tensor + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" + + %1 = "tf_device.launch_func"(%0) {_tpu_replicate = "cluster0", device = "tpu0", func = @tpu0_func} : (tensor) -> tensor + // CHECK: %[[A_SHAPE_OUTPUT:[0-9]*]] = "tf.Shape"(%[[A_OUTPUT]]) + // CHECK: %[[COMPILE_OUTPUT:[0-9]*]]:2 = "tf.MLIRCompileToTPU"(%[[A_SHAPE_OUTPUT]]) + // CHECK-SAME: _tpu_replicate = "cluster0" + // CHECK-SAME: module + // CHECK-SAME: func @main + // CHECK-SAME: tf.B + // CHECK-SAME: func @nested_func + // CHECK-SAME: tf.D + // CHECK-NOT: func = @tpu0_func + // CHECK: %[[EXECUTE_OUTPUT:[0-9]*]] = "tf.TPUExecute"(%[[A_OUTPUT]], %[[COMPILE_OUTPUT]]#1) + // CHECK-SAME: Targs = [tensor] + // CHECK-SAME: Tresults = [tensor] + + %2 = "tf.C"(%1) : (tensor) -> tensor + // CHECK: %[[C_OUTPUT:[0-9]*]] = "tf.C"(%[[EXECUTE_OUTPUT]]) + + return %2 : tensor + // CHECK: return %[[C_OUTPUT]] + } + + func @tpu0_func(%arg0: tensor) -> tensor { + %0 = "tf.B"(%arg0) : (tensor) -> tensor + %1 = call @nested_func(%0) : (tensor) -> tensor + return %1 : tensor + } + + func @nested_func(%arg0: tensor) -> tensor { + %0 = "tf.D"(%arg0) : (tensor) -> tensor + return %0 : tensor + } +} + +// ----- + +// Tests of `tf_device.launch_func` on TPU with referenced function that's not +// via a standard call op. 
+ +module { + // CHECK-LABEL: func @with_referenced_func + func @with_referenced_func(%arg0: tensor) -> tensor { + %0 = "tf.A"(%arg0) : (tensor) -> tensor + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" + + %1 = "tf_device.launch_func"(%0) {_tpu_replicate = "cluster0", device = "tpu0", func = @tpu0_func} : (tensor) -> tensor + // CHECK: %[[A_SHAPE_OUTPUT:[0-9]*]] = "tf.Shape"(%[[A_OUTPUT]]) + // CHECK: %[[COMPILE_OUTPUT:[0-9]*]]:2 = "tf.MLIRCompileToTPU"(%[[A_SHAPE_OUTPUT]]) + // CHECK-SAME: _tpu_replicate = "cluster0" + // CHECK-SAME: module + // CHECK-SAME: func @main + // CHECK-SAME: tf.B + // CHECK-SAME: func @referenced_func + // CHECK-SAME: tf.D + // CHECK-NOT: func = @tpu0_func + // CHECK: %[[EXECUTE_OUTPUT:[0-9]*]] = "tf.TPUExecute"(%[[A_OUTPUT]], %[[COMPILE_OUTPUT]]#1) + // CHECK-SAME: Targs = [tensor] + // CHECK-SAME: Tresults = [tensor] + + %2 = "tf.C"(%1) : (tensor) -> tensor + // CHECK: %[[C_OUTPUT:[0-9]*]] = "tf.C"(%[[EXECUTE_OUTPUT]]) + + return %2 : tensor + // CHECK: return %[[C_OUTPUT]] + } + + func @tpu0_func(%arg0: tensor) -> tensor { + %0 = "tf.B"(%arg0) {body = @referenced_func} : (tensor) -> tensor + return %0 : tensor + } + + func @referenced_func(%arg0: tensor) -> tensor { + %0 = "tf.D"(%arg0) : (tensor) -> tensor + return %0 : tensor + } +} + +// ----- + +// Tests rewriting `tf_device.launch_func` on TPU with a chain of referenced +// functions. + +module { + // CHECK-LABEL: func @with_referenced_func_chain + func @with_referenced_func_chain(%arg0: tensor) -> tensor { + %0 = "tf.A"(%arg0) : (tensor) -> tensor + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" + + %1 = "tf_device.launch_func"(%0) {_tpu_replicate = "cluster0", device = "tpu0", func = @tpu0_func} : (tensor) -> tensor + // CHECK: %[[A_SHAPE_OUTPUT:[0-9]*]] = "tf.Shape"(%[[A_OUTPUT]]) + // CHECK: %[[COMPILE_OUTPUT:[0-9]*]]:2 = "tf.MLIRCompileToTPU"(%[[A_SHAPE_OUTPUT]]) + // CHECK-SAME: _tpu_replicate = "cluster0" + // CHECK-SAME: module + // CHECK-SAME: func @main + // CHECK-SAME: tf.B + // CHECK-SAME: @referenced_func1 + // CHECK-SAME: tf.D + // CHECK-SAME: @referenced_func2 + // CHECK-SAME: tf.E + // CHECK-NOT: func = @tpu0_func + // CHECK: %[[EXECUTE_OUTPUT:[0-9]*]] = "tf.TPUExecute"(%[[A_OUTPUT]], %[[COMPILE_OUTPUT]]#1) + // CHECK-SAME: Targs = [tensor] + // CHECK-SAME: Tresults = [tensor] + + %2 = "tf.C"(%1) : (tensor) -> tensor + // CHECK: %[[C_OUTPUT:[0-9]*]] = "tf.C"(%[[EXECUTE_OUTPUT]]) + + return %2 : tensor + // CHECK: return %[[C_OUTPUT]] + } + + func @tpu0_func(%arg0: tensor) -> tensor { + %0 = "tf.B"(%arg0) {body = @referenced_func1} : (tensor) -> tensor + return %0 : tensor + } + + func @referenced_func1(%arg0: tensor) -> tensor { + %0 = "tf.D"(%arg0) : (tensor) -> tensor + %1 = call @referenced_func2(%0) : (tensor) -> tensor + return %1 : tensor + } + + func @referenced_func2(%arg0: tensor) -> tensor { + %0 = "tf.E"(%arg0) : (tensor) -> tensor + return %0 : tensor + } +} + +// ----- + +// Tests rewriting `tf_device.launch_func` on TPU with multiple calls to same +// function. 
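+// The interesting property in this case is deduplication: both call sites are
+// expected to survive in the serialized module (CHECK-COUNT-2) while the body
+// of @referenced_func is expected to be copied in exactly once (CHECK-COUNT-1).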
+ +module { + // CHECK-LABEL: func @with_multiple_call_same_referenced_func + func @with_multiple_call_same_referenced_func(%arg0: tensor) -> tensor { + %0 = "tf.A"(%arg0) : (tensor) -> tensor + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" + + %1 = "tf_device.launch_func"(%0) {_tpu_replicate = "cluster0", device = "tpu0", func = @tpu0_func} : (tensor) -> tensor + // CHECK: %[[A_SHAPE_OUTPUT:[0-9]*]] = "tf.Shape"(%[[A_OUTPUT]]) + // CHECK: %[[COMPILE_OUTPUT:[0-9]*]]:2 = "tf.MLIRCompileToTPU"(%[[A_SHAPE_OUTPUT]]) + // CHECK-SAME: _tpu_replicate = "cluster0" + // CHECK-SAME: module + // CHECK-SAME: func @main + // CHECK-SAME: tf.B + // CHECK-COUNT-2: call @referenced_func + // CHECK-COUNT-1: func @referenced_func + // CHECK-SAME: tf.D + // CHECK-NOT: func = @tpu0_func + // CHECK: %[[EXECUTE_OUTPUT:[0-9]*]] = "tf.TPUExecute"(%[[A_OUTPUT]], %[[COMPILE_OUTPUT]]#1) + // CHECK-SAME: Targs = [tensor] + // CHECK-SAME: Tresults = [tensor] + + %2 = "tf.C"(%1) : (tensor) -> tensor + // CHECK: %[[C_OUTPUT:[0-9]*]] = "tf.C"(%[[EXECUTE_OUTPUT]]) + + return %2 : tensor + // CHECK: return %[[C_OUTPUT]] + } + + func @tpu0_func(%arg0: tensor) -> tensor { + %0 = "tf.B"(%arg0) {body = @referenced_func1} : (tensor) -> tensor + %1 = call @referenced_func(%0) : (tensor) -> tensor + %2 = call @referenced_func(%1) : (tensor) -> tensor + return %2 : tensor + } + + func @referenced_func(%arg0: tensor) -> tensor { + %1 = "tf.D"(%arg0) : (tensor) -> tensor + return %1 : tensor + } +} + +// ----- + +// Tests multiple `tf_device.launch_func` on TPU with different computation. + +module { + // CHECK-LABEL: func @multiple_launch_different_func + func @multiple_launch_different_func(%arg0: tensor) -> tensor { + %0 = "tf.A"(%arg0) : (tensor) -> tensor + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" + + %1 = "tf_device.launch_func"(%0) {_tpu_replicate = "cluster0", device = "tpu0", func = @tpu0_func0} : (tensor) -> tensor + // CHECK: %[[A_SHAPE_OUTPUT:[0-9]*]] = "tf.Shape"(%[[A_OUTPUT]]) + // CHECK: %[[COMPILE0_OUTPUT:[0-9]*]]:2 = "tf.MLIRCompileToTPU"(%[[A_SHAPE_OUTPUT]]) + // CHECK-SAME: _tpu_replicate = "cluster0" + // CHECK-SAME: module + // CHECK-SAME: func @main + // CHECK-SAME: tf.B + // CHECK-NOT: func = @tpu0_func0 + // CHECK: %[[EXECUTE0_OUTPUT:[0-9]*]] = "tf.TPUExecute"(%[[A_OUTPUT]], %[[COMPILE0_OUTPUT]]#1) + // CHECK-SAME: Targs = [tensor] + // CHECK-SAME: Tresults = [tensor] + + %2 = "tf_device.launch_func"(%1) {_tpu_replicate = "cluster1", device = "tpu0", func = @tpu0_func1} : (tensor) -> tensor + // CHECK: %[[EXECUTE0_SHAPE_OUTPUT:[0-9]*]] = "tf.Shape"(%[[EXECUTE0_OUTPUT]]) + // CHECK: %[[COMPILE1_OUTPUT:[0-9]*]]:2 = "tf.MLIRCompileToTPU"(%[[EXECUTE0_SHAPE_OUTPUT]]) + // CHECK-SAME: _tpu_replicate = "cluster1" + // CHECK-SAME: module + // CHECK-SAME: func @main + // CHECK-SAME: tf.D + // CHECK-NOT: func = @tpu0_func1 + // CHECK: %[[EXECUTE1_OUTPUT:[0-9]*]] = "tf.TPUExecute"(%[[EXECUTE0_OUTPUT]], %[[COMPILE1_OUTPUT]]#1) + // CHECK-SAME: Targs = [tensor] + // CHECK-SAME: Tresults = [tensor] + + %3 = "tf.C"(%2) : (tensor) -> tensor + // CHECK: %[[C_OUTPUT:[0-9]*]] = "tf.C"(%[[EXECUTE1_OUTPUT]]) + + return %3 : tensor + // CHECK: return %[[C_OUTPUT]] + } + + func @tpu0_func0(%arg0: tensor) -> tensor { + %0 = "tf.B"(%arg0) : (tensor) -> tensor + return %0 : tensor + } + + func @tpu0_func1(%arg0: tensor) -> tensor { + %0 = "tf.D"(%arg0) : (tensor) -> tensor + return %0 : tensor + } +} + +// ----- + +// Tests multiple `tf_device.launch_func` on TPU with same computation. 
+ +module { + // CHECK-LABEL: func @multiple_launch_same_func + func @multiple_launch_same_func(%arg0: tensor) -> tensor { + %0 = "tf.A"(%arg0) : (tensor) -> tensor + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" + + %1 = "tf_device.launch_func"(%0) {_tpu_replicate = "cluster0", device = "tpu0", func = @tpu0_func} : (tensor) -> tensor + // CHECK: %[[A_SHAPE_OUTPUT:[0-9]*]] = "tf.Shape"(%[[A_OUTPUT]]) + // CHECK: %[[COMPILE0_OUTPUT:[0-9]*]]:2 = "tf.MLIRCompileToTPU"(%[[A_SHAPE_OUTPUT]]) + // CHECK-SAME: _tpu_replicate = "cluster0" + // CHECK-SAME: module + // CHECK-SAME: func @main + // CHECK-SAME: tf.B + // CHECK-NOT: func = @tpu0_func + // CHECK: %[[EXECUTE0_OUTPUT:[0-9]*]] = "tf.TPUExecute"(%[[A_OUTPUT]], %[[COMPILE0_OUTPUT]]#1) + // CHECK-SAME: Targs = [tensor] + // CHECK-SAME: Tresults = [tensor] + + %2 = "tf_device.launch_func"(%1) {_tpu_replicate = "cluster1", device = "tpu0", func = @tpu0_func} : (tensor) -> tensor + // CHECK: %[[EXECUTE0_SHAPE_OUTPUT:[0-9]*]] = "tf.Shape"(%[[EXECUTE0_OUTPUT]]) + // CHECK: %[[COMPILE1_OUTPUT:[0-9]*]]:2 = "tf.MLIRCompileToTPU"(%[[EXECUTE0_SHAPE_OUTPUT]]) + // CHECK-SAME: _tpu_replicate = "cluster1" + // CHECK-SAME: module + // CHECK-SAME: func @main + // CHECK-SAME: tf.B + // CHECK-NOT: func = @tpu0_func + // CHECK: %[[EXECUTE1_OUTPUT:[0-9]*]] = "tf.TPUExecute"(%[[EXECUTE0_OUTPUT]], %[[COMPILE1_OUTPUT]]#1) + // CHECK-SAME: Targs = [tensor] + // CHECK-SAME: Tresults = [tensor] + + %3 = "tf.C"(%2) : (tensor) -> tensor + // CHECK: %[[C_OUTPUT:[0-9]*]] = "tf.C"(%[[EXECUTE1_OUTPUT]]) + + return %3 : tensor + // CHECK: return %[[C_OUTPUT]] + } + + func @tpu0_func(%arg0: tensor) -> tensor { + %0 = "tf.B"(%arg0) : (tensor) -> tensor + return %0 : tensor + } +} + +// ----- + +// Tests Functions referenced by TPU function via SymbolRefAttr nested in +// ArrayAttr and DictionaryAttr. 
+ +module { + // CHECK-LABEL: func @single_tpu_launch_func + func @single_tpu_launch_func(%arg0: tensor) -> tensor { + %0 = "tf.A"(%arg0) : (tensor) -> tensor + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" + + %1 = "tf_device.launch_func"(%0) {_tpu_replicate = "cluster0", device = "tpu0", func = @tpu0_func} : (tensor) -> tensor + // CHECK: %[[A_SHAPE_OUTPUT:[0-9]*]] = "tf.Shape"(%[[A_OUTPUT]]) + // CHECK: %[[COMPILE_OUTPUT:[0-9]*]]:2 = "tf.MLIRCompileToTPU"(%[[A_SHAPE_OUTPUT]]) + // CHECK-SAME: _tpu_replicate = "cluster0" + // CHECK-SAME: module + // CHECK-SAME: func @main + // CHECK-SAME: tf.B + // CHECK-SAME: func @referenced_func2 + // CHECK-SAME: tf.H + // CHECK-SAME: func @referenced_func3 + // CHECK-SAME: tf.I + // CHECK-SAME: func @referenced_func0 + // CHECK-SAME: tf.F + // CHECK-SAME: func @referenced_func1 + // CHECK-SAME: tf.G + // CHECK: %[[EXECUTE_OUTPUT:[0-9]*]] = "tf.TPUExecute"(%[[A_OUTPUT]], %[[COMPILE_OUTPUT]]#1) + // CHECK-SAME: Targs = [tensor] + // CHECK-SAME: Tresults = [tensor] + + %2 = "tf.C"(%1) : (tensor) -> tensor + // CHECK: %[[C_OUTPUT:[0-9]*]] = "tf.C"(%[[EXECUTE_OUTPUT]]) + + return %2 : tensor + // CHECK: return %[[C_OUTPUT]] + } + + func @tpu0_func(%arg0: tensor) -> tensor { + %0 = "tf.B"(%arg0) : (tensor) -> tensor + %1 = "tf.D"(%0) {array_attr_funcs = [@referenced_func0, @referenced_func1]} : (tensor) -> tensor + %2 = "tf.E"(%1) {dictionary_attr_funcs = {fn1 = @referenced_func2, fn2 = @referenced_func3}} : (tensor) -> tensor + return %0 : tensor + } + + func @referenced_func0(%arg0: tensor) -> tensor { + %1 = "tf.F"(%arg0) : (tensor) -> tensor + return %1 : tensor + } + + func @referenced_func1(%arg0: tensor) -> tensor { + %1 = "tf.G"(%arg0) : (tensor) -> tensor + return %1 : tensor + } + + func @referenced_func2(%arg0: tensor) -> tensor { + %1 = "tf.H"(%arg0) : (tensor) -> tensor + return %1 : tensor + } + + func @referenced_func3(%arg0: tensor) -> tensor { + %1 = "tf.I"(%arg0) : (tensor) -> tensor + return %1 : tensor + } +} + + diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/canonicalize.td b/tensorflow/compiler/mlir/tensorflow/transforms/canonicalize.td index 473f69f87e7..0653c1d109e 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/canonicalize.td +++ b/tensorflow/compiler/mlir/tensorflow/transforms/canonicalize.td @@ -20,9 +20,7 @@ include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td" /// TODO(b/130756570): Support OpBase constraints in PatternRewrites. def SingleResultAndOperandHaveSameElementType : Constraint< - CPred<"$0->getResult(0)->getType().cast()" - ".getElementType() == " - "$1->getType().cast().getElementType()">>; + CPred<"getElementTypeOrSelf($0) == getElementTypeOrSelf($1)">>; //===----------------------------------------------------------------------===// // Add op patterns. diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/cluster_formation.cc b/tensorflow/compiler/mlir/tensorflow/transforms/cluster_formation.cc new file mode 100644 index 00000000000..2511ff2fdb3 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/cluster_formation.cc @@ -0,0 +1,232 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This transformation forms clusters from instructions that are in the same
+// island and assigned to the same device. Clusters are represented as regions.
+// Note that side-effecting ops are not correctly handled yet.
+
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "mlir/IR/Attributes.h"  // TF:local_config_mlir
+#include "mlir/IR/Block.h"  // TF:local_config_mlir
+#include "mlir/IR/BlockAndValueMapping.h"  // TF:local_config_mlir
+#include "mlir/IR/Builders.h"  // TF:local_config_mlir
+#include "mlir/IR/Operation.h"  // TF:local_config_mlir
+#include "mlir/Pass/Pass.h"  // TF:local_config_mlir
+#include "mlir/Pass/PassRegistry.h"  // TF:local_config_mlir
+#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h"
+#include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h"
+#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace mlir {
+namespace TFDevice {
+
+namespace {
+
+struct ClusterFormationPass : public FunctionPass {
+  void runOnFunction() override;
+};
+
+// Cluster structure captures all the operations that are assigned to the same
+// device and can form a legal strict cluster.
+// Ops must follow the same ordering in their parent block. We rely on this
+// assumption to perform analysis.
+struct Cluster {
+  llvm::SmallVector ops;
+  StringRef device;
+};
+
+StringRef GetDevice(Operation* op) {
+  auto device_attr = op->getAttrOfType("device");
+  return device_attr ? device_attr.getValue() : "";
+}
+
+// An op can be merged into a cluster if each of its operands is one of the
+// following:
+//  1) A block argument
+//  2) A value produced by other islands
+//  3) A value defined before the cluster
+//  4) A value defined by an operation in the cluster
+// TODO(ycao): This is not optimal as it doesn't consider the situation where
+// defining_op's operands all meet the requirements above. In that case,
+// defining_op could be moved and the to_merge op would be legal to absorb.
+// TODO(ycao): Take op side effects into consideration since they cannot be
+// re-ordered, but forming clusters of non-contiguous ops is effectively
+// re-ordering them.
+bool CanMergeIntoCluster(const Cluster& c, Operation* to_merge) {
+  return llvm::all_of(to_merge->getOperands(), [&](Value* operand) {
+    // Block arguments.
+    if (isa(operand)) return true;
+
+    Operation* defining_op = operand->getDefiningOp();
+
+    // Operand produced by other islands.
+    if (defining_op->getBlock() != c.ops.front()->getBlock()) return true;
+
+    // Defining op is before the cluster.
+    if (defining_op->isBeforeInBlock(c.ops.front())) return true;
+
+    // Defining op is between the first and last operation in the cluster. Note
+    // that the cluster may contain operations that are non-contiguous in their
+    // original block, so we also need to check that defining_op is assigned to
+    // the cluster's device. This is a faster check than linearly searching
+    // through all ops in the cluster.
+ if (defining_op->isBeforeInBlock(c.ops.back()->getNextNode()) && + GetDevice(defining_op) == c.device) + return true; + + // Other cases, operand is generated after or outside the cluster, this + // means it is illegal to merge operation. + return false; + }); +} + +void ReplaceLiveOutExternalUses(llvm::ArrayRef live_outs, + tf_device::LaunchOp launch_op) { + Region* launch_op_region = &launch_op.body(); + for (const auto& p : llvm::zip(live_outs, launch_op.getResults())) { + Value* from = std::get<0>(p); + for (auto& use : from->getUses()) { + if (launch_op_region->isAncestor(use.getOwner()->getParentRegion())) + continue; + use.set(std::get<1>(p)); + } + } +} + +// Get all escaped live-out values of a region. +void GetLiveOuts(Region* region, llvm::SmallVectorImpl* live_outs) { + live_outs->clear(); + + for (Operation& op : region->front()) { + for (Value* v : op.getResults()) { + // A value is live-out if any of its users are not inside value producer's + // region. + bool is_live_out = llvm::any_of(v->getUsers(), [&](Operation* user) { + return !region->isAncestor(user->getParentRegion()); + }); + + if (is_live_out) live_outs->emplace_back(v); + } + } +} + +// Build a `tf_device.launch` op with a region that contains all the operations +// in given cluster. Then all ops in cluster are replaced by `tf_device.launch`. +void BuildLaunchForCluster(const Cluster& c, OpBuilder* builder) { + // Set insertion point to right after all operations in cluster. + builder->setInsertionPoint(c.ops.back()->getNextNode()); + + // Create a stand-alone region to hold all instructions in the cluster. + Region region; + region.push_back(new Block); + + // Move all operations in cluster to newly created region, stripping their + // "device" attribute since launch op already carries device information. + Block* block = ®ion.front(); + for (Operation* op : c.ops) { + op->moveBefore(block, block->end()); + op->removeAttr(builder->getIdentifier("device")); + } + + // Get all escaped live-out values of region, they are used later to determine + // return values and types of launch op. + llvm::SmallVector live_outs; + GetLiveOuts(®ion, &live_outs); + + // Build a `tf_device.return` op at end of region, with all live-out values + // as operand. + OpBuilder return_builder(builder->getContext()); + return_builder.setInsertionPointToEnd(block); + return_builder.create(return_builder.getUnknownLoc(), + live_outs); + + llvm::SmallVector live_out_types; + live_out_types.reserve(live_outs.size()); + for (Value* v : live_outs) { + live_out_types.emplace_back(v->getType()); + } + + tf_device::LaunchOp launch_op = builder->create( + builder->getUnknownLoc(), builder->getStringAttr(c.device), + live_out_types); + + // Attach the region to launch_op. + launch_op.body().takeBody(region); + + // Replace any external uses of live-out values with return values of launch + // op. So live-out values no longer escape the region. + ReplaceLiveOutExternalUses(live_outs, launch_op); +} + +void ClusterFormationPass::runOnFunction() { + OpBuilder builder(getFunction().getContext()); + getFunction().walk([&](tf_executor::IslandOp island) { + // Iteratively find clusters of different devices within an island. + // Whenever we see an operation that is assigned to an accelerator device + // (ie. device != ""), we try to merge it into the last cluster of same + // device. If that is infeasible (say because of violating def-before-use), + // create a new cluster with that operation and move on. 
+ llvm::MapVector nearest_clusters; + for (Operation& op : llvm::make_early_inc_range(island.GetBody())) { + auto device = GetDevice(&op); + if (device == "") continue; + + // If no cluster of same device has been formed yet, create a new cluster + // with op alone. + auto it = nearest_clusters.find(device); + if (it == nearest_clusters.end()) { + nearest_clusters[device] = Cluster{{&op}, device}; + continue; + } + + // Check if it is legal to merge op into nearest cluster of same device. + // If positive, update cluster and move on to next operation. + Cluster& nearest_cluster = it->second; + if (CanMergeIntoCluster(nearest_cluster, &op)) { + nearest_cluster.ops.emplace_back(&op); + continue; + } + + // If nearest cluster of same device can not absorb `op`, then that + // cluster needs to be finalized by building a `tf_device.launch` op with + // a region that contains all operations in clusters. + BuildLaunchForCluster(nearest_cluster, &builder); + + // Create a new cluster to hold op alone and update nearest_clusters. + nearest_clusters[device] = Cluster{{&op}, device}; + } + + // At the end, there might be left-over found clusters that need to be + // built. + for (auto& device_cluster : nearest_clusters) + BuildLaunchForCluster(device_cluster.second, &builder); + }); +} + +} // namespace + +std::unique_ptr CreateClusterFormationPass() { + return std::make_unique(); +} + +static PassRegistration pass( + "tf-device-cluster-formation", + "Form clusters from instructions assigned to same device"); + +} // namespace TFDevice +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/cluster_outlining.cc b/tensorflow/compiler/mlir/tensorflow/transforms/cluster_outlining.cc new file mode 100644 index 00000000000..414b4a0d161 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/cluster_outlining.cc @@ -0,0 +1,140 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This pass outlines regions of `tf_device.launch` into functions and replaces +// `tf_device.launch` with equivalent `tf_device.launch_func` operations. 
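The cluster-formation logic that closes above (ClusterFormationPass::runOnFunction) walks each island once and keeps, per device, only the most recently started, or "nearest", cluster, flushing it into a tf_device.launch whenever the next op on that device cannot legally join it. The stand-alone sketch below models only that greedy strategy; FakeOp, Cluster, and FormClusters are hypothetical stand-ins for the MLIR types, and the can_merge callback abstracts the CanMergeIntoCluster check.

#include <map>
#include <string>
#include <utility>
#include <vector>

// Hypothetical stand-in for an MLIR operation: only its device string matters
// here ("" means the op is not assigned to an accelerator device).
struct FakeOp {
  std::string device;
};

using Cluster = std::vector<const FakeOp*>;
using CanMergeFn = bool (*)(const Cluster&, const FakeOp&);

// Greedy single pass in program order: each device keeps one open cluster; an
// op either joins it, or that cluster is finalized and a fresh one starts.
std::vector<Cluster> FormClusters(const std::vector<FakeOp>& block,
                                  CanMergeFn can_merge) {
  std::vector<Cluster> finalized;
  std::map<std::string, Cluster> nearest_clusters;  // device -> open cluster
  for (const FakeOp& op : block) {
    if (op.device.empty()) continue;
    auto it = nearest_clusters.find(op.device);
    if (it == nearest_clusters.end()) {
      nearest_clusters[op.device] = Cluster{&op};
      continue;
    }
    if (can_merge(it->second, op)) {
      it->second.push_back(&op);
      continue;
    }
    finalized.push_back(std::move(it->second));  // would become a tf_device.launch
    it->second = Cluster{&op};
  }
  for (auto& entry : nearest_clusters)
    finalized.push_back(std::move(entry.second));  // flush leftover open clusters
  return finalized;
}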
+ +#include "llvm/ADT/SmallVector.h" +#include "mlir/Dialect/StandardOps/Ops.h" // TF:local_config_mlir +#include "mlir/IR/Attributes.h" // TF:local_config_mlir +#include "mlir/IR/Block.h" // TF:local_config_mlir +#include "mlir/IR/Builders.h" // TF:local_config_mlir +#include "mlir/IR/Module.h" // TF:local_config_mlir +#include "mlir/IR/Operation.h" // TF:local_config_mlir +#include "mlir/Pass/Pass.h" // TF:local_config_mlir +#include "mlir/Pass/PassRegistry.h" // TF:local_config_mlir +#include "mlir/Transforms/RegionUtils.h" // TF:local_config_mlir +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" + +namespace mlir { +namespace TFDevice { + +namespace { + +struct ClusterOutliningPass : public ModulePass { + void runOnModule() override; +}; + +void ReplaceLaunchReturnWithReturn(tf_device::ReturnOp launch_return_op, + OpBuilder* builder) { + llvm::SmallVector operands(launch_return_op.getOperands()); + builder->create(launch_return_op.getLoc(), operands); + launch_return_op.erase(); +} + +// Builds a function that outlines region attached to launch_op and inserts +// built function into given module. +FuncOp BuildFunction(StringRef device, llvm::ArrayRef live_ins, + tf_device::LaunchOp launch_op, + ModuleManager* module_manager, OpBuilder* builder) { + llvm::SmallVector operand_types; + operand_types.reserve(live_ins.size()); + for (Value* v : live_ins) operand_types.emplace_back(v->getType()); + + llvm::SmallVector result_types(launch_op.getResultTypes()); + + auto func_type = + FunctionType::get(operand_types, result_types, builder->getContext()); + + std::string func_name_prefix = Twine(device, "_func").str(); + FuncOp outlined_func = + FuncOp::create(launch_op.getLoc(), func_name_prefix, func_type); + + // Create function body. + Block* outlined_func_block = outlined_func.addEntryBlock(); + + // Replace uses of live-in values within launch_op region with function + // arguments. + Region& launch_op_region = launch_op.body(); + for (const auto& p : + llvm::zip(live_ins, outlined_func_block->getArguments())) { + replaceAllUsesInRegionWith(std::get<0>(p), std::get<1>(p), + launch_op_region); + } + + // Move all instructions in launch_op into outlined_function's only block. + auto& launch_op_body = launch_op_region.front().getOperations(); + outlined_func_block->getOperations().splice( + outlined_func_block->end(), launch_op_body, launch_op_body.begin(), + launch_op_body.end()); + + // Replace `tf_device.launch_return` terminator with `std.return` in function + // body. + auto launch_return_op = + cast(outlined_func_block->getTerminator()); + builder->setInsertionPoint(launch_return_op); + ReplaceLaunchReturnWithReturn(launch_return_op, builder); + + module_manager->insert(outlined_func); + return outlined_func; +} + +// Outlines body of `tf_device.launch` into a function and create a +// `tf_device.launch_func` to invoke that function. 
`tf_device.launch` is +// removed afterwards.` +void OutlineLaunch(tf_device::LaunchOp launch_op, ModuleManager* module_manager, + OpBuilder* builder) { + llvm::SetVector live_ins; + getUsedValuesDefinedAbove(launch_op.body(), launch_op.body(), live_ins); + + StringRef device = launch_op.getAttrOfType("device").getValue(); + + FuncOp outlined_func = BuildFunction(device, live_ins.getArrayRef(), + launch_op, module_manager, builder); + builder->setInsertionPoint(launch_op); + tf_device::LaunchFuncOp launch_func_op = + builder->create( + launch_op.getLoc(), outlined_func.getType().getResults(), + builder->getStringAttr(device), + builder->getSymbolRefAttr(outlined_func.getName()), + live_ins.getArrayRef()); + + launch_op.replaceAllUsesWith(launch_func_op); + launch_op.erase(); +} + +void ClusterOutliningPass::runOnModule() { + ModuleOp m = getModule(); + ModuleManager module_manager(m); + OpBuilder builder(m.getContext()); + m.walk([&](tf_device::LaunchOp launch) { + OutlineLaunch(launch, &module_manager, &builder); + }); +} + +} // namespace + +std::unique_ptr CreateClusterOutliningPass() { + return std::make_unique(); +} + +static PassRegistration pass( + "tf-device-cluster-outlining", + "Outline regions of tf_device.launch operations."); + +} // namespace TFDevice +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/decode_constant.cc b/tensorflow/compiler/mlir/tensorflow/transforms/decode_constant.cc index 6ce5233cb1e..3e6e2a6058e 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/decode_constant.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/decode_constant.cc @@ -64,7 +64,9 @@ struct DecodeConstant : public FunctionPass { } // namespace -FunctionPassBase *CreateDecodeConstantPass() { return new DecodeConstant(); } +std::unique_ptr CreateDecodeConstantPass() { + return std::make_unique(); +} static PassRegistration pass( "tf-decode-constant", "Decode opaque constant into human-readable ones"); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/decode_constant.h b/tensorflow/compiler/mlir/tensorflow/transforms/decode_constant.h index a0cd77b393f..2e66de0c4d9 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/decode_constant.h +++ b/tensorflow/compiler/mlir/tensorflow/transforms/decode_constant.h @@ -23,7 +23,7 @@ namespace TF { // Creates a pass to decode and reset opaque values in constant ops into // readable values. // Note that this pass assumes RaiseTFControlFlow pass has already been run. -FunctionPassBase *CreateDecodeConstantPass(); +std::unique_ptr CreateDecodeConstantPass(); } // namespace TF } // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/executor_island_coarsening.cc b/tensorflow/compiler/mlir/tensorflow/transforms/executor_island_coarsening.cc new file mode 100644 index 00000000000..496e99e4ff7 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/executor_island_coarsening.cc @@ -0,0 +1,329 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
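The cluster-outlining pass above turns each tf_device.launch body into a standalone FuncOp whose arguments are the region's live-ins, that is, the values used inside the region but defined outside it (collected by getUsedValuesDefinedAbove in OutlineLaunch). A minimal model of that live-in computation, with Stmt and LiveIns as hypothetical stand-ins for MLIR operations and values:

#include <set>
#include <string>
#include <vector>

// Hypothetical statement: defines one value and uses some others, all by name.
struct Stmt {
  std::string result;
  std::vector<std::string> operands;
};

// Values used in the region but not defined by it; in the real pass these
// become both the outlined function's arguments and the operands of the
// replacement tf_device.launch_func.
std::vector<std::string> LiveIns(const std::vector<Stmt>& region) {
  std::set<std::string> defined;
  for (const Stmt& s : region) defined.insert(s.result);

  std::set<std::string> seen;
  std::vector<std::string> live_ins;  // kept in first-use order
  for (const Stmt& s : region)
    for (const std::string& use : s.operands)
      if (!defined.count(use) && seen.insert(use).second) live_ins.push_back(use);
  return live_ins;
}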
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This transformation pass takes TFExecutor dialect IslandOps and merges them. +// Note, this currently does not handle TensorFlow V1 style control flow/frames +// or side effecting ops yet. + +#include +#include + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/None.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/Casting.h" +#include "mlir/IR/Block.h" // TF:local_config_mlir +#include "mlir/IR/Builders.h" // TF:local_config_mlir +#include "mlir/IR/Location.h" // TF:local_config_mlir +#include "mlir/IR/Operation.h" // TF:local_config_mlir +#include "mlir/Pass/Pass.h" // TF:local_config_mlir +#include "mlir/Pass/PassRegistry.h" // TF:local_config_mlir +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" +#include "tensorflow/core/platform/logging.h" + +namespace mlir { +namespace tf_executor { + +namespace { + +// IslandType is an enum representing if an island is the island (parent) +// merging another island or is the island (child) being being merged. +enum IslandType { kParentIsland, kChildIsland }; + +// Output is a helper struct holding a result index and island type (parent or +// child). +struct Output { + Output(IslandType island_type, int result_index) + : island_type(island_type), result_index(result_index) {} + + IslandType island_type; + int result_index; +}; + +struct ExecutorIslandCoarsening + : public FunctionPass { + void runOnFunction() override; + + private: + void MergeIslands(IslandOp parent, IslandOp child, + IslandType insert_position); + bool MergeIslandWithOperand(IslandOp child); + bool MergeIslandWithResult(IslandOp parent); +}; + +// Finds the operation leading to an island that the island can be merged with. +// This looks for the operation, either control input or data input to an op, +// that is closest to the island in the graph. If no candidate can be found or +// the op found is not an island, an empty optional is returned. +llvm::Optional GetOperandCandidateToMergeWith(IslandOp island) { + Operation* graph_op = island.getParentOp(); + Operation* candidate = nullptr; + + // Check island control operands. + for (Value* input : island.controlInputs()) { + Operation* def = input->getDefiningOp(); + DCHECK_EQ(def->getParentOp(), graph_op); + if (!candidate || candidate->isBeforeInBlock(def)) candidate = def; + } + + // Check island data operands. + island.walk([graph_op, &candidate](Operation* op) { + for (Value* input : op->getOperands()) { + Operation* def = input->getDefiningOp(); + if (!def || def->getParentOp() != graph_op) continue; + if (!candidate || candidate->isBeforeInBlock(def)) candidate = def; + } + }); + + if (!candidate || !llvm::isa(candidate)) return llvm::None; + + return llvm::Optional(llvm::cast(candidate)); +} + +// Finds the operation leading from an island that the island can be merged +// with. This looks for the operation, either control output or data output to +// an op, that is closest to the island in the graph. If no candidate can be +// found or the op found is not an island, an empty optional is returned. 
+llvm::Optional GetResultCandidateToMergeWith(IslandOp island) { + Operation* graph_op = island.getParentOp(); + Operation* candidate = nullptr; + + // Check island control results. + for (Operation* user : island.control()->getUsers()) { + DCHECK_EQ(user->getParentOp(), graph_op); + if (!candidate || user->isBeforeInBlock(candidate)) candidate = user; + } + + // Check island data results. + Block& graph_body = llvm::cast(graph_op).GetBody(); + for (Value* result : island.outputs()) { + for (Operation* user : result->getUsers()) { + Operation* def = graph_body.findAncestorInstInBlock(*user); + DCHECK_NE(def, nullptr); + if (!candidate || def->isBeforeInBlock(candidate)) candidate = def; + } + } + + if (!candidate || !llvm::isa(candidate)) return llvm::None; + + return llvm::Optional(llvm::cast(candidate)); +} + +// Collects the operands for the new island by collecting all control inputs of +// the islands being merged. +llvm::SmallSetVector GetNewIslandOperands(IslandOp parent, + IslandOp child) { + llvm::SmallSetVector operands; + operands.insert(parent.getOperands().begin(), parent.getOperands().end()); + operands.insert(child.getOperands().begin(), child.getOperands().end()); + operands.remove(parent.control()); + return operands; +} + +// Collects the results for the new island by going through each data output of +// the islands being merged. Unused results outside of the merged island to be +// formed are pruned. If the child island inner ops consume the parent island +// control output, the child island inner ops will have that respective control +// input pruned. Results of the parent island that are consumed by the child +// island are replaced by the respective inner ops output from the parent +// island. +llvm::SmallVector GetNewIslandResultsAndForwardOutputs( + mlir::MLIRContext* context, IslandOp parent, IslandOp child, + llvm::SmallVector* result_types) { + llvm::SmallVector results; + + YieldOp yield_op = parent.GetYield(); + Block& child_body = child.GetBody(); + for (auto& ret_and_idx : llvm::enumerate(parent.outputs())) { + bool output_captured = false; + Value* yield_input = yield_op.getOperand(ret_and_idx.index()); + for (auto& use : + llvm::make_early_inc_range(ret_and_idx.value()->getUses())) { + if (child_body.findAncestorInstInBlock(*use.getOwner())) { + // Forward output from inner op. + use.set(yield_input); + } else if (!output_captured) { + results.push_back( + Output(IslandType::kParentIsland, ret_and_idx.index())); + result_types->push_back(ret_and_idx.value()->getType()); + output_captured = true; + } + } + } + + for (auto& ret_and_idx : llvm::enumerate(child.outputs())) { + if (!ret_and_idx.value()->use_empty()) { + results.push_back(Output(IslandType::kChildIsland, ret_and_idx.index())); + result_types->push_back(ret_and_idx.value()->getType()); + } + } + + // IslandOps always have a control output. + result_types->push_back(ControlType::get(context)); + + return results; +} + +// Creates the new merged island. +IslandOp CreateNewIsland(Operation* old_island, + llvm::ArrayRef result_types, + llvm::ArrayRef operands) { + OpBuilder builder(old_island); + auto new_island = builder.create( + old_island->getLoc(), result_types, operands, ArrayRef{}); + new_island.body().push_back(new Block); + return new_island; +} + +// Creates respective YieldOp for the new merged island. 
+YieldOp CreateNewIslandYieldOp(IslandOp new_island, + llvm::ArrayRef results, IslandOp parent, + IslandOp child) { + llvm::SmallVector yield_operands; + yield_operands.reserve(results.size()); + for (auto ret_vals : llvm::zip(results, new_island.outputs())) { + // Get consumed output (island type and result index). + const auto& output = std::get<0>(ret_vals); + IslandOp& output_island = + output.island_type == IslandType::kParentIsland ? parent : child; + Value* result = output_island.getResult(output.result_index); + // Replace original result with new island result. + result->replaceAllUsesWith(std::get<1>(ret_vals)); + // Find YieldOp in original island, grab the associated operand (inner op + // output) and add it as a operand to the YieldOp of the merged island. + yield_operands.push_back( + output_island.GetYield().getOperand(output.result_index)); + } + + // Create YieldOp for the new island. + OpBuilder builder(&new_island.GetBody(), new_island.GetBody().end()); + return builder.create(new_island.getLoc(), yield_operands); +} + +// Moves inner ops (excluding last op/YieldOp) from islands being merged into +// the new merged island. +void MoveInnerOpsToNewIsland(IslandOp parent, IslandOp child, + Operation* new_yield_op) { + Block* block = new_yield_op->getBlock(); + + auto move_inner_ops = [block, new_yield_op](IslandOp island) { + auto& island_body = island.GetBody().getOperations(); + block->getOperations().splice(new_yield_op->getIterator(), island_body, + island_body.begin(), + std::prev(island_body.end())); + }; + + move_inner_ops(parent); + move_inner_ops(child); +} + +// Merges two islands and places new merged island before parent or child. +void ExecutorIslandCoarsening::MergeIslands(IslandOp parent, IslandOp child, + IslandType insert_position) { + // Collect operands for the new merged island. + llvm::SmallSetVector operands = + GetNewIslandOperands(parent, child); + + // Collect results and result types for the new merged island. + llvm::SmallVector result_types; + llvm::SmallVector results = GetNewIslandResultsAndForwardOutputs( + &getContext(), parent, child, &result_types); + + // Create the new merged island. + IslandOp new_island = CreateNewIsland( + insert_position == IslandType::kParentIsland ? parent : child, + result_types, operands.getArrayRef()); + + // Create associated YieldOp for the new merged island. + YieldOp new_yield_op = + CreateNewIslandYieldOp(new_island, results, parent, child); + + // Move inner ops from original islands into the new island. + MoveInnerOpsToNewIsland(parent, child, new_yield_op.getOperation()); + + // Update control inputs to point to the new merged island. + child.control()->replaceAllUsesWith(new_island.control()); + parent.control()->replaceAllUsesWith(new_island.control()); + + // Remove merged islands. + child.erase(); + parent.erase(); +} + +// Merges island with the operand closest to the island in the graph. The +// operand must be another IslandOp for merging to take place. A new island is +// created and the islands being merged are removed if a merge took place. +// Returns true if the island was merged with its operand. +bool ExecutorIslandCoarsening::MergeIslandWithOperand(IslandOp child) { + // Find candidate operand to merge island with. 
+ llvm::Optional candidate = GetOperandCandidateToMergeWith(child); + if (!candidate.hasValue()) return false; + auto& parent = candidate.getValue(); + MergeIslands(parent, child, IslandType::kParentIsland); + return true; +} + +// Merges island with the result closest to the island in the graph. The result +// must be another IslandOp for merging to take place. A new island is created +// and the islands being merged are removed if a merge took place. Returns true +// if the island was merged with its result. +bool ExecutorIslandCoarsening::MergeIslandWithResult(IslandOp parent) { + // Find candidate result to merge island with. + llvm::Optional candidate = GetResultCandidateToMergeWith(parent); + if (!candidate.hasValue()) return false; + auto& child = candidate.getValue(); + MergeIslands(parent, child, IslandType::kChildIsland); + return false; +} + +void ExecutorIslandCoarsening::runOnFunction() { + getFunction().walk([this](GraphOp graph) { + Block& graph_body = graph.GetBody(); + + bool updated = false; + do { + updated = false; + + auto reversed = llvm::reverse(graph_body); + for (Operation& operation : llvm::make_early_inc_range(reversed)) { + auto island = llvm::dyn_cast(operation); + if (!island) continue; + updated |= MergeIslandWithResult(island); + } + + for (Operation& operation : llvm::make_early_inc_range(graph_body)) { + auto island = llvm::dyn_cast(operation); + if (!island) continue; + updated |= MergeIslandWithOperand(island); + } + } while (updated); + }); +} + +} // namespace + +std::unique_ptr CreateTFExecutorIslandCoarseningPass() { + return std::make_unique(); +} + +static PassRegistration pass( + "tf-executor-island-coarsening", "Merges TFExecutor dialect IslandOps"); + +} // namespace tf_executor +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_cfg.cc b/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_cfg.cc index af3e1e05ade..ade8cc17032 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_cfg.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_cfg.cc @@ -16,13 +16,13 @@ limitations under the License. // This transformation pass transforms functional control flow operations in the // standard TensorFlow dialect to MLIR Control Flow Graph (CFG) form. +#include "mlir/Dialect/StandardOps/Ops.h" // TF:local_config_mlir #include "mlir/IR/Builders.h" // TF:local_config_mlir #include "mlir/IR/Operation.h" // TF:local_config_mlir +#include "mlir/IR/TypeUtilities.h" // TF:local_config_mlir #include "mlir/IR/Value.h" // TF:local_config_mlir #include "mlir/Pass/Pass.h" // TF:local_config_mlir #include "mlir/Pass/PassRegistry.h" // TF:local_config_mlir -#include "mlir/StandardOps/Ops.h" // TF:local_config_mlir -#include "mlir/Support/TypeUtilities.h" // TF:local_config_mlir #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" @@ -150,12 +150,12 @@ static LogicalResult LowerIfOp(IfOp op) { OpBuilder builder(op_inst); // Lower the condition to a boolean value (i1). 
- Value* cond_i1 = LowerCondition(loc, op.getCondition(), &builder); + Value* cond_i1 = LowerCondition(loc, op.cond(), &builder); if (!cond_i1) return failure(); auto module = op_inst->getParentOfType(); - auto then_fn = module.lookupSymbol(op.getThen()); - auto else_fn = module.lookupSymbol(op.getElse()); + auto then_fn = module.lookupSymbol(op.then_branch()); + auto else_fn = module.lookupSymbol(op.else_branch()); // Split the basic block before the 'if'. The new dest will be our merge // point. @@ -211,8 +211,8 @@ static LogicalResult LowerWhileOp(WhileOp op) { OpBuilder builder(op_inst); auto module = op_inst->getParentOfType(); - auto cond_fn = module.lookupSymbol(op.getCond()); - auto body_fn = module.lookupSymbol(op.getBody()); + auto cond_fn = module.lookupSymbol(op.cond()); + auto body_fn = module.lookupSymbol(op.body()); // Split the block containing the While op into two blocks. One containing // operations before the While op and other containing the rest. Create two @@ -331,8 +331,8 @@ void FunctionalControlFlowToCFG::runOnFunction() { } // namespace -FunctionPassBase* CreateTFFunctionalControlFlowToCFG() { - return new FunctionalControlFlowToCFG(); +std::unique_ptr CreateTFFunctionalControlFlowToCFG() { + return std::make_unique(); } static PassRegistration pass( diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/graph_pruning.cc b/tensorflow/compiler/mlir/tensorflow/transforms/graph_pruning.cc new file mode 100644 index 00000000000..5d3c612e5cd --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/graph_pruning.cc @@ -0,0 +1,87 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/iterator_range.h" +#include "mlir/IR/Block.h" // TF:local_config_mlir +#include "mlir/IR/Builders.h" // TF:local_config_mlir +#include "mlir/IR/Location.h" // TF:local_config_mlir +#include "mlir/IR/Operation.h" // TF:local_config_mlir +#include "mlir/Pass/Pass.h" // TF:local_config_mlir +#include "mlir/Pass/PassRegistry.h" // TF:local_config_mlir +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" + +namespace mlir { +namespace tf_executor { + +// Prunes a TF graph eliminating dead nodes. +void prune_graph(GraphOp graph) { + // A graph has a single block which forms a DAG: nodes that aren't reachable + // from the `fetch` operands can be eliminated. + + // Delete unreachable node from the graph. We traverse it in reverse order so + // that we just have to check that a node does not have any users to delete + // it. + for (Operation &op : llvm::make_early_inc_range( + llvm::drop_begin(llvm::reverse(graph.GetBody()), 1))) { + // NextIteration.Sink operation are handled specially: they are live if the + // source is live, and removed when the source is processed. 
+ if (auto sinkOp = dyn_cast(op)) continue; + + // For NextIteration.Source, we just check that the source does not have any + // other user than the sink. + if (auto sourceOp = dyn_cast(op)) { + Operation *sink = sourceOp.GetSink().getOperation(); + if (llvm::any_of(sourceOp.getResults(), [sink](Value *result) { + return llvm::any_of(result->getUsers(), [sink](Operation *user) { + return user != sink; + }); + })) + continue; + + // No other users than the sink, erase the pair! + sink->erase(); + sourceOp.erase(); + continue; + } + + // General case. + if (op.use_empty()) op.erase(); + } +} + +namespace { + +// This transformation pass prunes a TF graph eliminating dead-nodes. +struct GraphPruning : public FunctionPass { + void runOnFunction() override { + getFunction().walk( + [](tf_executor::GraphOp graph) { prune_graph(graph); }); + } +}; + +} // namespace + +FunctionPassBase *CreateTFExecutorGraphPruningPass() { + return new GraphPruning(); +} + +static PassRegistration pass( + "tf-executor-graph-pruning", "Prune a TensorFlow Graph from dead nodes."); + +} // namespace tf_executor +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/optimize.cc b/tensorflow/compiler/mlir/tensorflow/transforms/optimize.cc index 72775d078f9..5e0e961cc46 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/optimize.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/optimize.cc @@ -14,12 +14,12 @@ limitations under the License. ==============================================================================*/ #include +#include "mlir/Dialect/StandardOps/Ops.h" // TF:local_config_mlir #include "mlir/IR/Attributes.h" // TF:local_config_mlir #include "mlir/IR/Builders.h" // TF:local_config_mlir #include "mlir/IR/Operation.h" // TF:local_config_mlir #include "mlir/IR/PatternMatch.h" // TF:local_config_mlir #include "mlir/Pass/Pass.h" // TF:local_config_mlir -#include "mlir/StandardOps/Ops.h" // TF:local_config_mlir #include "tensorflow/compiler/mlir/lite/utils/validators.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" @@ -35,13 +35,15 @@ struct TFOptimizePass : public FunctionPass { OwningRewritePatternList patterns; auto func = getFunction(); populateWithGenerated(&getContext(), &patterns); - applyPatternsGreedily(func, std::move(patterns)); + applyPatternsGreedily(func, patterns); } }; } // namespace -FunctionPassBase* CreateTFOptimizePass() { return new TFOptimizePass(); } +std::unique_ptr CreateTFOptimizePass() { + return std::make_unique(); +} static PassRegistration pass("tf-optimize", "Optimizes TF."); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/optimize.td b/tensorflow/compiler/mlir/tensorflow/transforms/optimize.td index 7dcf7c3819f..49793f43cf3 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/optimize.td +++ b/tensorflow/compiler/mlir/tensorflow/transforms/optimize.td @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
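The reverse walk in graph_pruning.cc above is what makes a single pass sufficient: users always appear after their producers in the block, so by the time a producer is visited, every dead consumer has already been erased and has released its use of that producer. The toy sketch below illustrates that invariant with explicit use counts; Node, PruneDead, and the index-based encoding are illustrative only, and the real pass additionally special-cases NextIteration source/sink pairs and never erases the fetch.

#include <vector>

// Toy node: producers are listed by index (producers precede consumers), and
// use_count tracks how many not-yet-erased nodes still consume its results.
struct Node {
  std::vector<int> producers;
  int use_count = 0;
  bool erased = false;
};

// Reverse walk: a node with no remaining users is dead; erasing it releases
// its producers' use counts before those producers are themselves visited.
void PruneDead(std::vector<Node>& graph, int fetch_index) {
  for (int i = static_cast<int>(graph.size()) - 1; i >= 0; --i) {
    if (i == fetch_index) continue;
    if (graph[i].use_count != 0) continue;
    graph[i].erased = true;
    for (int p : graph[i].producers) --graph[p].use_count;
  }
}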
==============================================================================*/ include "mlir/IR/OpBase.td" -include "mlir/StandardOps/Ops.td" +include "mlir/Dialect/StandardOps/Ops.td" include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td" def IsDataFormatNHWC : ConstantAttr; @@ -21,6 +21,7 @@ def BroadcastableElements : Constraint>; def F32ElementsAttr : ElementsAttrBase< CPred<"$_self.cast().getType().getElementType().isF32()">, "float constant tensor">; +def DefinedByConv2D : Constraint($0->getDefiningOp())">>; // If we see a Conv2D op followed by Mul, then multiply the filter // with the value in Mul. @@ -41,3 +42,40 @@ def FuseMulAndConv2D : Pat<(TF_MulOp (TF_Conv2DOp $input, $padding, $explicit_padding, $data_format, $dilations), [(BroadcastableElements $filter, $value)]>; + +// This rule does the following pattern match and rewrite: +// +// input bias input value bias value +// | / => \ / \ / +// BiasAdd value Mul Mul +// \ / \ / +// Mul BiasAdd +// This is to enable the FuseMulAndConv2D pattern. +def PassthroughMulAndBiasAdd : + Pat<(TF_MulOp + (TF_BiasAddOp $input, + (ConstantOp F32ElementsAttr:$bias), IsDataFormatNHWC:$same_format), + (ConstantOp F32ElementsAttr:$value)), + (TF_BiasAddOp + (TF_MulOp $input, (ConstantOp $value)), + (TF_MulOp (ConstantOp $bias), (ConstantOp $value)), + $same_format), + [(DefinedByConv2D $input)]>; + + +// This rule does the following pattern match and rewrite: +// +// input bias input value bias value +// | / => \ / \ / +// AddV2 value Mul Mul +// \ / \ / +// Mul AddV2 +// This is to enable the FuseMulAndConv2D pattern. +def PassthroughMulAndAddV2 : + Pat<(TF_MulOp + (TF_AddV2Op $input, (ConstantOp F32ElementsAttr:$bias)), + (ConstantOp F32ElementsAttr:$value)), + (TF_AddV2Op + (TF_MulOp $input, (ConstantOp $value)), + (TF_MulOp (ConstantOp $bias), (ConstantOp $value))), + [(DefinedByConv2D $input)]>; diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h index 1202d4d432c..e66fd89eb8b 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h +++ b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h @@ -22,19 +22,49 @@ namespace mlir { namespace TF { // Transforms functional control flow operations in the standard TensorFlow // dialect to MLIR Control Flow Graph (CFG) form. -FunctionPassBase *CreateTFFunctionalControlFlowToCFG(); +std::unique_ptr CreateTFFunctionalControlFlowToCFG(); // Optimizes Tensorflow graph. -FunctionPassBase *CreateTFOptimizePass(); +std::unique_ptr CreateTFOptimizePass(); } // namespace TF namespace TFControlFlow { // Raises from the "TensorFlow Control Flow" dialect to the standard TensorFlow // dialect. -FunctionPassBase *CreateRaiseTFControlFlowPass(); +std::unique_ptr CreateRaiseTFControlFlowPass(); } // namespace TFControlFlow + +namespace tf_executor { +class GraphOp; + +// Create a pass to merge IslandOps from TFExecutor dialect. +std::unique_ptr CreateTFExecutorIslandCoarseningPass(); + +// Create a pass to prune tf_executor.graph from dead nodes. +FunctionPassBase* CreateTFExecutorGraphPruningPass(); + +// Prune a tf_executor.graph operation from dead nodes. +void prune_graph(GraphOp graph); + +} // namespace tf_executor + +namespace TFDevice { +// Creates a pass that forms clusters from instructions that are assigned to +// same device. +std::unique_ptr CreateClusterFormationPass(); + +// Creates a pass that outlines regions of tf_device.launch operations. 
+std::unique_ptr CreateClusterOutliningPass(); +} // namespace TFDevice + +namespace TFTPU { +// Creates a pass that rewrites `tf_device.launch_func` on TPUs into TPU runtime +// ops +std::unique_ptr CreateTPURewritePass(); +} // namespace TFTPU + } // namespace mlir #endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_PASSES_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/raise_control_flow.cc b/tensorflow/compiler/mlir/tensorflow/transforms/raise_control_flow.cc index 3e058127fe2..69bfd75e1e6 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/raise_control_flow.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/raise_control_flow.cc @@ -145,8 +145,8 @@ void RaiseTFControlFlow::rewriteOps() { } // namespace -FunctionPassBase *CreateRaiseTFControlFlowPass() { - return new RaiseTFControlFlow(); +std::unique_ptr CreateRaiseTFControlFlowPass() { + return std::make_unique(); } static PassRegistration pass( diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tf_graph_optimization_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tf_graph_optimization_pass.cc index 60f7ed35a0b..c5f21fa3029 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tf_graph_optimization_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tf_graph_optimization_pass.cc @@ -19,7 +19,7 @@ limitations under the License. #include "mlir/IR/Location.h" // TF:local_config_mlir #include "mlir/Pass/Pass.h" // TF:local_config_mlir #include "tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.h" -#include "tensorflow/compiler/mlir/tensorflow/translate/import_graphdef.h" +#include "tensorflow/compiler/mlir/tensorflow/translate/import_model.h" #include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" #include "tensorflow/core/common_runtime/optimization_registry.h" #include "tensorflow/core/framework/function.h" diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc new file mode 100644 index 00000000000..84d2690f787 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc @@ -0,0 +1,275 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
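The PassthroughMulAndBiasAdd and PassthroughMulAndAddV2 patterns added to optimize.td above are sound because multiplying by a constant distributes over the addition performed by BiasAdd/AddV2: (x + b) * c == x * c + b * c, which is what lets the Mul sink below the add and later fuse into the convolution. A scalar spot-check of that identity (the real patterns operate on f32 constant tensors, where rounding can make the two sides differ slightly in general):

#include <cassert>

int main() {
  const float x = 2.5f, b = 0.75f, c = 4.0f;
  const float before = (x + b) * c;   // Mul(BiasAdd(input, bias), value)
  const float after = x * c + b * c;  // BiasAdd(Mul(input, value), Mul(bias, value))
  assert(before == after);            // exact for these values: 13.0f on both sides
  return 0;
}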
+==============================================================================*/ + +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/raw_ostream.h" +#include "mlir/IR/Attributes.h" // TF:local_config_mlir +#include "mlir/IR/Builders.h" // TF:local_config_mlir +#include "mlir/IR/Module.h" // TF:local_config_mlir +#include "mlir/IR/Operation.h" // TF:local_config_mlir +#include "mlir/Pass/Pass.h" // TF:local_config_mlir +#include "mlir/Pass/PassRegistry.h" // TF:local_config_mlir +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" + +namespace mlir { +namespace TFTPU { + +// Rewrites `tf_device.launch_func` operations assigned to TPU into actual TPU +// jit-compile runtime ops. +// +// For example: +// %1 = "tf_device.launch_func"(%0) {_tpu_replicate = "cluster", func = +// @tpu_func} +// %2 = "tf.SomeOp"(%1) +// +// Would become following ops (unimportant attributes, types are omitted): +// %1 = "tf.Shape"(%0) +// %2:2 = "tf.MLIRCompileToTPU"(%1) {module = ""} +// "tf.TPUCompileSucceededAssert"(%2#0) +// %3 = "tf.TPUExecute"(%0, %2#1) +// %4 = "tf.SomeOp"(%3) + +namespace { +struct TPURewritePass : public ModulePass { + void runOnModule() override; +}; + +// Recursively visits all attributes of `op` to find any Attribute of type +// `SymbolRefAttr`. +llvm::SmallVector GetAllSymbolRefAttrs(Operation* op) { + llvm::SmallVector symbol_ref_attrs; + + llvm::SmallVector worklist; + for (auto named_attr : op->getAttrs()) { + worklist.push_back(named_attr.second); + } + + while (!worklist.empty()) { + Attribute attr = worklist.pop_back_val(); + + if (SymbolRefAttr symbol_ref_attr = attr.dyn_cast()) { + // Found a SymbolRefAttr, add it to result list. + symbol_ref_attrs.push_back(symbol_ref_attr); + } else if (ArrayAttr array_attr = attr.dyn_cast()) { + // Found an ArrayAttr, add its nested Attributes to worklist for further + // inspection. + worklist.append(array_attr.begin(), array_attr.end()); + } else if (DictionaryAttr dict_attr = attr.dyn_cast()) { + // Found a DictionaryAttr, add its nested value Attributes to worklist for + // further inspection. + for (NamedAttribute named_attr : dict_attr.getValue()) { + worklist.push_back(named_attr.second); + } + } + } + + return symbol_ref_attrs; +} + +// Creates a new self-contained module that contains `entry_func` and all +// referenced functions in `entry_func`. entry_func is renamed to "main". +// Return value is serialized text formate of newly-created module. +std::string EncapsulateFuncAndSerialize(FuncOp entry_func) { + ModuleOp module = entry_func.getParentOfType(); + llvm::SmallVector referenced({entry_func}); + + // Create a new module to hold func and all referenced functions. + OwningModuleRef module_for_func = + ModuleOp::create(mlir::UnknownLoc::get(entry_func.getContext())); + ModuleManager module_manager(module_for_func.get()); + + while (!referenced.empty()) { + auto func = referenced.pop_back_val(); + + // Skip functions that have already been cloned into new module. + if (module_manager.lookupSymbol(func.getName())) continue; + + // Find any SymbolRefAttr in func that maps to a FuncOp. We need to clone + // all found FuncOps to new_module to make sure new_module is + // self-contained. 
+ func.walk([&](Operation* op) { + for (auto symbol_ref_attr : GetAllSymbolRefAttrs(op)) { + FuncOp referenced_func = + module.lookupSymbol(symbol_ref_attr.getValue()); + + // Skip Symbols that do not map to a function. + if (!referenced_func) continue; + + referenced.emplace_back(referenced_func); + } + }); + + auto clone = func.clone(); + if (clone.getName() == entry_func.getName()) { + // We can simply change name of TPU program's main function because there + // should be no other reference to it. + clone.setName("main"); + } + module_manager.insert(clone); + } + + // Serialize module and return. + std::string txt_module; + { + llvm::raw_string_ostream os(txt_module); + module_for_func.get().print(os); + } + return txt_module; +} + +// Create a `tf.MLIRCompileToTPU` that contains a MLIR module that is +// functionally equivalent to the function referenced by launch_func. +Operation* BuildCompileOp(tf_device::LaunchFuncOp launch_func, + OpBuilder* builder) { + // TODO(b/139377366): Use tf_tpu.compile build method when it is defined. + OperationState compile_op_state(launch_func.getLoc(), "tf.MLIRCompileToTPU"); + + // Build a shape op for each input to launch_func. + // TODO(b/139377366): When shape inference is ready, we can use compile time + // shape inference to get inputs that have static shapes and only use shape + // ops for the rest. + llvm::SmallVector compile_op_operands; + compile_op_operands.reserve(launch_func.getNumOperands()); + + for (Value* v : launch_func.getOperands()) { + auto shape_op = builder->create( + launch_func.getLoc(), + builder->getTensorType({-1}, builder->getIntegerType(64)), v); + compile_op_operands.emplace_back(shape_op.getResult()); + } + compile_op_state.addOperands(compile_op_operands); + + SymbolRefAttr func_attr = launch_func.getAttrOfType("func"); + if (!func_attr) { + launch_func.emitOpError("does not have `func` attribute"); + return nullptr; + } + FuncOp func = launch_func.getParentOfType().lookupSymbol( + func_attr.getValue()); + + std::string txt_module = EncapsulateFuncAndSerialize(func); + compile_op_state.addAttribute("module", builder->getStringAttr(txt_module)); + + // Copy all launch_func attributes other than `func`. + for (auto attr : launch_func.getAttrs()) { + if (attr.first == "func") continue; + compile_op_state.attributes.emplace_back(attr); + } + + // Result #0 is a string indicating whether compilation is successful or not. + compile_op_state.addTypes( + builder->getTensorType({}, builder->getType())); + + // Result #1 is key to look up executable binary in compilation cache. + compile_op_state.addTypes( + builder->getTensorType({}, builder->getType())); + + return builder->createOperation(compile_op_state); +} + +// Creates a `tf.TPUExecute` op that executes TPU program generated by +// `compile_op`. +Operation* BuildExecuteOp(Operation* compile_op, + tf_device::LaunchFuncOp launch_func, + OpBuilder* builder) { + // TODO(b/139377366): Use tf.TPUExecute build method when it is defined. + OperationState execute_op_state(launch_func.getLoc(), "tf.TPUExecute"); + + // TPUExecute inherits all launch_func inputs. + llvm::SmallVector tensor_inputs(launch_func.getOperands()); + execute_op_state.addOperands(tensor_inputs); + + // TODO(b/139377366): Need to snapshot all resource variable inputs in + // follow-up CLs. + + // Set Targs of TPUExecute according to launch_func input types. 
+ llvm::SmallVector tensor_input_types_attrs; + tensor_input_types_attrs.reserve(tensor_inputs.size()); + for (Value* v : tensor_inputs) { + tensor_input_types_attrs.emplace_back(builder->getTypeAttr(v->getType())); + } + execute_op_state.addAttribute( + "Targs", builder->getArrayAttr(tensor_input_types_attrs)); + + // TPUExecute takes an additional input for compilation cache key. + execute_op_state.addOperands(compile_op->getResult(1)); + + // Set Tresults of TPUExecute according to launch_func results types. + llvm::SmallVector output_types_attrs; + output_types_attrs.reserve(launch_func.getNumResults()); + for (Value* v : launch_func.getResults()) { + output_types_attrs.emplace_back(builder->getTypeAttr(v->getType())); + } + execute_op_state.addAttribute("Tresults", + builder->getArrayAttr(output_types_attrs)); + + // TPUExecute has same output types as launch_func. + llvm::SmallVector output_types(launch_func.getResultTypes()); + execute_op_state.addTypes(output_types); + + return builder->createOperation(execute_op_state); +} + +// Creates a `tf.TPUCompileSucceededAssert` operation that parses compilation +// status of `compile_op` to check whether compilation is successful. +void BuildTPUCompileSucceededAssertOp(Operation* compile_op, + OpBuilder* builder) { + OperationState assert_op_state(compile_op->getLoc(), + "tf.TPUCompileSucceededAssert"); + assert_op_state.addOperands(compile_op->getResult(0)); + builder->createOperation(assert_op_state); +} + +// Rewrites a `tf_device.launch_func` operation into a set of TPU Runtime +// Operations that jit-compiles and executes function in `tf_device.launch_func` +// on TPU. +void Rewrite(tf_device::LaunchFuncOp launch_func, OpBuilder* builder) { + builder->setInsertionPoint(launch_func); + Operation* compile_op = BuildCompileOp(launch_func, builder); + BuildTPUCompileSucceededAssertOp(compile_op, builder); + // TODO(ycao): Right now we only support single-core case. The right thing to + // do is to read from launch_func attributes to determine how many execute + // ops to build. + Operation* execute_op = BuildExecuteOp(compile_op, launch_func, builder); + launch_func.replaceAllUsesWith(execute_op); + launch_func.erase(); +} + +void TPURewritePass::runOnModule() { + OpBuilder builder(&getContext()); + getModule().walk([&](tf_device::LaunchFuncOp op) { + // Skip non-tpu device launch_func. + if (!op.getAttrOfType("_tpu_replicate")) return; + Rewrite(op, &builder); + }); + + // TODO(b/139377366): Remove functions that are no longer needed. +} + +} // namespace + +std::unique_ptr CreateTPURewritePass() { + return std::make_unique(); +} + +static PassRegistration pass( + "tf-tpu-rewrite", + "Rewriting `tf_device.launch_func` on TPUs into TPU runtime ops"); + +} // namespace TFTPU +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/translate/control_to_executor_dialect.cc b/tensorflow/compiler/mlir/tensorflow/translate/control_to_executor_dialect.cc index 4d9b3ca7ab7..1b48d92171e 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/control_to_executor_dialect.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/control_to_executor_dialect.cc @@ -22,12 +22,12 @@ limitations under the License. 
#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Sequence.h" #include "llvm/Support/Debug.h" +#include "mlir/Dialect/StandardOps/Ops.h" // TF:local_config_mlir #include "mlir/IR/Builders.h" // TF:local_config_mlir #include "mlir/IR/Operation.h" // TF:local_config_mlir #include "mlir/IR/Value.h" // TF:local_config_mlir #include "mlir/Pass/Pass.h" // TF:local_config_mlir #include "mlir/Pass/PassRegistry.h" // TF:local_config_mlir -#include "mlir/StandardOps/Ops.h" // TF:local_config_mlir #include "mlir/Support/LLVM.h" // TF:local_config_mlir #include "tensorflow/compiler/mlir/tensorflow/ir/control_flow_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" @@ -147,7 +147,7 @@ void ControlToExecutorDialectConversion::runOnFunction() { if (op.getName().getStringRef() == "_tf.Switch") { replacement = builder.create( loc, types, operands, ArrayRef{}); - } else if (op.getName().getStringRef() == "_tf.SwitchN") { + } else if (op.getName().getStringRef() == "_tf._SwitchN") { replacement = builder.create( loc, types, operands, ArrayRef{}); } else if (op.getName().getStringRef() == "_tf.Merge") { @@ -155,7 +155,7 @@ void ControlToExecutorDialectConversion::runOnFunction() { loc, types, operands, ArrayRef{}); } else if (op.getName().getStringRef() == "_tf.NextIteration.source") { replacement = builder.create( - loc, op.getResult(0)->getType(), operands); + loc, op.getResult(0)->getType()); // Record a mapping of the name to the nextiteration.source so that when // we convert the sink we can get the token. StringAttr frame = op.getAttrOfType("name"); diff --git a/tensorflow/compiler/mlir/tensorflow/translate/derived_attr_populator_gen.cc b/tensorflow/compiler/mlir/tensorflow/translate/derived_attr_populator_gen.cc index 0c265da11f2..2b076e3d5f2 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/derived_attr_populator_gen.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/derived_attr_populator_gen.cc @@ -16,7 +16,7 @@ limitations under the License. #include "llvm/ADT/StringExtras.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/FormatVariadic.h" -#include "llvm/Support/PrettyStackTrace.h" +#include "llvm/Support/InitLLVM.h" #include "llvm/Support/Signals.h" #include "llvm/Support/raw_ostream.h" #include "llvm/TableGen/Error.h" @@ -129,10 +129,7 @@ static bool DerivedAttrWritersMain(raw_ostream &os, RecordKeeper &records) { } int main(int argc, char **argv) { - llvm::sys::PrintStackTraceOnErrorSignal(argv[0]); - llvm::PrettyStackTraceProgram X(argc, argv); - - llvm::llvm_shutdown_obj Y; + llvm::InitLLVM y(argc, argv); llvm::cl::ParseCommandLineOptions(argc, argv); return TableGenMain(argv[0], &DerivedAttrWritersMain); } diff --git a/tensorflow/compiler/mlir/tensorflow/translate/executor_to_control_dialect.cc b/tensorflow/compiler/mlir/tensorflow/translate/executor_to_control_dialect.cc new file mode 100644 index 00000000000..2d906d84db3 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/translate/executor_to_control_dialect.cc @@ -0,0 +1,210 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This transformation pass transforms from TF executor dialect to MLIR TF +// contol dialect. + +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Sequence.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "mlir/Dialect/StandardOps/Ops.h" // TF:local_config_mlir +#include "mlir/IR/Builders.h" // TF:local_config_mlir +#include "mlir/IR/Operation.h" // TF:local_config_mlir +#include "mlir/IR/Value.h" // TF:local_config_mlir +#include "mlir/Pass/Pass.h" // TF:local_config_mlir +#include "mlir/Pass/PassRegistry.h" // TF:local_config_mlir +#include "mlir/Support/LLVM.h" // TF:local_config_mlir +#include "tensorflow/compiler/mlir/tensorflow/ir/control_flow_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" + +#define DEBUG_TYPE "tf-executor-to-ctl" + +namespace mlir { + +namespace { +struct ExecutorToControlDialectConversion + : public FunctionPass { + void runOnFunction() override; +}; +} // end anonymous namespace + +static bool HasSingleGraph(FuncOp function) { + // We expect the function has only one region with one block, + if (function.getBlocks().size() != 1) return false; + auto &block = function.front(); + // and the block contains two ops, + if (std::next(block.begin()) == block.end()) return false; + // one GraphOp, + if (!isa(block.begin())) return false; + // followed by a terminator. + if (!std::next(block.begin())->isKnownTerminator()) return false; + return true; +} + +void ExecutorToControlDialectConversion::runOnFunction() { + if (!HasSingleGraph(getFunction())) { + LLVM_DEBUG(llvm::dbgs() + << "Expect a Function with a single block and a single graph op," + " skip tf_executor dialect conversion\n"); + return; + } + Type control_type = TFControlFlow::TFControlType::get(&getContext()); + + Block &body = getFunction().front(); + OpBuilder builder(&body, body.begin()); + auto graph = cast(body.front()); + SmallString<64> new_op_name; + for (auto &op : llvm::make_early_inc_range(graph.GetBody())) { + LLVM_DEBUG(llvm::dbgs() << "Process: " << op.getName() << "\n"); + if (auto fetch = dyn_cast(op)) { + // Replace all the operands of the fetch op with the uses of the graph + // results, the graph op will then be removed. + for (auto ops_and_ret_vals : + llvm::zip(graph.getResults(), fetch.getOperands())) + std::get<0>(ops_and_ret_vals) + ->replaceAllUsesWith(std::get<1>(ops_and_ret_vals)); + continue; + } + if (auto island = dyn_cast(op)) { + Value *ctl_sequence = nullptr; + Operation *last_replaced_op = nullptr; + for (Operation &wrapped_op : island.GetBody()) { + LLVM_DEBUG(llvm::dbgs() + << " In island: " << wrapped_op.getName() << "\n"); + if (isa(wrapped_op)) { + for (auto ops_and_ret_vals : + llvm::zip(island.getResults(), wrapped_op.getOperands())) + std::get<0>(ops_and_ret_vals) + ->replaceAllUsesWith(std::get<1>(ops_and_ret_vals)); + break; + } + // Add a leading _ off the name. + new_op_name = "_"; + new_op_name += wrapped_op.getName().getStringRef(); + OperationState state(wrapped_op.getLoc(), new_op_name); + + // Add an operand for each non-control input we find. 
Collect control + // values separately to add them to the island operands + state.operands.append(wrapped_op.getOperands().begin(), + wrapped_op.getOperands().end()); + + // Chain operations through a control dependency, except for the first + // operations in the sequence that carry the control dependencies held + // by the island itself. + if (ctl_sequence) { + state.operands.push_back(ctl_sequence); + } else { + for (Value *ctl_operand : island.getOperands()) + state.operands.push_back(ctl_operand); + } + + // Add a result type for each result + state.types.append(wrapped_op.getResultTypes().begin(), + wrapped_op.getResultTypes().end()); + state.types.push_back(control_type); + + // Create the replacement operation. + auto *replacement = builder.createOperation(state); + replacement->setAttrs(wrapped_op.getAttrList()); + + for (auto ops_and_ret_vals : + llvm::zip(wrapped_op.getResults(), replacement->getResults())) + std::get<0>(ops_and_ret_vals) + ->replaceAllUsesWith(std::get<1>(ops_and_ret_vals)); + + ctl_sequence = replacement->getResult(replacement->getNumResults() - 1); + last_replaced_op = replacement; + } + for (Value *island_ctl : island.getResults()) + island_ctl->replaceAllUsesWith( + last_replaced_op->getResult(last_replaced_op->getNumResults() - 1)); + op.erase(); + continue; + } + + new_op_name.clear(); + if (isa(op)) { + new_op_name = "_tf.Switch"; + } else if (isa(op)) { + new_op_name = "_tf._SwitchN"; + } else if (isa(op)) { + new_op_name = "_tf.Merge"; + } else if (isa(op)) { + new_op_name = "_tf.NextIteration.source"; + } else if (isa(op)) { + new_op_name = "_tf.NextIteration.sink"; + } else if (isa(op)) { + new_op_name = "_tf.LoopCond"; + } else if (isa(op)) { + new_op_name = "_tf.Enter"; + } else if (isa(op)) { + new_op_name = "_tf.Exit"; + } else if (isa(op)) { + new_op_name = "_tf.ControlTrigger"; + } else { + op.emitOpError() << "unhandled op in tf_executor to _tf conversion"; + return signalPassFailure(); + } + OperationState state(op.getLoc(), new_op_name); + // Token results are dropped when we process the source op, the operand + // becomes nullptr by the time we process the sink op, filter it out here. + auto non_null_operands = + llvm::make_filter_range(op.getOperands(), [](Value *v) { return v; }); + state.operands.append(non_null_operands.begin(), non_null_operands.end()); + for (Type result_type : op.getResultTypes()) { + // Filter out TokenType, they don't exist in the control dialect. + if (result_type.isa()) continue; + if (!result_type.isa()) + state.types.push_back(result_type); + else + state.types.push_back(control_type); + } + // The control dialect has a control result for the sink operation. + if (isa(op)) + state.types.push_back(control_type); + + // Create the replacement operation. 
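// NOTE (editorial sketch, not part of the original patch): assuming an island
// that wraps a single "tf.Add", the OperationState assembled above is expected
// to materialize a control-dialect op along these lines:
//
//   %sum, %ctl = "_tf.Add"(%a, %b, %incoming_ctl)
//       : (tensor<f32>, tensor<f32>, !_tf.control) -> (tensor<f32>, !_tf.control)
//
// i.e. control values are appended after the data operands and one
// !_tf.control result is appended after the data results.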
+ auto *replacement = builder.createOperation(state); + replacement->setAttrs(op.getAttrList()); + + if (auto next_iteration = + dyn_cast(op)) { + next_iteration.output()->replaceAllUsesWith(replacement->getResult(0)); + next_iteration.token()->dropAllUses(); + next_iteration.control()->replaceAllUsesWith(replacement->getResult(1)); + } else { + for (auto ops_and_ret_vals : + llvm::zip(op.getResults(), replacement->getResults())) + std::get<0>(ops_and_ret_vals) + ->replaceAllUsesWith(std::get<1>(ops_and_ret_vals)); + } + op.erase(); + } + graph.erase(); +} + +std::unique_ptr CreateTFExecutorToControlDialectConversion() { + return std::make_unique(); +} + +} // namespace mlir + +static mlir::PassRegistration pass( + "tf-executor-to-control-conversion", + "Convert from TF executor dialect to TF control dialect"); diff --git a/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc b/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc index 3d98cdf4ea4..9868c4a4ac5 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc @@ -27,6 +27,7 @@ limitations under the License. #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/Casting.h" +#include "mlir/Dialect/StandardOps/Ops.h" // TF:local_config_mlir #include "mlir/IR/Attributes.h" // TF:local_config_mlir #include "mlir/IR/Builders.h" // TF:local_config_mlir #include "mlir/IR/Function.h" // TF:local_config_mlir @@ -34,8 +35,10 @@ limitations under the License. #include "mlir/IR/Module.h" // TF:local_config_mlir #include "mlir/IR/Operation.h" // TF:local_config_mlir #include "mlir/IR/Types.h" // TF:local_config_mlir -#include "mlir/StandardOps/Ops.h" // TF:local_config_mlir +#include "mlir/Pass/Pass.h" // TF:local_config_mlir +#include "mlir/Pass/PassManager.h" // TF:local_config_mlir #include "mlir/Support/DebugStringHelper.h" // TF:local_config_mlir +#include "mlir/Support/LogicalResult.h" // TF:local_config_mlir #include "tensorflow/compiler/mlir/tensorflow/ir/control_flow_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/translate/export_tf_dialect_op.h" @@ -55,6 +58,11 @@ limitations under the License. #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" +namespace mlir { +/// Create a pass to convert from the TFExecutor to the TF control dialect. +std::unique_ptr CreateTFExecutorToControlDialectConversion(); +} // namespace mlir + namespace tensorflow { using llvm::cast; using llvm::dyn_cast; @@ -201,10 +209,8 @@ std::string Exporter::UniqueName(mlir::Operation* op) { StatusOr> Exporter::GetArgumentNode( mlir::BlockArgument* arg, unsigned index) { auto node_def = absl::make_unique(); - node_def->set_name(UniqueName(arg->getContainingRegion() - ->getParentOfType() - .getName() - .str())); + node_def->set_name(UniqueName( + arg->getParentRegion()->getParentOfType().getName().str())); node_def->set_op(FunctionLibraryDefinition::kArgOp); DataType dtype; TF_RETURN_IF_ERROR(ConvertToDataType( @@ -294,13 +300,17 @@ Status Exporter::AddInstructionNode(mlir::Operation* inst) { // check is too conservative given we could use a OpDef. 
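// NOTE (editorial sketch, not part of the original patch; schematic only): a
// control-dialect op such as
//   %0:2 = "_tf.Add"(%a, %b) {name = "add", ...}
// is expected to export back to a NodeDef along the lines of
//   name: "add"  op: "Add"  input: "a"  input: "b"
// where "name" (and "device") populate NodeDef fields rather than attrs.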
if (auto abstract_op = inst->getAbstractOperation()) { if (&abstract_op->dialect == tf_dialect_) { - TF_ASSIGN_OR_RETURN(node_def, ConvertTFDialectOpToNodeDef(inst, name)); + TF_ASSIGN_OR_RETURN( + node_def, ConvertTFDialectOpToNodeDef( + inst, name, /*ignore_unregistered_attrs=*/false)); } } // Convert TF control flow dialect ops. if (!node_def) { - TF_ASSIGN_OR_RETURN(node_def, - GetOperationNodeDef(inst, name.c_str(), getTFOpName)); + absl::flat_hash_set attrs_to_ignore; + TF_ASSIGN_OR_RETURN( + node_def, GetOperationNodeDef(attrs_to_ignore, inst, name.c_str(), + getTFOpName)); } Node* node = graph_->AddNode(*node_def, &status); TF_RETURN_IF_ERROR(status); @@ -326,7 +336,7 @@ Status Exporter::AddArgumentNode(mlir::BlockArgument* arg, unsigned index) { // is an input node. We recover the original input node and skip adding the // argument node. The new input node will be handled as normal in the // following steps. - if (arg->getContainingRegion()->getParentOfType().getName() == + if (arg->getParentRegion()->getParentOfType().getName() == "main") { if (!arg->hasOneUse()) { return errors::FailedPrecondition( @@ -556,7 +566,8 @@ Status Exporter::ConvertLibFunction(const ExporterConfigs& configs, // Ignore the gradient and is_stateful attribute on the function as they have // been handled above. - absl::flat_hash_set attrs_to_ignore = {grad_string, stateful_string}; + absl::flat_hash_set attrs_to_ignore = { + grad_string.data(), stateful_string.data()}; llvm::SmallVector funcAttrs( function.getDialectAttrs()); TF_RETURN_IF_ERROR( @@ -604,6 +615,12 @@ Status Exporter::Convert(mlir::ModuleOp module, const ExporterConfigs& configs, Status ConvertMlirToGraph(mlir::ModuleOp module, const ExporterConfigs& confs, std::unique_ptr* graph, FunctionLibraryDefinition* flib_def) { + mlir::PassManager pass_manager; + pass_manager.addPass(mlir::CreateTFExecutorToControlDialectConversion()); + if (mlir::failed(pass_manager.run(module))) { + return errors::FailedPrecondition( + "Failed to convert TFExecutor Dialect to Control Dialect."); + } return Exporter::Convert(module, confs, graph, flib_def); } diff --git a/tensorflow/compiler/mlir/tensorflow/translate/export_tf_dialect_op.cc b/tensorflow/compiler/mlir/tensorflow/translate/export_tf_dialect_op.cc index c2caf3f18f9..993a44452ea 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/export_tf_dialect_op.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/export_tf_dialect_op.cc @@ -15,6 +15,9 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/translate/export_tf_dialect_op.h" +#include "absl/container/flat_hash_set.h" +#include "absl/strings/string_view.h" +#include "llvm/ADT/StringSet.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h" #include "tensorflow/compiler/mlir/tensorflow/utils/export_utils.h" @@ -65,7 +68,7 @@ Status SetAttribute(absl::string_view name, ContainerT types, // definitions and isn't a header file. #include "tensorflow/compiler/mlir/tensorflow/translate/derived_attr_populator.inc" -static StatusOr getTensorFlowOpName(llvm::StringRef op_name) { +StatusOr getTensorFlowOpName(llvm::StringRef op_name) { if (!op_name.consume_front("tf.")) { return errors::FailedPrecondition("op name not prefixed with 'tf.': " + op_name.str()); @@ -73,12 +76,54 @@ static StatusOr getTensorFlowOpName(llvm::StringRef op_name) { return op_name.str(); } +// Collect all the unregistered attributes for an TF dialect operation. 
+// Attributes "name" and "device" are not included because they are not part +// of an TF op attributes. +Status GetUnregisteredAttrs( + mlir::Operation* inst, + absl::flat_hash_set* attrs_to_ignore) { + TF_ASSIGN_OR_RETURN(auto op_name, + getTensorFlowOpName(inst->getName().getStringRef())); + + const tensorflow::OpRegistrationData* op_reg_data; + auto status = tensorflow::OpRegistry::Global()->LookUp(op_name, &op_reg_data); + if (!status.ok()) { + // This is likely a function call node, so we should continue. + VLOG(1) << status.ToString(); + return Status::OK(); + } + + // Collect all the registered attributes. + llvm::DenseSet registered_attrs; + registered_attrs.insert("name"); + registered_attrs.insert("device"); + for (const auto& attr_def : op_reg_data->op_def.attr()) { + registered_attrs.insert(attr_def.name()); + } + // Attributes are not in the registered attributes set will be ignored. + for (auto& attr : inst->getAttrs()) { + auto attr_name = attr.first.c_str(); + if (registered_attrs.find(attr_name) == registered_attrs.end()) { + attrs_to_ignore->insert(attr_name); + } + } + return Status::OK(); +} + } // namespace StatusOr> ConvertTFDialectOpToNodeDef( - mlir::Operation* inst, llvm::StringRef name) { - TF_ASSIGN_OR_RETURN(auto node_def, - GetOperationNodeDef(inst, name, getTensorFlowOpName)); + mlir::Operation* inst, llvm::StringRef name, + bool ignore_unregistered_attrs) { + // The elements are owned by the MLIRContext. + absl::flat_hash_set attrs_to_ignore; + if (ignore_unregistered_attrs) { + TF_RETURN_IF_ERROR(GetUnregisteredAttrs(inst, &attrs_to_ignore)); + } + + TF_ASSIGN_OR_RETURN( + auto node_def, + GetOperationNodeDef(attrs_to_ignore, inst, name, getTensorFlowOpName)); // Use auto generated function to populate derived attribute. // diff --git a/tensorflow/compiler/mlir/tensorflow/translate/export_tf_dialect_op.h b/tensorflow/compiler/mlir/tensorflow/translate/export_tf_dialect_op.h index 6d32a318a30..26e84d631a0 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/export_tf_dialect_op.h +++ b/tensorflow/compiler/mlir/tensorflow/translate/export_tf_dialect_op.h @@ -24,9 +24,13 @@ limitations under the License. namespace tensorflow { // Converts an MLIR operation to TensorFlow NodeDef with given node name. This -// name should be unique to the graph it is being inserted to. +// name should be unique to the graph it is being inserted to. If the +// `ignore_unregistered_attrs` argument is set to true, the attributes which are +// not in the op registry will be ignored. Set it to true if the returned +// NodeDef will be excuted by the linked TF Eager runtime. stream_executor::port::StatusOr> -ConvertTFDialectOpToNodeDef(mlir::Operation* inst, llvm::StringRef name); +ConvertTFDialectOpToNodeDef(mlir::Operation* inst, llvm::StringRef name, + bool ignore_unregistered_attrs); } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/translate/import_graphdef.cc b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc similarity index 61% rename from tensorflow/compiler/mlir/tensorflow/translate/import_graphdef.cc rename to tensorflow/compiler/mlir/tensorflow/translate/import_model.cc index 2ac09e3540d..34cdc609164 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/import_graphdef.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "tensorflow/compiler/mlir/tensorflow/translate/import_graphdef.h" +#include "tensorflow/compiler/mlir/tensorflow/translate/import_model.h" + +#include #include "absl/algorithm/container.h" #include "absl/container/flat_hash_map.h" @@ -28,6 +30,7 @@ limitations under the License. #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" #include "llvm/Support/raw_ostream.h" +#include "mlir/Dialect/StandardOps/Ops.h" // TF:local_config_mlir #include "mlir/IR/Attributes.h" // TF:local_config_mlir #include "mlir/IR/Builders.h" // TF:local_config_mlir #include "mlir/IR/Function.h" // TF:local_config_mlir @@ -36,9 +39,9 @@ limitations under the License. #include "mlir/IR/MLIRContext.h" // TF:local_config_mlir #include "mlir/IR/Module.h" // TF:local_config_mlir #include "mlir/IR/Types.h" // TF:local_config_mlir -#include "mlir/StandardOps/Ops.h" // TF:local_config_mlir #include "tensorflow/compiler/jit/shape_inference_helpers.h" #include "tensorflow/compiler/mlir/tensorflow/ir/control_flow_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" #include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" @@ -49,8 +52,11 @@ limitations under the License. #include "tensorflow/core/common_runtime/function.h" #include "tensorflow/core/common_runtime/shape_refiner.h" #include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/framework/function.pb.h" #include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/graph_to_functiondef.h" #include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/node_def_util.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/shape_inference.h" #include "tensorflow/core/framework/tensor.pb.h" @@ -66,49 +72,37 @@ limitations under the License. #include "tensorflow/core/platform/types.h" #include "tensorflow/core/protobuf/graph_debug_info.pb.h" +static inline absl::string_view StringRefToView(llvm::StringRef ref) { + return {ref.data(), ref.size()}; +} + namespace tensorflow { using stream_executor::port::StatusOr; namespace { -// Stateful helper class to import a GraphDef into an MLIR Module. The nodes -// defined in the graph is converted to a function called "main". All the -// library function definitions are converted to MLIR functions in the module. -class Importer { - public: - // Main entry point: converts the given graph to an MLIR Module. - static StatusOr Convert( - mlir::MLIRContext* context, const Graph& graph, - const GraphDebugInfo& debug_info, - const FunctionLibraryDefinition& flib_def, const NodeSpecs& specs); - - private: - // Most types with subtypes have only one subtype. - using ElementSubtypes = llvm::SmallVector; - - explicit Importer( +// Stateful helper class to import a TensorFlow model into an MLIR Module. +// +// This is the base class that contains common utilties shared between the +// GraphDef importer and SavedModel importer. +// +// A subclass is expected to call `PrepareConvert` first to perform necessary +// preparation over the graph and also certain internal bookkeeping data. +// Afterwards the other protected methods can be called. 
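// NOTE (editorial sketch, not part of the original patch; it only re-uses the
// protected methods declared below): a subclass is expected to drive the
// conversion roughly as
//
//   TF_RETURN_IF_ERROR(PrepareConvert(graph));
//   // ... infer the mlir::FunctionType and collect arg/ret/control-ret nodes ...
//   TF_RETURN_IF_ERROR(Convert(func_name, func_type, arg_nodes, ret_nodes,
//                              control_ret_nodes, attrs));
//
// PrepareConvert must run first so that backedge removal and shape inference
// are done before any node is converted.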
+class ImporterBase { + protected: + explicit ImporterBase( const FunctionLibraryDefinition& flib, const GraphDebugInfo& debug_info, const NodeSpecs& specs, mlir::ModuleOp module, std::unordered_map* tf_name_to_mlir_name) - : module_(module), + : builder_(module.getContext()), + module_(module), context_(module.getContext()), tf_name_to_mlir_name_(tf_name_to_mlir_name), graph_flib_(flib), specs_(specs), debug_info_(debug_info) {} - // Prepares converting the graph to an MLIR module. This step removes the - // backedges of the graph, orders the nodes and infers the shapes. - Status PrepareConvert(const Graph& graph); - - // Returns the function signature of the main function of converted MLIR - // module, the input nodes and output nodes. The type and shape information - // for the function arguments are read from the specs_, but the type and shape - // information for the function returns are inferred by the shape_refiner_. - StatusOr InferMainFunctionType( - absl::InlinedVector* arg_nodes, - absl::InlinedVector* ret_nodes); - // Returns the inferred function signature of the given function body. Input // types are unranked tensor of the respective datatype in the function and // result types are inferred by the shape_refiner_. Result types need not be @@ -116,25 +110,54 @@ class Importer { // depends on an op with static output shape like tf.Const. StatusOr InferLibFunctionType(const FunctionBody& fbody); + // Extracts arg and ret nodes from FunctionBody. + void GetArgsAndRetsFromFunctionBody( + const FunctionBody& fbody, + absl::InlinedVector* arg_nodes, + absl::InlinedVector* ret_nodes, + absl::InlinedVector* control_ret_nodes); + + // Prepares converting the graph to an MLIR module. This step removes the + // backedges of the graph, orders the nodes and infers the shapes. + Status PrepareConvert(const Graph& graph); + // Converts the prepared graph to a Function and adds it to the module. A set // of nodes from the graph are given to converted to the arguments and returns // of the function. Status Convert(llvm::StringRef func_name, mlir::FunctionType func_type, const absl::InlinedVector& arg_nodes, const absl::InlinedVector& ret_nodes, + const absl::InlinedVector& control_ret_nodes, llvm::ArrayRef attrs); + // Finds out the function definition for the given function name from the + // graph and converts it to a function of the module. This method is called + // on demand because the graph flib_def does not provide an iterator + // interface. + Status ConvertLibFunction(llvm::StringRef func_name); + + // Returns the list of nodes in the graph. Nodes are presented in the reverse + // order of a post-order depth-first visit starting from the graph's source + // nodes. + llvm::ArrayRef GetOrderedNodes() const { return ordered_nodes_; } + + // Returns the inferred output type at index `idx` of the `node` in the + // context. + StatusOr InferOutputType(const Node& node, int idx, + mlir::Builder builder); + + private: + // Most types with subtypes have only one subtype. + using ElementSubtypes = llvm::SmallVector; + // Adds all the ordered_nodes to the shape refiner shape_refiner_. Then all // data type and shape information is maintained by the shape_refiner_. Status AddNodesToShapeRefiner(); - // Returns the inferred input type at index `idx` of the node in the context. - StatusOr InferInputType( - ExtendedInferenceContext* shape_context, int idx, mlir::Builder builder); - - // Returns the inferred output type at index `idx` of the node in the context. 
- StatusOr InferOutputType( - ExtendedInferenceContext* shape_context, int idx, mlir::Builder builder); + // Returns the inferred input type at index `idx` of the `node` in the + // context. + StatusOr InferInputType(const Node& node, int idx, + mlir::Builder builder); // Converts the inferred shape referred to by 'handle' in 'context', with // given element type, and returns an MLIR tensor type. @@ -157,7 +180,7 @@ class Importer { // Converts the tensor proto into an MLIR elements attribute. StatusOr ConvertTensorProto(const TensorProto& value) { - return ::tensorflow::ConvertTensorProto(value, builder_.get()); + return ::tensorflow::ConvertTensorProto(value, &builder_); } // Converts func name in graphdef to mlir::SymbolRefAttribute. @@ -176,6 +199,13 @@ class Importer { const std::string& base_name, const AttrValue& value, llvm::SmallVector* attributes); + // Helper to create either a tf_executor operation or a TF operation wrapped + // in an island. + mlir::Operation* createOperation( + const Node& node, llvm::StringRef op_name, + const mlir::OperationState& result, + const llvm::SmallVectorImpl& control_operands); + // Converts one NodeDef from the input GraphDef into an Operation and // inserts it into the MLIR module using builder_. Status ConvertNode(const Node& node); @@ -200,25 +230,15 @@ class Importer { Status AddBackedge(mlir::Operation* sink, mlir::Operation* dst, int dst_input); - // Gets the "source" of a NextIteration operation. If it doesn't exist, - // creates and inserts it to the front of the basic block. - mlir::Operation* GetOrCreateNextIterationSource(mlir::Operation* sink, - mlir::Operation* dst); - - // Finds out the function definition for the given function name from the - // graph and converts it to a function of the module. This method is called - // on demand because the graph flib_def does not provide an iterator - // interface. The consequence is that only the referred functions are added to - // the MLIR module. - Status ConvertLibFunction(const std::string& func_name); - // Adds the input arguments and return operation to the function. The // arguments are added as basic block argument. Also the argument types and // the id of the nodes from the input graph needs to be specified. Status ConvertFunctionArgAndRets( - mlir::Block* bb, llvm::ArrayRef arg_types, + mlir::Block* bb, mlir::tf_executor::GraphOp graph_op, + llvm::ArrayRef arg_types, const absl::InlinedVector& arg_nodes, - const absl::InlinedVector& ret_nodes); + const absl::InlinedVector& ret_nodes, + const absl::InlinedVector& control_ret_nodes); // Gets the location information of the given node. It uses the // "original_node_name" in the NodeDef to get the corresponding file location @@ -257,13 +277,12 @@ class Importer { // All nodes and version information about the (copied) imported graph. std::unique_ptr graph_; - const VersionDef* graph_versions_; std::vector ordered_nodes_; // Maps from a Node ID to a MLIR value. using NodeValueMap = absl::flat_hash_map; - std::unique_ptr builder_; + mlir::OpBuilder builder_; mlir::ModuleOp module_; mlir::MLIRContext* context_; std::unordered_map* tf_name_to_mlir_name_; @@ -274,11 +293,67 @@ class Importer { std::unique_ptr shape_refiner_; }; -// Adds the default attributes to each node def if they are missing from the -// GraphDef. -Status AddDefaultsToNodeDef(GraphDef* graph_def) { +// Returns true if the node with given name has a non primary output that is +// used by some other node as an input. 
Returns false if no outputs are in use +// or only the first output is in use. +bool HasNonPrimaryOutputInUse(const GraphDef& graph_def, + const std::string& node) { + for (const auto& node_def : graph_def.node()) { + for (const auto& input : node_def.input()) { + if (absl::StartsWith(input, node + ":") && input != node + ":0") { + return true; + } + } + } + return false; +} + +// Updates the given LegacyFedInput node with Placeholder node if it is one of +// the inputs. Returns an error if non primary output of the LegacyFedInput node +// is in use and therefore can not be replaced by the Placeholder node that only +// has a single output. +Status UpdateLegacyFedInputNode(const GraphDef& graph_def, + const NodeSpecs::InputArrays& inputs, + NodeDef* node) { + const std::string& node_name = node->name(); + auto it = inputs.find(node_name); + + // Node is not an input. + if (it == inputs.end()) return Status::OK(); + + if (HasNonPrimaryOutputInUse(graph_def, node_name)) { + return errors::InvalidArgument( + "LegacyFedInput node ", node->name(), + " has non primary output in use and can not be replaced with " + "Placeholder node"); + } + + // Update op name, drop inputs and set attributes required by the Placeholder + // op. + *node->mutable_op() = "Placeholder"; + node->clear_attr(); + node->clear_input(); + AddNodeAttr("dtype", it->second.imported_dtype, node); + AddNodeAttr("shape", it->second.shape, node); + return Status::OK(); +} + +// Preprocesses GraphDef before it can be converted to Graph by, +// - Adding the default attributes to each node def if they are missing from +// the GraphDef. +// - Replacing LegacyFedInput nodes with Placeholder nodes if +// convert_legacy_fed_inputs option is enabled. +Status PreprocessGraphDef(const NodeSpecs* specs, GraphDef* graph_def) { const tensorflow::OpRegistrationData* op_reg_data; for (auto& node_def : *graph_def->mutable_node()) { + // TODO(hinsu): Completely deprecate support for LegacyFedInput ops. One + // solution could be have a tool to let users upgrade old serialized graphs. + if (specs && specs->convert_legacy_fed_inputs && + node_def.op() == "LegacyFedInput") { + TF_RETURN_IF_ERROR( + UpdateLegacyFedInputNode(*graph_def, specs->inputs, &node_def)); + } + auto status = tensorflow::OpRegistry::Global()->LookUp(node_def.op(), &op_reg_data); if (!status.ok()) { @@ -291,7 +366,7 @@ Status AddDefaultsToNodeDef(GraphDef* graph_def) { return Status::OK(); } -Status Importer::RemoveBackedges(const Graph& graph) { +Status ImporterBase::RemoveBackedges(const Graph& graph) { // TODO(fengliuai): Converting to GraphDef and back is the easiest way to // clone a graph. // TODO(fengliuai): clone the graph without going to graph_def first. @@ -300,8 +375,8 @@ Status Importer::RemoveBackedges(const Graph& graph) { graph_ = absl::make_unique(graph.flib_def()); GraphConstructorOptions opts; opts.allow_internal_ops = true; - TF_RETURN_IF_ERROR( - ::tensorflow::ConvertGraphDefToGraph(opts, graph_def, graph_.get())); + TF_RETURN_IF_ERROR(::tensorflow::ConvertGraphDefToGraph( + opts, std::move(graph_def), graph_.get())); // Remove all the backedges. So the nodes can be added to the shape refiner. 
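// NOTE (editorial clarification, not part of the original patch): the only
// cycles in a TensorFlow graph come from while loops, where NextIteration
// feeds back into Merge:
//
//   Enter -> Merge -> ... -> NextIteration
//             ^                   |
//             +-------------------+   <- edge removed here
//
// Removing that edge turns the graph into a DAG so shape refinement can visit
// nodes in a topological order; AddBackedges() re-creates it after conversion.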
TF_RETURN_IF_ERROR(back_edge_helper_.Remove(graph_.get())); @@ -330,7 +405,7 @@ Status Importer::RemoveBackedges(const Graph& graph) { return Status::OK(); } -StatusOr Importer::ReplaceWithPlaceholderNode( +StatusOr ImporterBase::ReplaceWithPlaceholderNode( const TensorShapeProto& shape, DataType dtype, Node* input_node) { Node* placeholder_node; NodeBuilder builder(input_node->name(), "Placeholder"); @@ -351,7 +426,8 @@ StatusOr Importer::ReplaceWithPlaceholderNode( return placeholder_node; } -Status Importer::GetInputOutputNodes(std::unordered_set* nodes) { +Status ImporterBase::GetInputOutputNodes( + std::unordered_set* nodes) { auto node_name_map = graph_->BuildNodeNameIndex(); auto add_node = [&](const string& name) { auto it = node_name_map.find(name); @@ -375,9 +451,9 @@ Status Importer::GetInputOutputNodes(std::unordered_set* nodes) { } // TODO(fengliuai): Replace the iterative algorithm by an one pass propagation -Status Importer::AddNodesToShapeRefiner() { - shape_refiner_ = - absl::make_unique(*graph_versions_, graph_->op_registry()); +Status ImporterBase::AddNodesToShapeRefiner() { + shape_refiner_ = absl::make_unique(graph_->versions(), + graph_->op_registry()); // Some operations (for example "TPUExecute") don't have shape inference // function defined, so we should set this to false for adding nodes with // these types of operations. @@ -527,8 +603,11 @@ Status Importer::AddNodesToShapeRefiner() { return Status::OK(); } -StatusOr Importer::InferInputType( - ExtendedInferenceContext* shape_context, int idx, mlir::Builder builder) { +StatusOr ImporterBase::InferInputType(const Node& node, + int idx, + mlir::Builder builder) { + ExtendedInferenceContext* shape_context = + shape_refiner_->GetExtendedContext(&node); DataType dtype = shape_context->input_type(idx); auto* context = shape_context->get_context(); return ConvertDataTypeAndShape(dtype, context->input(idx), @@ -536,8 +615,10 @@ StatusOr Importer::InferInputType( context, builder); } -StatusOr Importer::InferOutputType( - ExtendedInferenceContext* shape_context, int idx, mlir::Builder builder) { +StatusOr ImporterBase::InferOutputType( + const Node& node, int idx, mlir::Builder builder) { + ExtendedInferenceContext* shape_context = + shape_refiner_->GetExtendedContext(&node); DataType dtype = shape_context->output_type(idx); auto* context = shape_context->get_context(); return ConvertDataTypeAndShape(dtype, context->output(idx), @@ -545,7 +626,7 @@ StatusOr Importer::InferOutputType( context, builder); } -StatusOr Importer::ConvertDataTypeAndShape( +StatusOr ImporterBase::ConvertDataTypeAndShape( DataType dtype, const shape_inference::ShapeHandle& handle, const std::vector* handle_subtypes, shape_inference::InferenceContext* context, mlir::Builder builder) { @@ -564,7 +645,7 @@ StatusOr Importer::ConvertDataTypeAndShape( return ConvertElementTypeAndShape(element_type, handle, context, builder); } -StatusOr Importer::ConvertElementTypeAndShape( +StatusOr ImporterBase::ConvertElementTypeAndShape( mlir::Type element_type, const shape_inference::ShapeHandle& handle, shape_inference::InferenceContext* context, mlir::Builder builder) { if (!context->RankKnown(handle)) { @@ -591,7 +672,7 @@ StatusOr Importer::ConvertElementTypeAndShape( llvm::makeArrayRef(dimensions.begin(), dimensions.end()), element_type); } -StatusOr Importer::ConvertSubtypes( +StatusOr ImporterBase::ConvertSubtypes( const std::vector* handle_subtypes, shape_inference::InferenceContext* context, mlir::Builder builder) { ElementSubtypes subtypes; @@ -610,64 
+691,64 @@ StatusOr Importer::ConvertSubtypes( return subtypes; } -Status Importer::ConvertFunctionCallAttribute( +Status ImporterBase::ConvertFunctionCallAttribute( const std::string& base_name, const AttrValue& value, llvm::SmallVector* attributes) { TF_ASSIGN_OR_RETURN(auto func_attr, ConvertFunctionCallName(value.func().name())); - attributes->push_back(builder_->getNamedAttr(base_name, func_attr)); + attributes->push_back(builder_.getNamedAttr(base_name, func_attr)); for (const auto& it : value.func().attr()) { auto name = absl::StrCat(base_name, ".", it.first); TF_ASSIGN_OR_RETURN(auto value, ConvertAttributeValue(it.second)); - attributes->push_back(builder_->getNamedAttr(name, value)); + attributes->push_back(builder_.getNamedAttr(name, value)); } return Status::OK(); } -StatusOr Importer::ConvertFunctionCallName( +StatusOr ImporterBase::ConvertFunctionCallName( const std::string& func_name) { TF_RETURN_IF_ERROR(ConvertLibFunction(func_name)); auto mlir_func_name = (*tf_name_to_mlir_name_)[func_name]; auto func = module_.lookupSymbol(mlir_func_name); - return builder_->getSymbolRefAttr(func); + return builder_.getSymbolRefAttr(func); } -StatusOr Importer::ConvertAttributeValue( +StatusOr ImporterBase::ConvertAttributeValue( const AttrValue& value) { switch (value.value_case()) { case AttrValue::kI: - return builder_->getI64IntegerAttr(value.i()); + return builder_.getI64IntegerAttr(value.i()); case AttrValue::kS: - return builder_->getStringAttr(value.s()); + return builder_.getStringAttr(value.s()); case AttrValue::kF: - return builder_->getFloatAttr(builder_->getF32Type(), value.f()); + return builder_.getFloatAttr(builder_.getF32Type(), value.f()); case AttrValue::kB: - return builder_->getBoolAttr(value.b()); + return builder_.getBoolAttr(value.b()); case AttrValue::kType: - return builder_->getStringAttr( + return builder_.getStringAttr( mangling_util::MangleDataType(value.type())); case AttrValue::kShape: - return builder_->getStringAttr(mangling_util::MangleShape(value.shape())); + return builder_.getStringAttr(mangling_util::MangleShape(value.shape())); case AttrValue::kTensor: return ConvertTensorProto(value.tensor()); case AttrValue::kList: { absl::InlinedVector attrs; for (const auto& item : value.list().i()) - attrs.push_back(builder_->getI64IntegerAttr(item)); + attrs.push_back(builder_.getI64IntegerAttr(item)); for (const auto& item : value.list().s()) - attrs.push_back(builder_->getStringAttr(item)); + attrs.push_back(builder_.getStringAttr(item)); for (const auto& item : value.list().f()) - attrs.push_back(builder_->getFloatAttr(builder_->getF32Type(), item)); + attrs.push_back(builder_.getFloatAttr(builder_.getF32Type(), item)); for (const auto& item : value.list().b()) - attrs.push_back(builder_->getBoolAttr(item)); + attrs.push_back(builder_.getBoolAttr(item)); for (const auto& item : value.list().type()) { - attrs.push_back(builder_->getStringAttr( + attrs.push_back(builder_.getStringAttr( mangling_util::MangleDataType(static_cast(item)))); } for (const auto& item : value.list().shape()) { attrs.push_back( - builder_->getStringAttr(mangling_util::MangleShape(item))); + builder_.getStringAttr(mangling_util::MangleShape(item))); } for (const auto& item : value.list().tensor()) { TF_ASSIGN_OR_RETURN(auto attr, ConvertTensorProto(item)); @@ -680,13 +761,13 @@ StatusOr Importer::ConvertAttributeValue( "func attributes with non-zero attr.size()"); attrs.push_back(attr); } - return builder_->getArrayAttr( + return builder_.getArrayAttr( 
llvm::makeArrayRef(attrs.begin(), attrs.end())); } case AttrValue::kFunc: return errors::Unknown("kFunc type should be handled separately!"); case AttrValue::VALUE_NOT_SET: - return builder_->getUnitAttr(); + return builder_.getUnitAttr(); // kPlaceholder is not implemented. default: return errors::Unimplemented( @@ -694,20 +775,36 @@ StatusOr Importer::ConvertAttributeValue( } } -Status Importer::ConvertLibFunction(const std::string& func_name) { +void ImporterBase::GetArgsAndRetsFromFunctionBody( + const FunctionBody& fbody, absl::InlinedVector* arg_nodes, + absl::InlinedVector* ret_nodes, + absl::InlinedVector* control_ret_nodes) { + arg_nodes->reserve(fbody.arg_nodes.size()); + ret_nodes->reserve(fbody.ret_nodes.size()); + for (auto arg : fbody.arg_nodes) { + arg_nodes->emplace_back(arg, 0); + } + for (auto ret : fbody.ret_nodes) { + ret_nodes->emplace_back(ret, 0); + } + *control_ret_nodes = fbody.control_ret_nodes; +} + +Status ImporterBase::ConvertLibFunction(llvm::StringRef func_name) { // If the library function has been converted already, nothing needs to be // done. if (tf_name_to_mlir_name_->find(func_name) != tf_name_to_mlir_name_->end()) return Status::OK(); - std::string mlir_func_name = graph_flib_.UniqueFunctionName(func_name); + std::string mlir_func_name = + graph_flib_.UniqueFunctionName(StringRefToView(func_name)); (*tf_name_to_mlir_name_)[func_name] = mlir_func_name; const auto& func_lib = graph_flib_; const auto* func_def = func_lib.Find(func_name); if (func_def == nullptr) { return errors::FailedPrecondition( - absl::StrCat("Failed to find function '", func_name, + absl::StrCat("Failed to find function '", StringRefToView(func_name), "'. The imported TensorFlow GraphDef is ill-formed.")); } @@ -726,14 +823,14 @@ Status Importer::ConvertLibFunction(const std::string& func_name) { ConvertAttributeValue(name_and_value.second)); std::string attr_name = mangling_util::MangleAttributeName(name_and_value.first); - attributes.push_back(builder_->getNamedAttr(attr_name, attr)); + attributes.push_back(builder_.getNamedAttr(attr_name, attr)); } // Checks opdef stateful attribute and import that as Function Attribute if (func_def->signature().is_stateful()) { auto stateful_str = mlir::TF::TensorFlowDialect::GetStatefulAttrName(); attributes.push_back( - builder_->getNamedAttr(stateful_str, builder_->getUnitAttr())); + builder_.getNamedAttr(stateful_str, builder_.getUnitAttr())); } // Checks for an associated custom gradient function. Adds it to the attribute @@ -743,99 +840,135 @@ Status Importer::ConvertLibFunction(const std::string& func_name) { TF_RETURN_IF_ERROR(ConvertLibFunction(grad_func_name)); auto mlir_grad_func_name = (*tf_name_to_mlir_name_)[grad_func_name]; auto grad_func = module_.lookupSymbol(mlir_grad_func_name); - auto gradient_attr = builder_->getSymbolRefAttr(grad_func); + auto gradient_attr = builder_.getSymbolRefAttr(grad_func); auto grad_string = mlir::TF::TensorFlowDialect::GetGradientAttrName(); - attributes.push_back(builder_->getNamedAttr(grad_string, gradient_attr)); + attributes.push_back(builder_.getNamedAttr(grad_string, gradient_attr)); } // Converts the graph to a MLIR function and adds it to the module. Uses the // default node spec without any inputs or outputs as the function graph has // special '_Arg' and '_Retval' ops for argument and return values. 
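// NOTE (editorial sketch, not part of the original patch; the IR below is
// schematic): a trivial library function foo(x) = Identity(x) is expected to
// import roughly as
//
//   func @foo(%arg0: tensor<*xf32>) -> tensor<*xf32> {
//     %res = tf_executor.graph {
//       %id, %ctl = tf_executor.island {
//         %0 = "tf.Identity"(%arg0) : (tensor<*xf32>) -> tensor<*xf32>
//         tf_executor.yield %0 : tensor<*xf32>
//       }
//       tf_executor.fetch %id : tensor<*xf32>
//     }
//     return %res : tensor<*xf32>
//   }
//
// with the _Arg/_Retval nodes folded into the block arguments and the return.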
NodeSpecs specs; - Importer child_importer(graph_flib_, debug_info_, specs, module_, - tf_name_to_mlir_name_); + ImporterBase child_importer(graph_flib_, debug_info_, specs, module_, + tf_name_to_mlir_name_); TF_RETURN_IF_ERROR(child_importer.PrepareConvert(*fbody->graph)); TF_ASSIGN_OR_RETURN(auto func_type, child_importer.InferLibFunctionType(*fbody)); absl::InlinedVector arg_nodes; - arg_nodes.reserve(fbody->arg_nodes.size()); absl::InlinedVector ret_nodes; - ret_nodes.reserve(fbody->ret_nodes.size()); - for (auto arg : fbody->arg_nodes) { - arg_nodes.emplace_back(arg, 0); - } - for (auto ret : fbody->ret_nodes) { - ret_nodes.emplace_back(ret, 0); - } + absl::InlinedVector control_ret_nodes; + GetArgsAndRetsFromFunctionBody(*fbody, &arg_nodes, &ret_nodes, + &control_ret_nodes); TF_RETURN_IF_ERROR(child_importer.Convert( - mlir_func_name, func_type, arg_nodes, ret_nodes, + mlir_func_name, func_type, arg_nodes, ret_nodes, control_ret_nodes, llvm::makeArrayRef(attributes.begin(), attributes.end()))); return Status::OK(); } -Status Importer::PrepareConvert(const Graph& graph) { - graph_versions_ = &graph.versions(); +Status ImporterBase::PrepareConvert(const Graph& graph) { TF_RETURN_IF_ERROR(RemoveBackedges(graph)); TF_RETURN_IF_ERROR(AddNodesToShapeRefiner()); return Status::OK(); } -Status Importer::ConvertFunctionArgAndRets( - mlir::Block* bb, llvm::ArrayRef arg_types, +Status ImporterBase::Convert( + llvm::StringRef func_name, mlir::FunctionType func_type, const absl::InlinedVector& arg_nodes, - const absl::InlinedVector& ret_nodes) { + const absl::InlinedVector& ret_nodes, + const absl::InlinedVector& control_ret_nodes, + llvm::ArrayRef attrs) { + // TODO(b/122040776): Uses debug info for FunctionDef. + auto function = mlir::FuncOp::create(mlir::UnknownLoc::get(context_), + func_name, func_type, attrs); + + module_.push_back(function); + // Seeds the builder with an initial block. + function.addEntryBlock(); + builder_ = mlir::OpBuilder(function.getBody()); + auto* bb = &function.front(); + + // Create the graph operation in which we will convert the individual nodes. + auto graph = builder_.create( + function.getLoc(), func_type.getResults()); + builder_.createBlock(&graph.body()); + + for (const Node* node : ordered_nodes_) { + TF_RETURN_IF_ERROR(ConvertNode(*node)); + } + + // Adds the backedges back to the function by creating the source and sink + // pairs. + TF_RETURN_IF_ERROR(AddBackedges()); + + return ConvertFunctionArgAndRets(bb, graph, func_type.getInputs(), arg_nodes, + ret_nodes, control_ret_nodes); +} + +Status ImporterBase::ConvertFunctionArgAndRets( + mlir::Block* bb, mlir::tf_executor::GraphOp graph_op, + llvm::ArrayRef arg_types, + const absl::InlinedVector& arg_nodes, + const absl::InlinedVector& ret_nodes, + const absl::InlinedVector& control_ret_nodes) { for (int i = 0, e = arg_types.size(); i < e; ++i) { - auto* inst = node_values_[arg_nodes[i].node->id()]; - auto* bb_arg = bb->addArgument(arg_types[i]); + // The lookup can't fail here: otherwise some nodes in the function haven't + // be converted to mlir operations and don't have a mapping. + mlir::Operation* island = + node_values_.find(arg_nodes[i].node->id())->second; + // We are looking for the instruction inside the island + mlir::Block& body = island->getRegion(0).front(); + mlir::Operation* inst = &body.front(); + + auto* bb_arg = bb->getArgument(i); mlir::Value* arg_def = bb_arg; - // If this is an input node add argument to the operation operands by - // creating a new input operation. 
- if (StringPiece(arg_nodes[i].node->type_string()) != - FunctionLibraryDefinition::kArgOp) { - auto inst_name = inst->getName().getStringRef(); - mlir::OperationState state(inst->getLoc(), - inst_name.str().append(".input")); - state.attributes.append(inst->getAttrs().begin(), inst->getAttrs().end()); - - // If there are quantization specifications, add them as the attributes - auto name = inst->getAttrOfType("name").getValue(); - auto input_spec_it = specs_.inputs.find(name.str()); - if (input_spec_it != specs_.inputs.end()) { - auto input_spec = input_spec_it->second; - if (IsQuantizationType(input_spec.final_dtype)) { - // Uses the MLIR built-in type so it can be handled easily later. - auto final_type = mlir::IntegerType::get( - GetQuantizationTypeWidth(input_spec.final_dtype), context_); - state.attributes.push_back(builder_->getNamedAttr( - "min", builder_->getF32FloatAttr(input_spec.min_value))); - state.attributes.push_back(builder_->getNamedAttr( - "max", builder_->getF32FloatAttr(input_spec.max_value))); - state.attributes.push_back(builder_->getNamedAttr( - "type", builder_->getTypeAttr(final_type))); - inst->getParentOfType().setAttr( - "tf.quantize", builder_->getUnitAttr()); - } - } - - for (auto* r : inst->getResults()) state.types.push_back(r->getType()); - - state.operands.append(inst->getOperands().begin(), - inst->getOperands().end()); - state.operands.push_back(bb_arg); - builder_->setInsertionPoint(inst); - auto* input = builder_->createOperation(state); - arg_def = input->getResult(arg_nodes[i].index); - // Verify on the equivalent TF op would have failed, but catching this - // earlier for now as this exposed a bug. TODO(jpienaar): remove post - // dialect refactoring. - DCHECK(input->getResult(0)->getType() == input->getOperand(0)->getType()) - << "invalid placeholder_input constructed"; + // If this is an arg node, just forward the entry block argument + if (arg_nodes[i].node->IsArg()) { + island->getResult(0)->replaceAllUsesWith(arg_def); + island->dropAllReferences(); + island->erase(); + continue; } + // This is an input node, we'll create a new input operation by suffixing + // the existing one with .input. + auto inst_name = inst->getName().getStringRef(); + mlir::OperationState state(inst->getLoc(), + inst_name.str().append(".input")); + state.attributes.append(inst->getAttrs().begin(), inst->getAttrs().end()); + + // If there are quantization specifications, add them as the attributes + auto name = inst->getAttrOfType("name").getValue(); + auto input_spec_it = specs_.inputs.find(name.str()); + if (input_spec_it != specs_.inputs.end()) { + auto input_spec = input_spec_it->second; + if (IsQuantizationType(input_spec.final_dtype)) { + // Uses the MLIR built-in type so it can be handled easily later. 
+ auto final_type = mlir::IntegerType::get( + GetQuantizationTypeWidth(input_spec.final_dtype), context_); + state.attributes.push_back(builder_.getNamedAttr( + "min", builder_.getF32FloatAttr(input_spec.min_value))); + state.attributes.push_back(builder_.getNamedAttr( + "max", builder_.getF32FloatAttr(input_spec.max_value))); + state.attributes.push_back( + builder_.getNamedAttr("type", builder_.getTypeAttr(final_type))); + inst->getParentOfType().setAttr("tf.quantize", + builder_.getUnitAttr()); + } + } + + for (auto* r : inst->getResults()) state.types.push_back(r->getType()); + + state.operands.append(inst->getOperands().begin(), + inst->getOperands().end()); + state.operands.push_back(bb_arg); + builder_.setInsertionPoint(inst); + auto* input = builder_.createOperation(state); + arg_def = input->getResult(arg_nodes[i].index); + for (auto index = 0; index < inst->getNumResults(); index++) { inst->getResult(index)->replaceAllUsesWith(arg_def); } @@ -843,32 +976,47 @@ Status Importer::ConvertFunctionArgAndRets( inst->erase(); } - absl::InlinedVector inst_to_returned; + llvm::SmallVector inst_to_return; for (const auto& ret : ret_nodes) { auto* inst = node_values_[ret.node->id()]; auto op = absl::string_view(ret.node->type_string()); if (op == FunctionLibraryDefinition::kRetOp || op == FunctionLibraryDefinition::kDeviceRetOp) { + // Lookup the instruction inside the island + auto island_op = llvm::cast(inst); + mlir::Operation* inner_op = &island_op.GetBody().front(); // Remove kRetOp or kDeviceRetOp operation and return its operand. // kRetOp and kDeviceRetOp should have just one operand unless they have // control dependencies. - if (inst->getNumOperands() != 1) + if (inner_op->getNumOperands() != 1) return errors::Unimplemented("Return node with multiple inputs."); - inst_to_returned.push_back(inst->getOperand(0)); - node_values_[ret.node->id()]->dropAllReferences(); - node_values_[ret.node->id()]->erase(); + inst_to_return.push_back(inner_op->getOperand(0)); + inst->dropAllReferences(); + inst->erase(); } else { - inst_to_returned.push_back(inst->getResult(ret.index)); + inst_to_return.push_back(inst->getResult(ret.index)); } } - builder_->setInsertionPointToEnd(bb); - builder_->create( - mlir::UnknownLoc::get(context_), - llvm::makeArrayRef(inst_to_returned.begin(), inst_to_returned.end())); + + for (Node* control_ret : control_ret_nodes) { + auto* inst = node_values_[control_ret->id()]; + inst_to_return.push_back(*std::prev(inst->result_end())); + } + + // Terminate the function by adding a Fetch operation to terminate the graph + // and a return operation to return the Graph results. + builder_.setInsertionPointToEnd(&graph_op.body().front()); + builder_.create(graph_op.getLoc(), + inst_to_return); + inst_to_return.assign(graph_op.getResults().begin(), + graph_op.getResults().end()); + builder_.setInsertionPointToEnd(bb); + builder_.create(mlir::UnknownLoc::get(context_), + inst_to_return); return Status::OK(); } -mlir::Location Importer::GetLocation(const NodeDef& node_def) { +mlir::Location ImporterBase::GetLocation(const NodeDef& node_def) { const auto& debug_info = debug_info_.traces(); // Get the CallSiteLoc for a node name. @@ -900,14 +1048,14 @@ mlir::Location Importer::GetLocation(const NodeDef& node_def) { // Use the front FileLineColLoc to generate a NameLoc. 
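// NOTE (editorial sketch, not part of the original patch; file names are made
// up for illustration): for a node "add" whose debug info records the frames
// model.py:10 and train.py:42, the resulting location is roughly
//   loc(callsite("add"("model.py":10:0) at "train.py":42:0))
// i.e. a NameLoc over the innermost frame, wrapped in a CallSiteLoc carrying
// the remaining frames.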
mlir::Location node_name_loc = - mlir::NameLoc::get(name_id, locations.front(), context_); + mlir::NameLoc::get(name_id, locations.front()); // If there are more locations then generate a stack trace, otherwise just // return the name loc. auto callsite_locs = llvm::makeArrayRef(locations).drop_front(); return callsite_locs.empty() ? node_name_loc - : mlir::CallSiteLoc::get(node_name_loc, callsite_locs, context_); + : mlir::CallSiteLoc::get(node_name_loc, callsite_locs); }; // For NextIteration nodes, location is used to pair source and sink nodes. @@ -950,7 +1098,8 @@ mlir::Location Importer::GetLocation(const NodeDef& node_def) { } } -std::string Importer::GetLocationStr(const Node& node, bool includeNodeName) { +std::string ImporterBase::GetLocationStr(const Node& node, + bool includeNodeName) { const auto location = GetLocation(node.def()); std::string s; llvm::raw_string_ostream ss(s); @@ -963,7 +1112,80 @@ std::string Importer::GetLocationStr(const Node& node, bool includeNodeName) { return s; } -Status Importer::ConvertNode(const Node& node) { +mlir::Operation* ImporterBase::createOperation( + const Node& node, llvm::StringRef op_name, + const mlir::OperationState& result, + const llvm::SmallVectorImpl& control_operands) { + // For the tf.executor specific operations (not wrapped in an island), we + // have an extra returned value for the control result, and we concatenate + // control and non-control operands. + mlir::SmallVector types(result.types); + types.push_back(mlir::tf_executor::ControlType::get(builder_.getContext())); + mlir::SmallVector operands(result.operands); + operands.append(control_operands.begin(), control_operands.end()); + + auto loc = result.location; + // Dispatch based on the name and create the appropriate operation. + if (node.IsSwitch()) { + // Switch and _SwitchN both are in switch class, differentiate based on + // number of outputs. + if (node.num_outputs() > 2) { + return builder_.create(loc, types, operands, + result.attributes); + } + return builder_.create(loc, types, operands, + result.attributes); + } + if (node.IsMerge()) { + return builder_.create(loc, types, operands, + result.attributes); + } + if (node.IsNextIteration()) { + // NextIteration is a bit special, we create a pair of operations that are + // linked together through a token returned by the source. + // We make use of a separate builder to insert the source at the top of + // the block. + mlir::OpBuilder builder_at_begin(builder_.getBlock(), + builder_.getBlock()->begin()); + auto source_op = + builder_at_begin.create( + loc, operands[0]->getType(), result.attributes); + return builder_.create( + loc, source_op.token(), operands, result.attributes); + } + if (node.IsLoopCond()) { + return builder_.create(loc, types, operands, + result.attributes); + } + if (node.IsEnter()) { + return builder_.create(loc, types, operands, + result.attributes); + } + if (node.IsExit()) { + return builder_.create(loc, types, operands, + result.attributes); + } + if (node.IsControlTrigger()) { + return builder_.create( + loc, operands, result.attributes); + } + // Regular TensorFlow operation are wrapped in a tf_executor.island. + auto island = builder_.create( + result.location, types, control_operands, + mlir::ArrayRef{}); + island.body().push_back(new mlir::Block); + mlir::OpBuilder island_builder(&island.GetBody()); + + // Create the operation inside the island now. 
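// NOTE (editorial sketch, not part of the original patch): a regular node such
// as tf.Add is expected to end up wrapped as
//
//   %out, %ctl = tf_executor.island(%incoming_ctl) {
//     %sum = "tf.Add"(%a, %b) : (tensor<f32>, tensor<f32>) -> tensor<f32>
//     tf_executor.yield %sum : tensor<f32>
//   }
//
// with control operands feeding the island and a single control result added
// after the wrapped op's results.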
+ mlir::Operation* inner_op = island_builder.createOperation(result); + + // Add the terminator for the island + mlir::SmallVector ret_vals(inner_op->getResults()); + island_builder.create(result.location, ret_vals); + return island.getOperation(); +} + +Status ImporterBase::ConvertNode(const Node& node) { if (!node.IsOp()) { // Don't import the pseudo-nodes _SOURCE or _SINK. These are added by // Graph and don't exist in GraphDef. @@ -979,9 +1201,12 @@ Status Importer::ConvertNode(const Node& node) { node_type_name = (*tf_name_to_mlir_name_)[node_type_name]; } - const char* kTfControlFlowFormPrefix = "_tf."; - std::string op_name = kTfControlFlowFormPrefix + node_type_name; + auto get_full_op_name = [&](const std::string& op_name) { + const char* kTfPrefix = "tf."; + return kTfPrefix + op_name; + }; + std::string op_name = get_full_op_name(node_type_name); if (back_edge_node_output_.contains(&node)) { op_name = op_name + ".sink"; } @@ -989,7 +1214,6 @@ Status Importer::ConvertNode(const Node& node) { const auto& node_def = node.def(); mlir::OperationState result(GetLocation(node_def), op_name); - ExtendedInferenceContext* context = shape_refiner_->GetExtendedContext(&node); for (int i = 0; i < node.num_outputs(); ++i) { // The backedge has been removed, so we shouldn't count the corresponding // output from the src node when converting to an operation. @@ -997,11 +1221,9 @@ Status Importer::ConvertNode(const Node& node) { back_edge_node_output_[&node] == i) { continue; } - TF_ASSIGN_OR_RETURN(auto type, InferOutputType(context, i, *builder_)); + TF_ASSIGN_OR_RETURN(auto type, InferOutputType(node, i, builder_)); result.types.push_back(type); } - result.types.push_back( - builder_->getType()); // Surprisingly input edges can be nondeterministically ordered. This // particularly seems to be the case for the control edges between _SOURCE @@ -1019,6 +1241,10 @@ Status Importer::ConvertNode(const Node& node) { }); result.operands.reserve(in_edges.size()); + + // Collect the control operands separately, they will be held by the island. + mlir::SmallVector control_operands; + for (const auto* input_edge : in_edges) { const Node& input_node = *input_edge->src(); if (input_node.IsSource()) { @@ -1046,9 +1272,10 @@ Status Importer::ConvertNode(const Node& node) { return errors::FailedPrecondition( "Graph not traversed in reverse post order; use seen before def!"); mlir::Operation* inst = node_values_[input_node.id()]; - result.operands.push_back(inst->getResult(input_edge->IsControlEdge() - ? 
inst->getNumResults() - 1 - : input_edge->src_output())); + if (input_edge->IsControlEdge()) + control_operands.push_back(inst->getResult(inst->getNumResults() - 1)); + else + result.operands.push_back(inst->getResult(input_edge->src_output())); } using FuncPairType = std::pair; @@ -1064,7 +1291,7 @@ Status Importer::ConvertNode(const Node& node) { funcs.emplace_back(&attr_name, &attr_value); } else { TF_ASSIGN_OR_RETURN(auto attr, ConvertAttributeValue(attr_value)); - result.attributes.push_back(builder_->getNamedAttr(attr_name, attr)); + result.attributes.push_back(builder_.getNamedAttr(attr_name, attr)); } } @@ -1077,12 +1304,32 @@ Status Importer::ConvertNode(const Node& node) { &result.attributes)); } - result.attributes.push_back(builder_->getNamedAttr( - "name", builder_->getStringAttr(std::string(node.name())))); - result.attributes.push_back(builder_->getNamedAttr( - "device", builder_->getStringAttr(std::string(node_def.device())))); + result.attributes.push_back(builder_.getNamedAttr( + "name", builder_.getStringAttr(std::string(node.name())))); + result.attributes.push_back(builder_.getNamedAttr( + "device", builder_.getStringAttr(std::string(node_def.device())))); + + // Map If and StatelessIf op in TensorFlow to the common If op in MLIR and add + // the differentiating attribute. + if (node.IsIfNode()) { + result.name = mlir::OperationName(get_full_op_name("If"), context_); + mlir::BoolAttr val = builder_.getBoolAttr(node_type_name == "StatelessIf"); + result.attributes.push_back(builder_.getNamedAttr("is_stateless", val)); + } + + // Map While and StatelessWhile op in TensorFlow to the common While op in + // MLIR and add the differentiating attribute. + if (node.IsWhileNode()) { + result.name = mlir::OperationName(get_full_op_name("While"), context_); + mlir::BoolAttr val = + builder_.getBoolAttr(node_type_name == "StatelessWhile"); + result.attributes.push_back(builder_.getNamedAttr("is_stateless", val)); + } + + // Register the mapping between the TF node and the newly created operation. + node_values_[node.id()] = + createOperation(node, op_name, result, control_operands); - node_values_[node.id()] = builder_->createOperation(result); return Status::OK(); } @@ -1098,7 +1345,7 @@ Status Importer::ConvertNode(const Node& node) { // operation. // TODO(fengliuai): Preserve the order of the results and operands if // necessary. -Status Importer::AddBackedges() { +Status ImporterBase::AddBackedges() { for (auto it : back_edge_dst_inputs_) { BackEdge& edge = it.second; if (!edge.src->IsNextIteration() || !edge.dst->IsMerge()) { @@ -1112,9 +1359,10 @@ Status Importer::AddBackedges() { return Status::OK(); } -Status Importer::AddBackedge(mlir::Operation* sink, mlir::Operation* dst, - int dst_input) { - mlir::Operation* source = GetOrCreateNextIterationSource(sink, dst); +Status ImporterBase::AddBackedge(mlir::Operation* sink, mlir::Operation* dst, + int dst_input) { + // Get the NextIteration.Source operation from the token operand of the sink. + mlir::Operation* source = sink->getOperand(0)->getDefiningOp(); // Adds the "source" to the operands of the dst by creating a new dst // operation. 
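// NOTE (editorial sketch, not part of the original patch): conceptually the
// rebuilt Merge gains the Source's output as an extra operand, closing the
// loop through the token:
//
//   %ni.out, %ni.token, %ni.ctl = tf_executor.NextIteration.Source
//   %m.out, %m.idx, %m.ctl = tf_executor.Merge(%enter.out, %ni.out)
//   ...
//   tf_executor.NextIteration.Sink[%ni.token] %loop_body_value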
@@ -1130,12 +1378,11 @@ Status Importer::AddBackedge(mlir::Operation* sink, mlir::Operation* dst, state.operands.push_back(dst->getOperand(input - 1)); } } - state.attributes.append(dst->getAttrs().begin(), dst->getAttrs().end()); - for (auto* result : dst->getResults()) { - state.types.push_back(result->getType()); - } - builder_->setInsertionPoint(dst); - auto* new_dst = builder_->createOperation(state); + state.attributes.assign(dst->getAttrs().begin(), dst->getAttrs().end()); + state.types.assign(dst->getResultTypes().begin(), + dst->getResultTypes().end()); + builder_.setInsertionPoint(dst); + auto* new_dst = builder_.createOperation(state); // Replaces the output uses of the old operation by the corresponding // result of the new operation, and deletes the old operation. @@ -1148,134 +1395,7 @@ Status Importer::AddBackedge(mlir::Operation* sink, mlir::Operation* dst, return Status::OK(); } -mlir::Operation* Importer::GetOrCreateNextIterationSource( - mlir::Operation* sink, mlir::Operation* dst) { - auto iter = next_iteration_sink_source_.find(sink); - if (iter != next_iteration_sink_source_.end()) return iter->second; - - auto inst_name = sink->getName().getStringRef(); - inst_name.consume_back(".sink"); - mlir::OperationState src_state(sink->getLoc(), - inst_name.str().append(".source")); - src_state.attributes.append(sink->getAttrs().begin(), sink->getAttrs().end()); - src_state.types.push_back(dst->getResult(0)->getType()); - src_state.types.push_back( - builder_->getType()); - builder_->setInsertionPoint(dst->getBlock(), dst->getBlock()->begin()); - mlir::Operation* source = builder_->createOperation(src_state); - next_iteration_sink_source_[sink] = source; - return source; -} - -Status Importer::Convert(llvm::StringRef func_name, - mlir::FunctionType func_type, - const absl::InlinedVector& arg_nodes, - const absl::InlinedVector& ret_nodes, - llvm::ArrayRef attrs) { - // TODO(b/122040776): Uses debug info for FunctionDef. - auto function = mlir::FuncOp::create(mlir::UnknownLoc::get(context_), - func_name, func_type, attrs); - - module_.push_back(function); - builder_ = absl::make_unique(function.getBody()); - // Seeds the builder with an initial block. - auto* bb = builder_->createBlock(&function.getBody()); - - for (const Node* node : ordered_nodes_) { - TF_RETURN_IF_ERROR(ConvertNode(*node)); - } - - // Adds the backedges back to the function by creating the source and sink - // pairs. - TF_RETURN_IF_ERROR(AddBackedges()); - - return ConvertFunctionArgAndRets(bb, func_type.getInputs(), arg_nodes, - ret_nodes); -} - -StatusOr Importer::InferMainFunctionType( - absl::InlinedVector* arg_nodes, - absl::InlinedVector* ret_nodes) { - // Finds out all the input nodes and output nodes. - if (!specs_.inputs.empty() || !specs_.output_arrays.empty()) { - arg_nodes->resize(specs_.inputs.size()); - ret_nodes->resize(specs_.output_arrays_order.size()); - - for (Node* n : ordered_nodes_) { - // Handle inputs/arguments. - auto input_it = specs_.inputs.find(n->name()); - if (input_it != specs_.inputs.end()) { - (*arg_nodes)[std::distance(specs_.inputs.begin(), input_it)] = {n, 0}; - } - - // Handle outputs/returns. 
- if (specs_.output_arrays.find(n->name()) != specs_.output_arrays.end()) { - for (int i = 0, e = specs_.output_arrays_order.size(); i != e; ++i) { - std::pair name_and_port = - absl::StrSplit(specs_.output_arrays_order[i], ':'); - auto name = name_and_port.first; - if (name != n->name()) continue; - int port = 0; - if (!name_and_port.second.empty() && - !absl::SimpleAtoi(name_and_port.second, &port)) { - return errors::InvalidArgument("Invalid port specification: ", - specs_.output_arrays_order[i]); - } - (*ret_nodes)[i] = {n, port}; - } - } - } - } - - int i = 0; - for (auto it : specs_.inputs) { - if (arg_nodes->at(i++).node == nullptr) { - return errors::InvalidArgument("Input ", it.first, - " was not found in graph"); - } - } - for (int i = 0, e = specs_.output_arrays_order.size(); i != e; ++i) { - if (ret_nodes->at(i).node == nullptr) { - return errors::InvalidArgument("Output ", specs_.output_arrays_order[i], - " was not found in graph"); - } - } - - // Starts to construct the function type. - llvm::SmallVector arg_types; - llvm::SmallVector ret_types; - arg_types.reserve(specs_.inputs.size()); - ret_types.reserve(specs_.output_arrays.size()); - mlir::Builder builder(context_); - - // Input nodes as function arguments. - for (const auto& input : specs_.inputs) { - mlir::Type element_type; - const auto& node_info = input.second; - TF_RETURN_IF_ERROR(::tensorflow::ConvertDataType(node_info.imported_dtype, - builder, &element_type)); - llvm::SmallVector shape; - TF_RETURN_IF_ERROR(ConvertToMlirShape(node_info.shape, &shape)); - arg_types.push_back(builder.getTensorType(shape, element_type)); - } - - // Output nodes as function returns. - for (const auto& ret : *ret_nodes) { - if (ret.node->num_outputs() < 1) { - return errors::FailedPrecondition( - "Invalid output node; should have at least 1 output: " + - ret.node->name()); - } - auto* shape_context = shape_refiner_->GetExtendedContext(ret.node); - TF_ASSIGN_OR_RETURN(auto type, - InferOutputType(shape_context, ret.index, builder)); - ret_types.push_back(type); - } - - return builder.getFunctionType(arg_types, ret_types); -} - -StatusOr Importer::InferLibFunctionType( +StatusOr ImporterBase::InferLibFunctionType( const FunctionBody& fbody) { mlir::Builder builder(context_); @@ -1297,76 +1417,273 @@ StatusOr Importer::InferLibFunctionType( // Find node in the graph using the node id instead of using `ret` directly // because the graph has been cloned. auto* node = graph_->FindNodeId(ret->id()); - auto* shape_context = shape_refiner_->GetExtendedContext(node); // Return type of the function is type of the only input of the respective // return node in the function. - TF_ASSIGN_OR_RETURN(auto type, - InferInputType(shape_context, /*idx=*/0, builder)); + TF_ASSIGN_OR_RETURN(auto type, InferInputType(*node, /*idx=*/0, builder)); ret_types.push_back(type); } return builder.getFunctionType(arg_types, ret_types); } -StatusOr Importer::Convert( +// Stateful helper class to import a TensorFlow model expressed in GraphDef into +// an MLIR Module. +// +// The nodes defined in the graph is converted to a function called "main". All +// the library function definitions are converted to MLIR functions in the +// module. +class GraphDefImporter : public ImporterBase { + public: + // Main entry point: converts the given graph to an MLIR Module. 
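The main-function type inference removed here (and reintroduced below on GraphDefImporter) resolves user-specified outputs of the form "node:port" with absl::StrSplit and absl::SimpleAtoi, defaulting to port 0 when no port suffix is given. A standard-library sketch of that parsing:

#include <cstdlib>
#include <iostream>
#include <string>

// Splits an output spec of the form "node_name:port" into its parts; a missing
// ":port" suffix defaults to port 0. Standard-library stand-in for the
// StrSplit/SimpleAtoi logic in the hunk above.
bool ParseOutputSpec(const std::string& spec, std::string* name, int* port) {
  const auto colon = spec.find(':');
  if (colon == std::string::npos) {
    *name = spec;
    *port = 0;
    return true;
  }
  *name = spec.substr(0, colon);
  const std::string port_str = spec.substr(colon + 1);
  if (port_str.empty()) return false;  // invalid port specification
  char* end = nullptr;
  const long parsed = std::strtol(port_str.c_str(), &end, 10);
  if (end == nullptr || *end != '\0' || parsed < 0) return false;
  *port = static_cast<int>(parsed);
  return true;
}

int main() {
  std::string name;
  int port = 0;
  if (ParseOutputSpec("softmax:1", &name, &port))
    std::cout << name << " -> output port " << port << "\n";
}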
+ static StatusOr Convert( + mlir::MLIRContext* context, const Graph& graph, + const GraphDebugInfo& debug_info, + const FunctionLibraryDefinition& flib_def, const NodeSpecs& specs); + + private: + explicit GraphDefImporter( + const FunctionLibraryDefinition& flib, const GraphDebugInfo& debug_info, + const NodeSpecs& specs, mlir::ModuleOp module, + std::unordered_map* tf_name_to_mlir_name) + : ImporterBase(flib, debug_info, specs, module, tf_name_to_mlir_name) {} + + // Returns the function signature of the main function of converted MLIR + // module, the input nodes and output nodes. The type and shape information + // for the function arguments are read from `specs`, but the type and shape + // information for the function returns are inferred by the shape refiner in + // ImporterBase. + StatusOr InferMainFunctionType( + const NodeSpecs& specs, mlir::MLIRContext* context, + absl::InlinedVector* arg_nodes, + absl::InlinedVector* ret_nodes); +}; + +StatusOr GraphDefImporter::Convert( mlir::MLIRContext* context, const Graph& graph, const GraphDebugInfo& debug_info, const FunctionLibraryDefinition& flib_def, const NodeSpecs& specs) { mlir::OwningModuleRef module = mlir::ModuleOp::create(mlir::UnknownLoc::get(context)); std::unordered_map tf_name_to_mlir_name; - Importer importer(flib_def, debug_info, specs, module.get(), - &tf_name_to_mlir_name); - TF_RETURN_IF_ERROR(importer.PrepareConvert(graph)); - // Collects the argument and return nodes by looking up the node names - // specified by the user. + GraphDefImporter importer(flib_def, debug_info, specs, module.get(), + &tf_name_to_mlir_name); + + mlir::FunctionType func_type; absl::InlinedVector arg_nodes; absl::InlinedVector ret_nodes; - TF_ASSIGN_OR_RETURN(auto func_type, - importer.InferMainFunctionType(&arg_nodes, &ret_nodes)); - - // TODO(prakalps): Refactor to keep attribute strings (tf.entry_function, - // tf.versions) shared by importer and exporter in a centralized place. - // Record the input and output mapping. + absl::InlinedVector control_ret_nodes; llvm::SmallVector attrs; - if (!specs.inputs.empty() || !specs.output_arrays.empty()) { - mlir::Builder b(context); - std::string s; - llvm::raw_string_ostream ss(s); - mlir::interleaveComma( - specs.inputs, ss, - [&](const std::pair& v) { ss << v.first; }); - auto inputs = b.getNamedAttr("inputs", b.getStringAttr(ss.str())); - s.clear(); - mlir::interleaveComma(specs.output_arrays, ss, - [&](const std::string& v) { ss << v; }); - auto outputs = b.getNamedAttr("outputs", b.getStringAttr(ss.str())); + std::unique_ptr graph_fbody; + if (specs.graph_as_function) { + if (specs.prune_unused_nodes || !specs.inputs.empty() || + !specs.output_arrays.empty() || !specs.output_arrays_order.empty()) + return errors::InvalidArgument( + "Pruning of graph is currently unsupported when the main graph is " + "converted to a function."); + // Converts graph into a FunctionDef. + FunctionDef graph_fdef; + TF_RETURN_IF_ERROR(GraphToFunctionDef(graph, "main", &graph_fdef)); - attrs.push_back(b.getNamedAttr("tf.entry_function", - b.getDictionaryAttr({inputs, outputs}))); + // Converts FunctionDef into a FunctionBody. 
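When the graph is imported as a function (graph_as_function), the code above records the argument and return node names as comma-separated strings inside a tf.entry_function dictionary attribute via mlir::interleaveComma. A tiny standalone equivalent of that string building:

#include <iostream>
#include <string>
#include <vector>

// Joins node names with commas, the same shape of string the importer stores
// in the "inputs"/"outputs" entries of the tf.entry_function attribute.
std::string JoinWithCommas(const std::vector<std::string>& names) {
  std::string out;
  for (size_t i = 0; i < names.size(); ++i) {
    if (i) out += ",";
    out += names[i];
  }
  return out;
}

int main() {
  std::cout << "inputs = \"" << JoinWithCommas({"image", "is_training"}) << "\"\n";
  std::cout << "outputs = \"" << JoinWithCommas({"logits"}) << "\"\n";
}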
+ TF_RETURN_IF_ERROR(FunctionDefToBodyHelper(graph_fdef, AttrSlice(), + &flib_def, &graph_fbody)); + + TF_RETURN_IF_ERROR(importer.PrepareConvert(*graph_fbody->graph)); + TF_ASSIGN_OR_RETURN(func_type, importer.InferLibFunctionType(*graph_fbody)); + importer.GetArgsAndRetsFromFunctionBody(*graph_fbody, &arg_nodes, + &ret_nodes, &control_ret_nodes); + + if (!arg_nodes.empty() || !ret_nodes.empty()) { + mlir::Builder b(context); + std::string s; + llvm::raw_string_ostream ss(s); + auto node_name = [&](const Node* node) { ss << node->name(); }; + mlir::interleaveComma(graph_fbody->arg_nodes, ss, node_name); + auto inputs = b.getNamedAttr("inputs", b.getStringAttr(ss.str())); + s.clear(); + mlir::interleaveComma(graph_fbody->ret_nodes, ss, node_name); + auto outputs = b.getNamedAttr("outputs", b.getStringAttr(ss.str())); + + attrs.push_back(b.getNamedAttr("tf.entry_function", + b.getDictionaryAttr({inputs, outputs}))); + } + } else { + TF_RETURN_IF_ERROR(importer.PrepareConvert(graph)); + + // Collects the argument and return nodes by looking up the node names + // specified by the user. + TF_ASSIGN_OR_RETURN(func_type, importer.InferMainFunctionType( + specs, context, &arg_nodes, &ret_nodes)); + + // TODO(prakalps): Refactor to keep attribute strings (tf.entry_function, + // tf.versions) shared by importer and exporter in a centralized place. + // Record the input and output mapping. + if (!specs.inputs.empty() || !specs.output_arrays.empty()) { + mlir::Builder b(context); + std::string s; + llvm::raw_string_ostream ss(s); + mlir::interleaveComma( + specs.inputs, ss, + [&](const std::pair& v) { ss << v.first; }); + auto inputs = b.getNamedAttr("inputs", b.getStringAttr(ss.str())); + s.clear(); + mlir::interleaveComma(specs.output_arrays, ss); + auto outputs = b.getNamedAttr("outputs", b.getStringAttr(ss.str())); + + attrs.push_back(b.getNamedAttr("tf.entry_function", + b.getDictionaryAttr({inputs, outputs}))); + } } // Record version info. - if (importer.graph_versions_) { - mlir::Builder b(context); - auto producer = b.getNamedAttr( - "producer", b.getI32IntegerAttr(importer.graph_versions_->producer())); - auto min_consumer = b.getNamedAttr( - "min_consumer", - b.getI32IntegerAttr(importer.graph_versions_->min_consumer())); - auto bad_consumers = b.getNamedAttr( - "bad_consumers", b.getI32ArrayAttr(llvm::ArrayRef( - importer.graph_versions_->bad_consumers().begin(), - importer.graph_versions_->bad_consumers().end()))); - module->setAttr("tf.versions", - b.getDictionaryAttr(llvm::ArrayRef( - {producer, min_consumer, bad_consumers}))); + const auto& graph_versions = graph.versions(); + mlir::Builder b(context); + auto producer = b.getNamedAttr( + "producer", b.getI32IntegerAttr(graph_versions.producer())); + auto min_consumer = b.getNamedAttr( + "min_consumer", b.getI32IntegerAttr(graph_versions.min_consumer())); + auto bad_consumers = b.getNamedAttr( + "bad_consumers", b.getI32ArrayAttr(llvm::ArrayRef( + graph_versions.bad_consumers().begin(), + graph_versions.bad_consumers().end()))); + module->setAttr("tf.versions", + b.getDictionaryAttr(llvm::ArrayRef( + {producer, min_consumer, bad_consumers}))); + + TF_RETURN_IF_ERROR(importer.ImporterBase::Convert( + "main", func_type, arg_nodes, ret_nodes, control_ret_nodes, attrs)); + return module; +} + +StatusOr GraphDefImporter::InferMainFunctionType( + const NodeSpecs& specs, mlir::MLIRContext* context, + absl::InlinedVector* arg_nodes, + absl::InlinedVector* ret_nodes) { + // Finds out all the input nodes and output nodes. 
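The importer also stamps the module with a tf.versions dictionary attribute built from the graph's version information (producer, min_consumer, bad_consumers). The sketch below only formats those same fields as text; the GraphVersions struct is a stand-in, not the TensorFlow proto, and the exact MLIR attribute syntax is approximate.

#include <cstdint>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Rough stand-in for the version fields recorded above; the real code reads
// them from graph.versions() and attaches them to the module as "tf.versions".
struct GraphVersions {
  int32_t producer;
  int32_t min_consumer;
  std::vector<int32_t> bad_consumers;
};

std::string FormatVersionsAttr(const GraphVersions& v) {
  std::ostringstream os;
  os << "tf.versions = {producer = " << v.producer
     << " : i32, min_consumer = " << v.min_consumer << " : i32, bad_consumers = [";
  for (size_t i = 0; i < v.bad_consumers.size(); ++i) {
    if (i) os << ", ";
    os << v.bad_consumers[i] << " : i32";
  }
  os << "]}";
  return os.str();
}

int main() {
  GraphVersions v{27, 0, {12, 13}};
  std::cout << FormatVersionsAttr(v) << "\n";
}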
+ if (!specs.inputs.empty() || !specs.output_arrays.empty()) { + arg_nodes->resize(specs.inputs.size()); + ret_nodes->resize(specs.output_arrays_order.size()); + + for (Node* n : GetOrderedNodes()) { + // Handle inputs/arguments. + auto input_it = specs.inputs.find(n->name()); + if (input_it != specs.inputs.end()) { + (*arg_nodes)[std::distance(specs.inputs.begin(), input_it)] = {n, 0}; + } + + // Handle outputs/returns. + if (specs.output_arrays.find(n->name()) != specs.output_arrays.end()) { + for (int i = 0, e = specs.output_arrays_order.size(); i != e; ++i) { + std::pair name_and_port = + absl::StrSplit(specs.output_arrays_order[i], ':'); + auto name = name_and_port.first; + if (name != n->name()) continue; + int port = 0; + if (!name_and_port.second.empty() && + !absl::SimpleAtoi(name_and_port.second, &port)) { + return errors::InvalidArgument("Invalid port specification: ", + specs.output_arrays_order[i]); + } + (*ret_nodes)[i] = {n, port}; + } + } + } + } + + int i = 0; + for (auto it : specs.inputs) { + if (arg_nodes->at(i++).node == nullptr) { + return errors::InvalidArgument("Input ", it.first, + " was not found in graph"); + } + } + for (int i = 0, e = specs.output_arrays_order.size(); i != e; ++i) { + if (ret_nodes->at(i).node == nullptr) { + return errors::InvalidArgument("Output ", specs.output_arrays_order[i], + " was not found in graph"); + } + } + + // Starts to construct the function type. + llvm::SmallVector arg_types; + llvm::SmallVector ret_types; + arg_types.reserve(specs.inputs.size()); + ret_types.reserve(specs.output_arrays.size()); + mlir::Builder builder(context); + + // Input nodes as function arguments. + for (const auto& input : specs.inputs) { + mlir::Type element_type; + const auto& node_info = input.second; + TF_RETURN_IF_ERROR(::tensorflow::ConvertDataType(node_info.imported_dtype, + builder, &element_type)); + llvm::SmallVector shape; + TF_RETURN_IF_ERROR(ConvertToMlirShape(node_info.shape, &shape)); + arg_types.push_back(builder.getTensorType(shape, element_type)); + } + + // Output nodes as function returns. + for (const auto& ret : *ret_nodes) { + if (ret.node->num_outputs() <= ret.index) { + return errors::InvalidArgument("Invalid output index ", ret.index, + " specified for node: ", ret.node->name()); + } + TF_ASSIGN_OR_RETURN(auto type, + InferOutputType(*ret.node, ret.index, builder)); + ret_types.push_back(type); + } + + return builder.getFunctionType(arg_types, ret_types); +} + +// Stateful helper class to import a TensorFlow model expressed in SavedModel +// into an MLIR Module. +class SavedModelImporter : public ImporterBase { + public: + // Main entry point: converts all functions in the given meta graph to an MLIR + // Module. 
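As relocated above, InferMainFunctionType builds the main function signature from the user specs: each input contributes a tensor type derived from its dtype and shape (unknown dimensions print as ? in MLIR), and each requested output index is checked against the node's actual output count. A self-contained sketch of those two steps, using strings in place of MLIR types:

#include <iostream>
#include <string>
#include <vector>

// Each input spec carries a dtype name and a shape, with -1 marking a dynamic
// dimension, matching the convention used by the converter above.
struct InputSpec {
  std::string dtype;
  std::vector<long long> shape;
};

std::string TensorTypeString(const InputSpec& spec) {
  std::string out = "tensor<";
  for (long long d : spec.shape)
    out += (d < 0 ? std::string("?") : std::to_string(d)) + "x";
  return out + spec.dtype + ">";
}

// Mirrors the "Invalid output index" check: an output port must exist on the
// node before a result type can be inferred for it.
bool ValidateOutputIndex(int requested_index, int num_outputs, std::string* error) {
  if (requested_index >= num_outputs) {
    *error = "Invalid output index " + std::to_string(requested_index);
    return false;
  }
  return true;
}

int main() {
  std::cout << TensorTypeString({"f32", {-1, 224, 224, 3}}) << "\n";
  std::string err;
  if (!ValidateOutputIndex(2, 1, &err)) std::cout << err << "\n";
}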
+ static StatusOr Convert( + const MetaGraphDef& meta_graph, const GraphDebugInfo& debug_info, + bool add_default_attributes, mlir::MLIRContext* context); + + private: + explicit SavedModelImporter( + const FunctionLibraryDefinition& flib, const GraphDebugInfo& debug_info, + const NodeSpecs& specs, mlir::ModuleOp module, + std::unordered_map* tf_name_to_mlir_name) + : ImporterBase(flib, debug_info, specs, module, tf_name_to_mlir_name) {} +}; + +StatusOr SavedModelImporter::Convert( + const MetaGraphDef& meta_graph, const GraphDebugInfo& debug_info, + bool add_default_attributes, mlir::MLIRContext* context) { + NodeSpecs specs; + mlir::OwningModuleRef module = + mlir::ModuleOp::create(mlir::UnknownLoc::get(context)); + std::unordered_map tf_name_to_mlir_name; + + const auto& graphdef = meta_graph.graph_def(); + GraphConstructorOptions options; + options.allow_internal_ops = true; + Graph graph(OpRegistry::Global()); + + GraphDef preprocessed_graphdef(graphdef); + if (add_default_attributes) { + TF_RETURN_IF_ERROR(PreprocessGraphDef(nullptr, &preprocessed_graphdef)); } TF_RETURN_IF_ERROR( - importer.Convert("main", func_type, arg_nodes, ret_nodes, attrs)); + ConvertGraphDefToGraph(options, preprocessed_graphdef, &graph)); + + SavedModelImporter importer(graph.flib_def(), debug_info, specs, module.get(), + &tf_name_to_mlir_name); + + auto fn_names = graph.flib_def().ListFunctionNames(); + for (const auto& fn_name : fn_names) { + TF_RETURN_IF_ERROR(importer.ConvertLibFunction(fn_name)); + } return module; } } // namespace @@ -1381,10 +1698,10 @@ StatusOr ConvertGraphdefToMlir( GraphDef preprocessed_graphdef(graphdef); if (add_default_attributes) { - TF_RETURN_IF_ERROR(AddDefaultsToNodeDef(&preprocessed_graphdef)); + TF_RETURN_IF_ERROR(PreprocessGraphDef(&specs, &preprocessed_graphdef)); } - TF_RETURN_IF_ERROR( - ConvertGraphDefToGraph(options, preprocessed_graphdef, &graph)); + TF_RETURN_IF_ERROR(ConvertGraphDefToGraph( + options, std::move(preprocessed_graphdef), &graph)); return ConvertGraphToMlir(graph, debug_info, graph.flib_def(), specs, context); @@ -1394,7 +1711,14 @@ StatusOr ConvertGraphToMlir( const Graph& graph, const GraphDebugInfo& debug_info, const FunctionLibraryDefinition& flib_def, const NodeSpecs& specs, mlir::MLIRContext* context) { - return Importer::Convert(context, graph, debug_info, flib_def, specs); + return GraphDefImporter::Convert(context, graph, debug_info, flib_def, specs); +} + +StatusOr ConvertSavedModelToMlir( + const SavedModelBundle& saved_model, const GraphDebugInfo& debug_info, + mlir::MLIRContext* context, bool add_default_attributes) { + return SavedModelImporter::Convert(saved_model.meta_graph_def, debug_info, + add_default_attributes, context); } } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/translate/import_graphdef.h b/tensorflow/compiler/mlir/tensorflow/translate/import_model.h similarity index 69% rename from tensorflow/compiler/mlir/tensorflow/translate/import_graphdef.h rename to tensorflow/compiler/mlir/tensorflow/translate/import_model.h index c494526bb4d..98bb607fa6a 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/import_graphdef.h +++ b/tensorflow/compiler/mlir/tensorflow/translate/import_model.h @@ -13,11 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. 
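SavedModelImporter::Convert above walks the function library's function names and converts each one; ImporterBase keeps a tf_name_to_mlir_name map so repeated references reuse the already-imported function. A toy restatement of that de-duplication loop (the name uniquification a real importer performs is omitted here):

#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

// Converts a TF function name at most once and records the chosen MLIR symbol
// name so later references reuse it, mirroring the tf_name_to_mlir_name map.
std::string ConvertLibFunctionOnce(
    const std::string& tf_name,
    std::unordered_map<std::string, std::string>* tf_name_to_mlir_name) {
  auto it = tf_name_to_mlir_name->find(tf_name);
  if (it != tf_name_to_mlir_name->end()) return it->second;  // already imported
  std::string mlir_name = tf_name;  // a real importer would also uniquify here
  (*tf_name_to_mlir_name)[tf_name] = mlir_name;
  std::cout << "imported func @" << mlir_name << "\n";
  return mlir_name;
}

int main() {
  std::unordered_map<std::string, std::string> tf_name_to_mlir_name;
  const std::vector<std::string> fns = {"cond_true", "cond_false", "cond_true"};
  for (const auto& fn : fns) ConvertLibFunctionOnce(fn, &tf_name_to_mlir_name);
}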
==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSLATE_IMPORT_GRAPHDEF_H_ -#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSLATE_IMPORT_GRAPHDEF_H_ +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSLATE_IMPORT_MODEL_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSLATE_IMPORT_MODEL_H_ #include "mlir/IR/MLIRContext.h" // TF:local_config_mlir #include "mlir/IR/Module.h" // TF:local_config_mlir +#include "tensorflow/cc/saved_model/loader.h" #include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" #include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/graph.pb.h" @@ -27,20 +28,26 @@ limitations under the License. namespace tensorflow { -// Given a GraphDef, returns a MLIR module containing the graph in control-flow -// form. +// Given a GraphDef, returns a MLIR module containing the graph, expressed with +// tf_executor dialect. stream_executor::port::StatusOr ConvertGraphdefToMlir( const GraphDef& graphdef, const GraphDebugInfo& debug_info, const NodeSpecs& specs, mlir::MLIRContext* context, bool add_default_attributes = true); -// Given a Graph, returns a MLIR module containing the graph in control-flow -// form. +// Given a Graph, returns a MLIR module containing the graph, expressed with +// tf_executor dialect. stream_executor::port::StatusOr ConvertGraphToMlir( const Graph& graph, const GraphDebugInfo& debug_info, const FunctionLibraryDefinition& flib_def, const NodeSpecs& specs, mlir::MLIRContext* context); +// Given a SavedModel, returns a MLIR module containing the functions, expressed +// with tf_executor dialect. +stream_executor::port::StatusOr ConvertSavedModelToMlir( + const SavedModelBundle& saved_model, const GraphDebugInfo& debug_info, + mlir::MLIRContext* context, bool add_default_attributes = true); + } // namespace tensorflow -#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSLATE_IMPORT_GRAPHDEF_H_ +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSLATE_IMPORT_MODEL_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h b/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h index 3fc7ee55b4f..6adf1f07339 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h +++ b/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h @@ -56,6 +56,13 @@ struct NodeSpecs { // setting prune_unused_nodes to true, would prune unreachable nodes if // output_arrays is specified. bool prune_unused_nodes = false; + // If true, inputs of type LegacyFedInput are replaced with Placeholder ops. + // LegacyFedInput ops have two outputs unlike Placeholder which has only one + // output, so if both outputs of the LegacyFedInput ops are used then returns + // an error. + bool convert_legacy_fed_inputs = false; + // If true, the main graph will be treated as a function. + bool graph_as_function = false; }; struct ExporterConfigs { diff --git a/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_pass.cc b/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_pass.cc index 3d71910edcd..3ebd722c580 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_pass.cc @@ -15,11 +15,12 @@ limitations under the License. 
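The renamed import_model.h now declares three entry points. The sketch below shows one plausible way to drive the GraphDef path with the new NodeSpecs flags; GraphDefToModule is a hypothetical helper name, and the sketch assumes the caller already holds a GraphDef and links against this translate library (the signatures are the ones declared above).

#include "mlir/IR/MLIRContext.h"
#include "mlir/IR/Module.h"
#include "tensorflow/compiler/mlir/tensorflow/translate/import_model.h"
#include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h"
#include "tensorflow/core/framework/graph.pb.h"
#include "tensorflow/core/protobuf/graph_debug_info.pb.h"

// Converts `graphdef` to an MLIR module, importing the whole graph as a
// "main" function. Returns a null module on failure.
mlir::OwningModuleRef GraphDefToModule(const tensorflow::GraphDef& graphdef,
                                       mlir::MLIRContext* context) {
  tensorflow::NodeSpecs specs;
  specs.graph_as_function = true;           // import the graph as a function
  specs.convert_legacy_fed_inputs = false;  // keep LegacyFedInput nodes as-is
  tensorflow::GraphDebugInfo debug_info;    // no separate debug info file
  auto module_or =
      tensorflow::ConvertGraphdefToMlir(graphdef, debug_info, specs, context);
  if (!module_or.status().ok()) return nullptr;
  return module_or.ConsumeValueOrDie();
}

Note that graph_as_function requires leaving prune_unused_nodes and the input/output arrays at their defaults; the converter above rejects that combination as unsupported.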
#include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_pass.h" -#include "tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.h" -#include "tensorflow/compiler/mlir/tensorflow/translate/import_graphdef.h" -#include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" +#include "mlir/Analysis/Verifier.h" // TF:local_config_mlir #include "mlir/IR/MLIRContext.h" // TF:local_config_mlir #include "mlir/IR/Module.h" // TF:local_config_mlir +#include "tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.h" +#include "tensorflow/compiler/mlir/tensorflow/translate/import_model.h" +#include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/protobuf/graph_debug_info.pb.h" @@ -35,9 +36,15 @@ Status MlirRoundtripPass::Run(const GraphOptimizationPassOptions& options) { TF_ASSIGN_OR_RETURN(auto module, ConvertGraphToMlir(**options.graph, debug_info, *options.flib_def, specs, &context)); - // TODO(jpienaar): Remove, just simple verification that this works. - module->dump(); - return ConvertMlirToGraph(*module, confs, options.graph, options.flib_def); + if (failed(mlir::verify(*module))) { + // TODO(jpienaar): Remove, just simple verification that this works. + module->dump(); + return errors::Internal("Verifier failed on MLIR import for the graph"); + } + auto status = + ConvertMlirToGraph(*module, confs, options.graph, options.flib_def); + if (!status.ok()) module->dump(); + return status; } REGISTER_OPTIMIZATION(OptimizationPassRegistry::PRE_PLACEMENT, 0, diff --git a/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_pass.h b/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_pass.h index 96a66d4eab3..41417edcecf 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_pass.h +++ b/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_pass.h @@ -16,7 +16,7 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSLATE_MLIR_ROUNDTRIP_PASS_H_ #define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSLATE_MLIR_ROUNDTRIP_PASS_H_ -#include "mlir/StandardOps/Ops.h" // TF:local_config_mlir +#include "mlir/Dialect/StandardOps/Ops.h" // TF:local_config_mlir #include "tensorflow/core/common_runtime/optimization_registry.h" #include "tensorflow/core/lib/core/status.h" diff --git a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.cc b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.cc index 5c7b1e824fe..604fced24d4 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.cc @@ -26,7 +26,7 @@ limitations under the License. 
#include "mlir/IR/Operation.h" // TF:local_config_mlir #include "mlir/IR/StandardTypes.h" // TF:local_config_mlir #include "mlir/Parser.h" // TF:local_config_mlir -#include "tensorflow/compiler/mlir/tensorflow/translate/import_graphdef.h" +#include "tensorflow/compiler/mlir/tensorflow/translate/import_model.h" #include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" #include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" #include "tensorflow/compiler/mlir/tensorflow/utils/import_utils.h" @@ -46,6 +46,7 @@ static StatusOr GraphdefToMlirImport( absl::string_view input_shapes, absl::string_view output_arrays, absl::string_view inference_type, absl::string_view min_values, absl::string_view max_values, bool prune_unused_nodes, + bool convert_legacy_fed_inputs, bool graph_as_function, mlir::MLIRContext* context) { GraphDef graphdef; TF_RETURN_IF_ERROR(tensorflow::LoadProtoFromFile(input_filename, &graphdef)); @@ -57,6 +58,8 @@ static StatusOr GraphdefToMlirImport( NodeSpecs specs; specs.prune_unused_nodes = prune_unused_nodes; + specs.convert_legacy_fed_inputs = convert_legacy_fed_inputs; + specs.graph_as_function = graph_as_function; TF_RETURN_IF_ERROR(ParseInputArrayInfo( input_arrays, input_dtypes, input_shapes, inference_type, min_values, max_values, &specs.inputs)); @@ -71,15 +74,51 @@ mlir::OwningModuleRef GraphdefToMlirTranslateFunction( absl::string_view input_shapes, absl::string_view output_arrays, absl::string_view inference_type, absl::string_view min_values, absl::string_view max_values, bool prune_unused_nodes, + bool convert_legacy_fed_inputs, bool graph_as_function, mlir::MLIRContext* context) { auto module_or = GraphdefToMlirImport( input_filename, debug_info_file, input_arrays, input_dtypes, input_shapes, output_arrays, inference_type, min_values, max_values, prune_unused_nodes, - context); + convert_legacy_fed_inputs, graph_as_function, context); if (!module_or.status().ok()) { LOG(ERROR) << "Graph import failed: " << module_or.status(); return nullptr; } + + return module_or.ConsumeValueOrDie(); +} + +mlir::OwningModuleRef SavedModelToMlirImport( + absl::string_view saved_model_dir, + const std::unordered_set& tags, + absl::string_view debug_info_file, mlir::MLIRContext* context) { + SessionOptions session_options; + RunOptions run_options; + tensorflow::SavedModelBundle bundle; + auto load_status = LoadSavedModel( + session_options, run_options, + std::string(saved_model_dir.data(), saved_model_dir.length()), tags, + &bundle); + if (!load_status.ok()) { + LOG(ERROR) << "Failed to load saved model '" << saved_model_dir + << "': " << load_status; + return nullptr; + } + + GraphDebugInfo debug_info; + if (!debug_info_file.empty()) { + if (!LoadProtoFromFile(debug_info_file, &debug_info).ok()) { + LOG(ERROR) << "Failed to load debug info file: " << debug_info_file; + return nullptr; + } + } + + auto module_or = ConvertSavedModelToMlir(bundle, debug_info, context); + + if (!module_or.status().ok()) { + LOG(ERROR) << "SavedModel import failed: " << module_or.status(); + return nullptr; + } return module_or.ConsumeValueOrDie(); } @@ -89,11 +128,12 @@ mlir::OwningModuleRef GraphdefToSplattedMlirTranslateFunction( absl::string_view input_shapes, absl::string_view output_arrays, absl::string_view inference_type, absl::string_view min_values, absl::string_view max_values, bool prune_unused_nodes, + bool convert_legacy_fed_inputs, bool graph_as_function, mlir::MLIRContext* context) { auto module_or = GraphdefToMlirImport( input_filename, 
debug_info_file, input_arrays, input_dtypes, input_shapes, output_arrays, inference_type, min_values, max_values, prune_unused_nodes, - context); + convert_legacy_fed_inputs, graph_as_function, context); if (!module_or.status().ok()) { LOG(ERROR) << "Graph import failed: " << module_or.status(); return nullptr; diff --git a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h index 794a2ef9fcb..290223017b4 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h +++ b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h @@ -16,6 +16,9 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSLATE_TF_MLIR_TRANSLATE_H_ #define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSLATE_TF_MLIR_TRANSLATE_H_ +#include +#include + #include "absl/strings/string_view.h" #include "mlir/IR/MLIRContext.h" // TF:local_config_mlir #include "mlir/IR/Module.h" // TF:local_config_mlir @@ -33,6 +36,7 @@ mlir::OwningModuleRef GraphdefToMlirTranslateFunction( absl::string_view input_shapes, absl::string_view output_arrays, absl::string_view inference_type, absl::string_view min_values, absl::string_view max_values, bool prune_unused_nodes, + bool convert_legacy_fed_inputs, bool graph_as_function, mlir::MLIRContext* context); // Similar as the above function, but replaces all constant tensors @@ -43,7 +47,16 @@ mlir::OwningModuleRef GraphdefToSplattedMlirTranslateFunction( absl::string_view input_shapes, absl::string_view output_arrays, absl::string_view inference_type, absl::string_view min_values, absl::string_view max_values, bool prune_unused_nodes, + bool convert_legacy_fed_inputs, bool graph_as_function, mlir::MLIRContext* context); + +// Converts a TensorFlow SavedModel stored in the directory with the given +// `saved_model_dir` into a MLIR module. Creates MLIR entities into the +// given MLIR `context`. 
+mlir::OwningModuleRef SavedModelToMlirImport( + absl::string_view saved_model_dir, + const std::unordered_set& tags, + absl::string_view debug_info_file, mlir::MLIRContext* context); } // namespace tensorflow #endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSLATE_TF_MLIR_TRANSLATE_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_cl.cc b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_cl.cc index 8e74296b4fc..80df3665007 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_cl.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_cl.cc @@ -84,3 +84,21 @@ opt prune_unused_nodes( "tf-prune-unused-nodes", llvm::cl::desc("Prune unused nodes in the input graphdef "), llvm::cl::init(false)); + +// NOLINTNEXTLINE +opt convert_legacy_fed_inputs( + "tf-convert-legacy-fed-inputs", + llvm::cl::desc( + "Eliminate LegacyFedInput nodes by replacing them with Placeholder "), + llvm::cl::init(false)); + +opt graph_as_function("tf-graph-as-function", + llvm::cl::desc("Treat main graph as a function "), + llvm::cl::init(false)); + +// NOLINTNEXTLINE +opt saved_model_tags( + "tf-savedmodel-tags", + llvm::cl::desc("Tags used to indicate which MeataGraphDef to import, " + "separated by ','"), + llvm::cl::init("serve")); diff --git a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_cl.h b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_cl.h index 8cf17e3a3f0..c5d609acb95 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_cl.h +++ b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_cl.h @@ -35,5 +35,9 @@ extern llvm::cl::opt min_values; extern llvm::cl::opt max_values; extern llvm::cl::opt debug_info_file; extern llvm::cl::opt prune_unused_nodes; +extern llvm::cl::opt convert_legacy_fed_inputs; +extern llvm::cl::opt graph_as_function; + +extern llvm::cl::opt saved_model_tags; #endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSLATE_TF_MLIR_TRANSLATE_CL_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_registration.cc b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_registration.cc index 7d7632d7e82..90e305f64aa 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_registration.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_registration.cc @@ -45,18 +45,30 @@ static OwningModuleRef GraphdefToMlirTranslateFunction( return tensorflow::GraphdefToMlirTranslateFunction( StringRefToView(input_filename), debug_info_file, input_arrays, input_dtypes, input_shapes, output_arrays, inference_type, min_values, - max_values, prune_unused_nodes, context); + max_values, prune_unused_nodes, convert_legacy_fed_inputs, + graph_as_function, context); } static TranslateToMLIRRegistration GraphdefToMlirTranslate( "graphdef-to-mlir", GraphdefToMlirTranslateFunction); +static OwningModuleRef SavedModelToMlirTranslateFunction( + llvm::StringRef input_filename, MLIRContext* context) { + std::unordered_set tags = absl::StrSplit(saved_model_tags, ','); + return tensorflow::SavedModelToMlirImport(StringRefToView(input_filename), + tags, debug_info_file, context); +} + +static TranslateToMLIRRegistration SavedModelToMlirTranslate( + "savedmodel-to-mlir", SavedModelToMlirTranslateFunction); + static OwningModuleRef GraphdefToSplattedMlirTranslateFunction( llvm::StringRef input_filename, MLIRContext* context) { return tensorflow::GraphdefToSplattedMlirTranslateFunction( 
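The new --tf-savedmodel-tags option above is a comma-separated list (default "serve") that the registration code splits with absl::StrSplit into the tag set handed to the SavedModel loader. A standard-library version of that splitting:

#include <iostream>
#include <sstream>
#include <string>
#include <unordered_set>

// Splits a comma-separated tag string such as "serve,gpu" into the tag set
// passed to LoadSavedModel; empty segments are dropped.
std::unordered_set<std::string> ParseTags(const std::string& flag_value) {
  std::unordered_set<std::string> tags;
  std::stringstream ss(flag_value);
  std::string tag;
  while (std::getline(ss, tag, ',')) {
    if (!tag.empty()) tags.insert(tag);
  }
  return tags;
}

int main() {
  for (const auto& t : ParseTags("serve,gpu")) std::cout << t << "\n";
}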
StringRefToView(input_filename), debug_info_file, input_arrays, input_dtypes, input_shapes, output_arrays, inference_type, min_values, - max_values, prune_unused_nodes, context); + max_values, prune_unused_nodes, convert_legacy_fed_inputs, + graph_as_function, context); } static TranslateToMLIRRegistration GraphdefToSplattedMlirTranslate( @@ -67,8 +79,8 @@ static LogicalResult MlirToGraphdefTranslateFunction( if (!module) return failure(); std::error_code error; - auto result = llvm::make_unique(output_filename, error, - llvm::sys::fs::F_None); + auto result = std::make_unique(output_filename, error, + llvm::sys::fs::F_None); if (error) { LOG(ERROR) << error.message(); return failure(); diff --git a/tensorflow/compiler/mlir/tensorflow/translate/translate_tf_dialect_op.cc b/tensorflow/compiler/mlir/tensorflow/translate/translate_tf_dialect_op.cc index 9c02ce2278f..ac0f4d2adc0 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/translate_tf_dialect_op.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/translate_tf_dialect_op.cc @@ -59,7 +59,8 @@ static LogicalResult MlirToTfNodeDef(ModuleOp module, return failure(); } - auto node_def_or = tensorflow::ConvertTFDialectOpToNodeDef(op, "node_name"); + auto node_def_or = tensorflow::ConvertTFDialectOpToNodeDef( + op, "node_name", /*ignore_unregistered_attrs=*/false); if (!node_def_or.ok()) { op->emitError("failed to convert to TF NodeDef:") << node_def_or.status().ToString(); diff --git a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc index f66b07b246a..df19e169d3c 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc @@ -42,20 +42,23 @@ namespace tensorflow { using llvm::ArrayRef; using llvm::SmallVector; -using mlir::Attribute; -using mlir::BoolAttr; using mlir::Builder; using mlir::DenseFPElementsAttr; using mlir::DenseIntElementsAttr; using mlir::ElementsAttr; -using mlir::FloatAttr; -using mlir::IntegerAttr; using mlir::OpaqueElementsAttr; using mlir::ShapedType; -using mlir::SplatElementsAttr; using mlir::Type; using tensorflow::errors::InvalidArgument; +void ConvertToMlirShape(const TensorShape& input_shape, + llvm::SmallVectorImpl* shape) { + shape->reserve(input_shape.dims()); + for (const auto& d : input_shape) { + shape->push_back(d.size); + } +} + Status ConvertToMlirShape(const TensorShapeProto& input_shape, llvm::SmallVectorImpl* shape) { shape->reserve(input_shape.dim_size()); @@ -69,174 +72,72 @@ Status ConvertToMlirShape(const TensorShapeProto& input_shape, return Status::OK(); } -// Converts an TensorFlow tensor proto to an MLIR opaque elements attribute. -StatusOr ConvertToOpaqueElementsAttr( - const TensorProto& input_tensor, ShapedType type, Builder* builder) { - // TODO(shpeisman): restructure code to reuse dialect pointer across calls. - auto* dialect = builder->getContext()->getRegisteredDialect("tf"); - return builder->getOpaqueElementsAttr( - dialect, type, mangling_util::MangleTensor(input_tensor)); +static TensorProto ConvertToProto(const Tensor& input_tensor, + bool use_tensor_content = true) { + TensorProto tensor_proto; + // Using tensor content (mostly*) reduces serialization overhead during RPC + // calls, but is less human reader friendly. People reading protobufs are less + // frequent than serialization, so default to using tensor content + // representation. 
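ConvertToProto above defaults to Tensor::AsProtoTensorContent, which stores the values as one opaque tensor_content blob rather than as per-element repeated proto fields. The standalone sketch below illustrates why the content form is compact: for plain numeric types it is essentially the raw bytes of the flat value array. The real encoding is defined by TensorProto, so treat this as an approximation of the idea, not the actual serializer.

#include <cstring>
#include <iostream>
#include <string>
#include <vector>

// Packs a float array into a single byte string, roughly what a
// tensor_content-style representation amounts to for POD element types.
std::string PackAsContentBytes(const std::vector<float>& values) {
  std::string bytes(values.size() * sizeof(float), '\0');
  if (bytes.empty()) return bytes;
  std::memcpy(&bytes[0], values.data(), bytes.size());
  return bytes;
}

int main() {
  std::vector<float> values(1024, 1.5f);
  std::cout << "raw content bytes: " << PackAsContentBytes(values).size() << "\n";
}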
+ // * For scalars and short strings it may be marginally worse and a more + // intelligent decision could be made by caller. + if (use_tensor_content) + input_tensor.AsProtoTensorContent(&tensor_proto); + else + input_tensor.AsProtoField(&tensor_proto); + return tensor_proto; } -// Template predicate that provides a constant member `value` equal to true if -// a sequence of `From` values can be copied wholesale to locations for `To` -// values. - -// Primary template declaration -template -struct IsBatchCopyable; - -// Partial template specialization: allow wholesale copy for the same type -template -struct IsBatchCopyable : std::true_type {}; - -// SFINAE: integral types depend on the bitwidth -template -struct IsBatchCopyable< - From, To, - typename std::enable_if::value && - std::is_integral::value>::type> { - static constexpr bool value = - std::numeric_limits::digits == std::numeric_limits::digits; -}; - -// Converts an TensorFlow tensor proto to an MLIR dense elements attribute. -// To save the memory held by the attribute, the value is casted to the -// specified type. -template -typename std::enable_if::value, - StatusOr>::type -ConvertToDenseElementsAttr( - const tensorflow::protobuf::RepeatedField& values, ShapedType type, - Builder* builder) { - return mlir::DenseElementsAttr::get( - type, llvm::makeArrayRef(values.data(), values.size())); +static std::string MangleTensor(const Tensor& tensor) { + return mangling_util::MangleTensor(ConvertToProto(tensor)); } -template -typename std::enable_if::value, - StatusOr>::type -ConvertToDenseElementsAttr( - const tensorflow::protobuf::RepeatedField& values, ShapedType type, - Builder* builder) { - std::vector buff; - buff.reserve(values.size()); - for (auto value : values) { - buff.push_back(value); - } - return mlir::DenseElementsAttr::get(type, llvm::makeArrayRef(buff)); -} - -// Converts an TensorFlow tensor proto with DT_FLOAT data type into an MLIR -// elements attribute. -StatusOr ConvertFloatTensor(const TensorProto& input_tensor, - ShapedType type, Builder* builder) { - // When the repeated "float_val" field only has one element, it is converted - // to a splat elements attribute; When it has more than one element, it is - // converted to a dense elements attribute; otherwise, convert the whole - // tensor to an opaque elements attribute if the "tensor_content" field is - // set. - auto repeated_val_size = input_tensor.float_val_size(); - if (repeated_val_size == 1 || repeated_val_size == type.getNumElements()) { - return ConvertToDenseElementsAttr(input_tensor.float_val(), - type, builder); - } - return ConvertToOpaqueElementsAttr(input_tensor, type, builder); -} - -// Converts an TensorFlow tensor proto with DT_INT32, DT_INT16, DT_INT8, -// DT_UINT8, DT_QUINT8 data type into an MLIR elements attribute. +// Converts a TensorFlow tensor into an MLIR elements attribute. template -StatusOr ConvertIntTensor(const TensorProto& input_tensor, - ShapedType type, Builder* builder) { - // When the repeated "int_val" field only has one element, it is converted to - // a splat elements attribute; When it has more than one element, it is - // converted to a dense elements attribute; otherwise, convert the whole - // tensor to an opaque elements attribute if the "tensor_content" field is - // set. 
- auto repeated_val_size = input_tensor.int_val_size(); - if (repeated_val_size == 1 || repeated_val_size == type.getNumElements()) { - return ConvertToDenseElementsAttr(input_tensor.int_val(), type, - builder); - } - return ConvertToOpaqueElementsAttr(input_tensor, type, builder); -} - -// Converts an TensorFlow tensor proto with DT_INT64 data type into an MLIR -// elements attribute. -StatusOr ConvertInt64Tensor(const TensorProto& input_tensor, - ShapedType type, Builder* builder) { - // When the repeated "int64_val" field only has one element, it is converted - // to a splat elements attribute; When it has more than one element, it is - // converted to a dense elements attribute; otherwise, convert the whole - // tensor to an opaque elements attribute if the "tensor_content" field is - // set. - auto repeated_val_size = input_tensor.int64_val_size(); - if (repeated_val_size == 1 || repeated_val_size == type.getNumElements()) { - return ConvertToDenseElementsAttr(input_tensor.int64_val(), type, - builder); - } - return ConvertToOpaqueElementsAttr(input_tensor, type, builder); -} - -// Converts an TensorFlow tensor proto with DT_BOOL data type into an MLIR -// elements attribute. -StatusOr ConvertBoolTensor(const TensorProto& input_tensor, +StatusOr ConvertFlatTensor(const Tensor& input_tensor, ShapedType type, Builder* builder) { - // When the repeated "bool_val" field only has one element, it is converted to - // a splat elements attribute; When it has more than one element, it is - // converted to a dense elements attribute; otherwise, convert the whole - // tensor to an opaque elements attribute if the "tensor_content" field is - // set. - auto repeated_val_size = input_tensor.bool_val_size(); - if (repeated_val_size == 1 || repeated_val_size == type.getNumElements()) { - const auto& proto = input_tensor.bool_val(); - return mlir::DenseElementsAttr::get( - type, llvm::makeArrayRef(proto.data(), proto.size())); + auto arr = input_tensor.flat(); + return mlir::DenseElementsAttr::get( + type, llvm::makeArrayRef(arr.data(), arr.size())); +} + +StatusOr ConvertTensor(const Tensor& input_tensor, + Builder* builder) { + const auto& input_dtype = input_tensor.dtype(); + const auto& input_shape = input_tensor.shape(); + Type elt_type; + TF_RETURN_IF_ERROR(ConvertDataType(input_dtype, *builder, &elt_type)); + SmallVector shape; + ConvertToMlirShape(input_shape, &shape); + auto type = builder->getTensorType(shape, elt_type); + +#define CONVERT_FLAT(DTYPE, CTYPE) \ + case DTYPE: \ + return ConvertFlatTensor(input_tensor, type, builder); + + // TODO(fengliuai): customize the conversions for more types. + switch (input_dtype) { + CONVERT_FLAT(DT_BOOL, bool) + CONVERT_FLAT(DT_FLOAT, float) + CONVERT_FLAT(DT_INT32, int32) + CONVERT_FLAT(DT_INT64, int64) + default: + // TODO(shpeisman): restructure code to reuse dialect pointer across + // calls. 
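The new ConvertTensor above dispatches on dtype with a small CONVERT_FLAT macro, so each supported type expands to the same templated flat-tensor conversion and anything else falls back to an opaque attribute. A self-contained imitation of that dispatch pattern, with toy enum values and a toy summary function in place of the TensorFlow and MLIR types:

#include <cstdint>
#include <iostream>
#include <string>

enum ToyDType { DT_BOOL_TOY, DT_FLOAT_TOY, DT_INT32_TOY };

// One function template handles every element type; the macro below stamps
// out a switch case per supported dtype that instantiates it.
template <typename T>
std::string Summarize(const void* data, size_t n) {
  const T* typed = static_cast<const T*>(data);
  std::string out;
  for (size_t i = 0; i < n; ++i) out += std::to_string(typed[i]) + " ";
  return out;
}

#define CONVERT_FLAT_TOY(DTYPE, CTYPE) \
  case DTYPE:                          \
    return Summarize<CTYPE>(data, n);

std::string SummarizeFlat(ToyDType dtype, const void* data, size_t n) {
  switch (dtype) {
    CONVERT_FLAT_TOY(DT_BOOL_TOY, bool)
    CONVERT_FLAT_TOY(DT_FLOAT_TOY, float)
    CONVERT_FLAT_TOY(DT_INT32_TOY, int32_t)
    default:
      return "<unsupported dtype: fall back to an opaque representation>";
  }
}
#undef CONVERT_FLAT_TOY

int main() {
  float vals[] = {1.0f, 2.5f};
  std::cout << SummarizeFlat(DT_FLOAT_TOY, vals, 2) << "\n";
}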
+ auto* dialect = builder->getContext()->getRegisteredDialect("tf"); + return builder->getOpaqueElementsAttr(dialect, type, + MangleTensor(input_tensor)); } - return ConvertToOpaqueElementsAttr(input_tensor, type, builder); + +#undef CONVERT_FLAT } StatusOr ConvertTensorProto(const TensorProto& input_tensor, Builder* builder) { - const auto& input_dtype = input_tensor.dtype(); - const auto& input_shape = input_tensor.tensor_shape(); - Type elt_type; - TF_RETURN_IF_ERROR(ConvertDataType(input_dtype, *builder, &elt_type)); - SmallVector shape; - TF_RETURN_IF_ERROR(ConvertToMlirShape(input_shape, &shape)); - auto type = builder->getTensorType(shape, elt_type); - - // TODO(fengliuai): customize the conversions for more types. - switch (input_dtype) { - case DT_FLOAT: - return ConvertFloatTensor(input_tensor, type, builder); - case DT_INT32: - return ConvertIntTensor(input_tensor, type, builder); - case DT_INT64: - return ConvertInt64Tensor(input_tensor, type, builder); - case DT_BOOL: - return ConvertBoolTensor(input_tensor, type, builder); - default: - // The value of the opaque elements attribute contains the whole tensor - // proto, not just the tensor content. - - // TODO(shpeisman): restructure code to reuse dialect pointer across - // calls. - auto* dialect = builder->getContext()->getRegisteredDialect("tf"); - - return builder->getOpaqueElementsAttr( - dialect, type, mangling_util::MangleTensor(input_tensor)); - } -} - -StatusOr ConvertTensor(const Tensor& input_tensor, - mlir::Builder* builder) { - TensorProto input_proto; - // This decodes the tensor content into a proper proto field. - input_tensor.AsProtoField(&input_proto); - return ConvertTensorProto(input_proto, builder); + Tensor t; + if (!t.FromProto(input_tensor)) + return InvalidArgument("Failed to parse input_tensor."); + return ConvertTensor(t, builder); } Status ConvertToTensorShapeProto(ArrayRef shape, @@ -247,7 +148,7 @@ Status ConvertToTensorShapeProto(ArrayRef shape, return Status::OK(); } -// Converts an MLIR opaque elements attribute to an TensorFlow tensor proto. +// Converts an MLIR opaque elements attribute to a TensorFlow tensor proto. Status ConvertOpaqueElementsAttr(const ElementsAttr attr, TensorProto* output_tensor) { if (attr.isa()) { @@ -258,49 +159,70 @@ Status ConvertOpaqueElementsAttr(const ElementsAttr attr, return InvalidArgument("Unexpected elements attribute type from MLIR."); } -// Converts an MLIR elements attribute to an TensorFlow tensor proto +// Converts an MLIR elements attribute to a TensorFlow tensor proto // with the float_val field updated. Status ConvertFloatElementsAttr(const ElementsAttr attr, TensorProto* output_tensor) { if (auto elts = attr.dyn_cast()) { - for (auto value : elts.getValues()) { - output_tensor->add_float_val(value); + if (elts.isSplat()) { + output_tensor->add_float_val(elts.getSplatValue()); + } else { + for (auto value : elts.getValues()) + output_tensor->add_float_val(value); } - } else { - return ConvertOpaqueElementsAttr(attr, output_tensor); + return Status::OK(); } - return Status::OK(); + return ConvertOpaqueElementsAttr(attr, output_tensor); } -// Converts an MLIR elements attribute to an TensorFlow tensor proto +// Converts an MLIR elements attribute to a TensorFlow tensor proto +// with the half_val field updated. 
+Status ConvertHalfElementsAttr(const ElementsAttr attr, + TensorProto* output_tensor) { + if (auto elts = attr.dyn_cast()) { + if (elts.isSplat()) { + output_tensor->add_half_val( + (*elts.begin()).bitcastToAPInt().getSExtValue()); + } else { + for (auto value : elts.getFloatValues()) + output_tensor->add_half_val(value.bitcastToAPInt().getSExtValue()); + } + return Status::OK(); + } + return ConvertOpaqueElementsAttr(attr, output_tensor); +} + +// Converts an MLIR elements attribute to a TensorFlow tensor proto // with the int_val field updated. Status ConvertIntElementsAttr(const mlir::ElementsAttr attr, TensorProto* output_tensor) { if (auto elts = attr.dyn_cast()) { - for (auto val : elts) { - output_tensor->add_int_val(val.getSExtValue()); + if (elts.isSplat()) { + output_tensor->add_int_val((*elts.begin()).getSExtValue()); + } else { + for (auto val : elts) output_tensor->add_int_val(val.getSExtValue()); } - } else { - return ConvertOpaqueElementsAttr(attr, output_tensor); + return Status::OK(); } - return Status::OK(); + return ConvertOpaqueElementsAttr(attr, output_tensor); } -// Converts an MLIR elements attribute to an TensorFlow tensor proto +// Converts an MLIR elements attribute to a TensorFlow tensor proto // with the int64_val field updated. Status ConvertInt64ElementsAttr(const mlir::ElementsAttr attr, TensorProto* output_tensor) { if (auto elts = attr.dyn_cast()) { - for (auto val : elts) { - output_tensor->add_int64_val(val.getSExtValue()); + if (elts.isSplat()) { + output_tensor->add_int64_val((*elts.begin()).getSExtValue()); + } else { + for (auto val : elts) output_tensor->add_int64_val(val.getSExtValue()); } - } else { - return ConvertOpaqueElementsAttr(attr, output_tensor); + return Status::OK(); } - return Status::OK(); + return ConvertOpaqueElementsAttr(attr, output_tensor); } -// Converts an MLIR elements attribute to an TensorFlow tensor proto +// Converts an MLIR elements attribute to a TensorFlow tensor proto // with bool_val field updated. Status ConvertBoolElementsAttr(const mlir::ElementsAttr attr, TensorProto* output_tensor) { @@ -308,10 +230,9 @@ Status ConvertBoolElementsAttr(const mlir::ElementsAttr attr, for (auto val : elts) { output_tensor->add_bool_val(val.getBoolValue()); } - } else { - return ConvertOpaqueElementsAttr(attr, output_tensor); + return Status::OK(); } - return Status::OK(); + return ConvertOpaqueElementsAttr(attr, output_tensor); } Status ConvertToTensorProto(const ElementsAttr attr, @@ -327,6 +248,9 @@ Status ConvertToTensorProto(const ElementsAttr attr, switch (output_dtype) { case DT_FLOAT: return ConvertFloatElementsAttr(attr, output_tensor); + case DT_HALF: + // Handles both DenseFPElementsAttr and OpaqueElementsAttr. 
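The element-attribute converters above now special-case splat attributes: a constant whose elements are all identical is written as a single value in the corresponding *_val field instead of one entry per element (the DT_HALF case additionally stores the fp16 bit pattern as an integer). A standalone sketch of the splat shortcut, using int values and a plain vector in place of the proto's repeated field:

#include <iostream>
#include <vector>

// Appends either one value (splat) or every element to the output field,
// mirroring the isSplat() branch added above.
void AppendValues(const std::vector<int>& elements, std::vector<int>* repeated_field) {
  bool is_splat = !elements.empty();
  for (int v : elements) {
    if (v != elements.front()) {
      is_splat = false;
      break;
    }
  }
  if (is_splat) {
    repeated_field->push_back(elements.front());  // single splat value
  } else {
    repeated_field->insert(repeated_field->end(), elements.begin(), elements.end());
  }
}

int main() {
  std::vector<int> out;
  AppendValues({7, 7, 7, 7}, &out);
  std::cout << "stored " << out.size() << " value(s)\n";
}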
+ return ConvertHalfElementsAttr(attr, output_tensor); case DT_QUINT8: case DT_UINT8: case DT_INT8: diff --git a/tensorflow/compiler/mlir/tensorflow/utils/error_util_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/error_util_test.cc index 5d6cd1bb222..4e59cec86ab 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/error_util_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/error_util_test.cc @@ -29,12 +29,8 @@ using testing::HasSubstr; TEST(ErrorUtilTest, StatusScopedDiagnosticHandler) { MLIRContext context; - - auto emit_error = [&](const std::string& msg) { - emitError(FileLineColLoc::get(Identifier::get("test.cc", &context), 10, 32, - &context), - msg); - }; + auto id = Identifier::get("test.cc", &context); + auto loc = FileLineColLoc::get(id, 0, 0, &context); // Test OK without diagnostic gets passed through. { @@ -44,7 +40,7 @@ TEST(ErrorUtilTest, StatusScopedDiagnosticHandler) { // Verify diagnostics are captured as Unknown status. { StatusScopedDiagnosticHandler handler(&context); - emit_error("Diagnostic message"); + emitError(loc) << "Diagnostic message"; ASSERT_TRUE(tensorflow::errors::IsUnknown(handler.ConsumeStatus())); } @@ -58,8 +54,8 @@ TEST(ErrorUtilTest, StatusScopedDiagnosticHandler) { // Verify diagnostic reported are append to passed in error. { auto function = [&]() { - emit_error("Diagnostic message reported"); - emit_error("Second diagnostic message reported"); + emitError(loc) << "Diagnostic message reported"; + emitError(loc) << "Second diagnostic message reported"; return tensorflow::errors::Internal("Passed in error"); }; Status s = StatusScopedDiagnosticHandler(&context).Combine(function()); diff --git a/tensorflow/compiler/mlir/tensorflow/utils/eval_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/eval_util.cc index a821c868d4a..29a4388de30 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/eval_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/eval_util.cc @@ -78,7 +78,8 @@ mlir::LogicalResult EvaluateOperation( if (auto attr = inst->getAttrOfType("name")) { node_name = attr.getValue(); } - auto node_def_or = ConvertTFDialectOpToNodeDef(inst, node_name.c_str()); + auto node_def_or = ConvertTFDialectOpToNodeDef( + inst, node_name.c_str(), /*ignore_unregistered_attrs=*/true); RETURN_FAILURE_IF_ERROR(node_def_or.status()); const auto& node_def = node_def_or.ValueOrDie(); TFE_Op* op = TFE_NewOp(context, node_def->op().c_str(), status); diff --git a/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc b/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc index a2f803c0858..48826520949 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc @@ -23,6 +23,7 @@ limitations under the License. #include "absl/strings/string_view.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/Casting.h" +#include "mlir/Dialect/StandardOps/Ops.h" // TF:local_config_mlir #include "mlir/IR/Attributes.h" // TF:local_config_mlir #include "mlir/IR/Function.h" // TF:local_config_mlir #include "mlir/IR/Identifier.h" // TF:local_config_mlir @@ -30,7 +31,6 @@ limitations under the License. 
#include "mlir/IR/Module.h" // TF:local_config_mlir #include "mlir/IR/Operation.h" // TF:local_config_mlir #include "mlir/IR/OperationSupport.h" // TF:local_config_mlir -#include "mlir/StandardOps/Ops.h" // TF:local_config_mlir #include "mlir/Support/DebugStringHelper.h" // TF:local_config_mlir #include "tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h" #include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h" @@ -160,9 +160,34 @@ Status ConvertAttribute(const mlir::ArrayAttr& attr, AttrValue* value) { return Status::OK(); } +// Updates NodeDef constructed out of an MLIR If op to map it to either +// TensorFlow StatelessIf or If op depending on the additional attribute. +void UpdateCompositeIfOp(NodeDef* node_def) { + auto it = node_def->mutable_attr()->find("is_stateless"); + if (it != node_def->attr().end()) { + if (it->second.b()) { + *node_def->mutable_op() = "StatelessIf"; + } + node_def->mutable_attr()->erase(it); + } +} + +// Updates NodeDef constructed out of an MLIR While op to map it to either +// TensorFlow StatelessWhile or While op depending on the additional attribute. +void UpdateCompositeWhileOp(NodeDef* node_def) { + auto it = node_def->mutable_attr()->find("is_stateless"); + if (it != node_def->attr().end()) { + if (it->second.b()) { + *node_def->mutable_op() = "StatelessWhile"; + } + node_def->mutable_attr()->erase(it); + } +} + } // anonymous namespace StatusOr> GetOperationNodeDef( + const absl::flat_hash_set& attrs_to_ignore, mlir::Operation* inst, llvm::StringRef name, OpNameMappingFunc op_name_func) { auto node_def = absl::make_unique(); @@ -184,7 +209,6 @@ StatusOr> GetOperationNodeDef( } // Add the node attributes. - absl::flat_hash_set attrs_to_ignore; TF_RETURN_WITH_CONTEXT_IF_ERROR( ConvertAttributes(inst->getAttrs(), attrs_to_ignore, node_def->mutable_attr()), @@ -194,12 +218,16 @@ StatusOr> GetOperationNodeDef( TF_RETURN_IF_ERROR(ConvertLocation( inst->getLoc(), node_def->mutable_experimental_debug_info())); + if (node_def->op() == "If") UpdateCompositeIfOp(node_def.get()); + if (node_def->op() == "While") UpdateCompositeWhileOp(node_def.get()); + return node_def; } -Status ConvertAttributes(const llvm::ArrayRef attrs, - const absl::flat_hash_set& attrs_to_ignore, - AttrValueMap* values) { +Status ConvertAttributes( + const llvm::ArrayRef attrs, + const absl::flat_hash_set& attrs_to_ignore, + AttrValueMap* values) { AttrValueMap func_call_attrs; for (const mlir::NamedAttribute& named_attr : attrs) { auto name_strref = named_attr.first.str(); diff --git a/tensorflow/compiler/mlir/tensorflow/utils/export_utils.h b/tensorflow/compiler/mlir/tensorflow/utils/export_utils.h index 4c6d8ade04a..0f1994aca43 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/export_utils.h +++ b/tensorflow/compiler/mlir/tensorflow/utils/export_utils.h @@ -43,17 +43,21 @@ using OpNameMappingFunc = std::function(llvm::StringRef)>; // Converts an MLIR operation to TensorFlow NodeDef with given node name. This // name should be unique to the graph it is being inserted into. `op_name_func` -// is to map the op name of `inst` to its op name in TensorFlow. +// is to map the op name of `inst` to its op name in TensorFlow. "name" and +// "device" attributes are ignored by default. Use attrs_to_ignore to specify +// any other attributes that should be ignored. 
StatusOr> GetOperationNodeDef( + const absl::flat_hash_set& attrs_to_ignore, mlir::Operation* inst, llvm::StringRef name, OpNameMappingFunc op_name_func); // Converts MLIR attributes with values to their tensorflow equivalent. // "name" and "device" attributes are ignored by default. Use attrs_to_ignore to // specify any other attributes that should be ignored. -Status ConvertAttributes(const llvm::ArrayRef attrs, - const absl::flat_hash_set& attrs_to_ignore, - AttrValueMap* values); +Status ConvertAttributes( + const llvm::ArrayRef attrs, + const absl::flat_hash_set& attrs_to_ignore, + AttrValueMap* values); // Sets type attribute with the given name. If the attribute already exists with // a different value, returns an error. diff --git a/tensorflow/compiler/mlir/tensorflow/utils/mangling_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/mangling_util.cc index 776a7ac71b2..691caab526a 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/mangling_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/mangling_util.cc @@ -69,7 +69,7 @@ MangledKind GetMangledKind(absl::string_view str) { } string MangleShape(const TensorShapeProto& shape) { - return absl::StrCat(kTensorShapePrefix, shape.DebugString()); + return absl::StrCat(kTensorShapePrefix, shape.ShortDebugString()); } Status DemangleShape(absl::string_view str, TensorShapeProto* proto) { @@ -85,7 +85,7 @@ Status DemangleShape(absl::string_view str, TensorShapeProto* proto) { } string MangleTensor(const TensorProto& tensor) { - return absl::StrCat(kTensorPrefix, tensor.DebugString()); + return absl::StrCat(kTensorPrefix, tensor.ShortDebugString()); } Status DemangleTensor(absl::string_view str, TensorProto* proto) { diff --git a/tensorflow/compiler/mlir/tf_mlir_opt_main.cc b/tensorflow/compiler/mlir/tf_mlir_opt_main.cc index aaf4f68f739..3f649c67abf 100644 --- a/tensorflow/compiler/mlir/tf_mlir_opt_main.cc +++ b/tensorflow/compiler/mlir/tf_mlir_opt_main.cc @@ -15,13 +15,13 @@ limitations under the License. #include "llvm/Support/CommandLine.h" #include "llvm/Support/InitLLVM.h" -#include "llvm/Support/PrettyStackTrace.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/ToolOutputFile.h" #include "mlir/Pass/Pass.h" // TF:local_config_mlir #include "mlir/Pass/PassManager.h" // TF:local_config_mlir #include "mlir/Support/FileUtilities.h" // TF:local_config_mlir #include "mlir/Support/MlirOptMain.h" // TF:local_config_mlir +#include "tensorflow/compiler/mlir/init_mlir.h" #include "tensorflow/core/platform/init_main.h" #include "tensorflow/core/platform/logging.h" @@ -58,8 +58,7 @@ static llvm::cl::opt verify_passes( static std::vector *pass_list; int main(int argc, char **argv) { - llvm::PrettyStackTraceProgram x(argc, argv); - llvm::InitLLVM y(argc, argv); + tensorflow::InitMlir y(&argc, &argv); // Register any pass manager command line options. mlir::registerPassManagerCLOptions(); @@ -71,10 +70,6 @@ int main(int argc, char **argv) { llvm::cl::ParseCommandLineOptions(argc, argv, "TF MLIR modular optimizer driver\n"); - // TODO(jpienaar): Enable command line parsing for both sides. - int fake_argc = 1; - tensorflow::port::InitMain(argv[0], &fake_argc, &argv); - // Set up the input file. 
std::string error_message; auto file = mlir::openInputFile(input_filename, &error_message); diff --git a/tensorflow/compiler/mlir/tf_mlir_translate_main.cc b/tensorflow/compiler/mlir/tf_mlir_translate_main.cc new file mode 100644 index 00000000000..fc61e4bc5d0 --- /dev/null +++ b/tensorflow/compiler/mlir/tf_mlir_translate_main.cc @@ -0,0 +1,45 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "llvm/Support/InitLLVM.h" +#include "mlir/IR/MLIRContext.h" // TF:local_config_mlir +#include "mlir/Support/LogicalResult.h" // TF:local_config_mlir +#include "mlir/Support/TranslateClParser.h" // TF:local_config_mlir +#include "tensorflow/compiler/mlir/init_mlir.h" +#include "tensorflow/core/platform/init_main.h" + +// NOLINTNEXTLINE +static llvm::cl::opt input_filename(llvm::cl::Positional, + llvm::cl::desc(""), + llvm::cl::init("-")); + +// NOLINTNEXTLINE +static llvm::cl::opt output_filename( + "o", llvm::cl::desc("Output filename"), llvm::cl::value_desc("filename"), + llvm::cl::init("-")); + +int main(int argc, char** argv) { + tensorflow::InitMlir y(&argc, &argv); + + // Add flags for all the registered translations. + llvm::cl::opt + requested_translation("", llvm::cl::desc("Translation to perform"), + llvm::cl::Required); + llvm::cl::ParseCommandLineOptions(argc, argv, "TF MLIR translation driver\n"); + + mlir::MLIRContext context; + return failed( + (*requested_translation)(input_filename, output_filename, &context)); +} diff --git a/tensorflow/compiler/mlir/xla/BUILD b/tensorflow/compiler/mlir/xla/BUILD index c36299ee263..35c8d2bd0eb 100644 --- a/tensorflow/compiler/mlir/xla/BUILD +++ b/tensorflow/compiler/mlir/xla/BUILD @@ -13,49 +13,63 @@ package_group( "//babelfish/device/...", "//learning/brain/experimental/mlir/...", "//tensorflow/compiler/mlir/...", + "//tensorflow/compiler/xla/...", "//third_party/mlir_edge/...", ], ) filegroup( - name = "xla_ops_td_files", + name = "hlo_ops_td_files", srcs = [ - "ir/xla_ops.td", + "ir/hlo_ops.td", + "ir/hlo_ops_base.td", + "ir/lhlo_ops.td", "@local_config_mlir//:OpBaseTdFiles", ], ) gentbl( - name = "xla_ops_inc_gen", + name = "hlo_ops_inc_gen", tbl_outs = [ - ( - "-gen-op-decls", - "ir/xla_ops.h.inc", - ), - ( - "-gen-op-defs", - "ir/xla_ops.cc.inc", - ), + ("-gen-op-decls", "ir/hlo_ops.h.inc"), + ("-gen-op-defs", "ir/hlo_ops.cc.inc"), ], tblgen = "@local_config_mlir//:mlir-tblgen", - td_file = "ir/xla_ops.td", - td_srcs = [ - ":xla_ops_td_files", + td_file = "ir/hlo_ops.td", + td_srcs = [":hlo_ops_td_files"], +) + +gentbl( + name = "hlo_ops_base_inc_gen", + tbl_outs = [ + ("-gen-op-decls", "ir/hlo_ops_base.h.inc"), + ("-gen-op-defs", "ir/hlo_ops_base.cc.inc"), ], + tblgen = "@local_config_mlir//:mlir-tblgen", + td_file = "ir/hlo_ops_base.td", + td_srcs = [":hlo_ops_td_files"], +) + +gentbl( + name = "lhlo_ops_inc_gen", + tbl_outs = [ + ("-gen-op-decls", "ir/lhlo_ops.h.inc"), + ("-gen-op-defs", "ir/lhlo_ops.cc.inc"), 
+ ], + tblgen = "@local_config_mlir//:mlir-tblgen", + td_file = "ir/lhlo_ops.td", + td_srcs = [":hlo_ops_td_files"], ) gentbl( name = "xla_legalize_tf_inc_gen", tbl_outs = [ - ( - "-gen-rewriters", - "transforms/generated_legalize_tf.inc", - ), + ("-gen-rewriters", "transforms/generated_legalize_tf.inc"), ], tblgen = "@local_config_mlir//:mlir-tblgen", td_file = "transforms/legalize_tf_patterns.td", td_srcs = [ - ":xla_ops_td_files", + ":hlo_ops_td_files", "@local_config_mlir//:StdOpsTdFiles", "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops_td_files", ], @@ -67,8 +81,9 @@ cc_library( "transforms/generated_legalize_tf.inc", "transforms/legalize_tf.cc", ], + copts = ["-std=c++14"], deps = [ - ":xla", + ":hlo", "//tensorflow/compiler/mlir/tensorflow", "@llvm//:support", "@local_config_mlir//:Analysis", @@ -83,15 +98,12 @@ cc_library( gentbl( name = "xla_legalize_to_standard_inc_gen", tbl_outs = [ - ( - "-gen-rewriters", - "transforms/generated_legalize_to_standard.inc", - ), + ("-gen-rewriters", "transforms/generated_legalize_to_standard.inc"), ], tblgen = "@local_config_mlir//:mlir-tblgen", td_file = "transforms/legalize_to_standard_patterns.td", td_srcs = [ - ":xla_ops_td_files", + ":hlo_ops_td_files", "@local_config_mlir//:StdOpsTdFiles", ], ) @@ -101,8 +113,9 @@ cc_library( srcs = [ "transforms/legalize_control_flow.cc", ], + copts = ["-std=c++14"], deps = [ - ":xla", + ":hlo", "//tensorflow/compiler/mlir/tensorflow", "@llvm//:support", "@local_config_mlir//:Analysis", @@ -115,11 +128,10 @@ cc_library( cc_library( name = "xla_legalize_to_standard", - srcs = [ - "transforms/legalize_to_standard.cc", - ], + srcs = ["transforms/legalize_to_standard.cc"], + copts = ["-std=c++14"], deps = [ - ":xla", + ":hlo", ":xla_legalize_to_standard_inc_gen", "//tensorflow/compiler/mlir/tensorflow", "@llvm//:support", @@ -132,19 +144,47 @@ cc_library( ) cc_library( - name = "xla", + name = "hlo", srcs = [ - "ir/xla_ops.cc", - "ir/xla_ops.cc.inc", - "ir/xla_ops.h.inc", + "ir/hlo_ops.cc", + "ir/hlo_ops.cc.inc", + "ir/hlo_ops.h.inc", ], hdrs = [ - "ir/xla_ops.h", + "ir/hlo_ops.h", "transforms/passes.h", ], + copts = ["-std=c++14"], includes = ["include"], deps = [ - ":xla_ops_inc_gen", + ":hlo_ops_base_inc_gen", + ":hlo_ops_inc_gen", + "@llvm//:support", + "@local_config_mlir//:Analysis", + "@local_config_mlir//:IR", + "@local_config_mlir//:Pass", + "@local_config_mlir//:StandardOps", + "@local_config_mlir//:Support", + "@local_config_mlir//:TransformUtils", + ], + alwayslink = 1, +) + +cc_library( + name = "lhlo", + srcs = [ + "ir/lhlo_ops.cc", + "ir/lhlo_ops.cc.inc", + "ir/lhlo_ops.h.inc", + ], + hdrs = [ + "ir/lhlo_ops.h", + "transforms/passes.h", + ], + includes = ["include"], + deps = [ + ":hlo_ops_base_inc_gen", + ":lhlo_ops_inc_gen", "@llvm//:support", "@local_config_mlir//:Analysis", "@local_config_mlir//:IR", @@ -152,7 +192,6 @@ cc_library( "@local_config_mlir//:StandardOps", "@local_config_mlir//:Support", "@local_config_mlir//:TransformUtils", - "@local_config_mlir//:TypeUtilities", ], alwayslink = 1, ) @@ -161,8 +200,10 @@ cc_library( cc_library( name = "xla_dialect_registration", srcs = ["ir/dialect_registration.cc"], + copts = ["-std=c++14"], deps = [ - ":xla", + ":hlo", + ":lhlo", "@local_config_mlir//:IR", ], alwayslink = 1, @@ -172,11 +213,11 @@ cc_library( name = "type_to_shape", srcs = ["type_to_shape.cc"], hdrs = ["type_to_shape.h"], + copts = ["-std=c++14"], deps = [ "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/core:lib", - 
"@com_google_absl//absl/base:core_headers", "@local_config_mlir//:IR", "@local_config_mlir//:Support", ], @@ -190,6 +231,7 @@ tf_cc_test( "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/core:lib", "//tensorflow/core:test_main", "@local_config_mlir//:IR", ], @@ -202,9 +244,10 @@ cc_library( "operator_writers.inc", ], hdrs = ["mlir_hlo_to_hlo.h"], + copts = ["-std=c++14"], deps = [ + ":hlo", ":type_to_shape", - ":xla", "//tensorflow/compiler/mlir/tensorflow:error_util", "//tensorflow/compiler/xla:comparison_util", "//tensorflow/compiler/xla:status_macros", @@ -223,12 +266,9 @@ cc_library( cc_library( name = "hlo_to_mlir_hlo", - srcs = [ - "hlo_to_mlir_hlo.cc", - ], - hdrs = [ - "hlo_to_mlir_hlo.h", - ], + srcs = ["hlo_to_mlir_hlo.cc"], + hdrs = ["hlo_to_mlir_hlo.h"], + copts = ["-std=c++14"], deps = [ ":hlo_module_importer", "//tensorflow/compiler/mlir/tensorflow:error_util", @@ -248,8 +288,9 @@ cc_library( "hlo_function_importer.h", "hlo_module_importer.h", ], + copts = ["-std=c++14"], deps = [ - ":xla", + ":hlo", "//tensorflow/compiler/mlir/tensorflow:error_util", "//tensorflow/compiler/xla:protobuf_util", "//tensorflow/compiler/xla:status", @@ -258,6 +299,7 @@ cc_library( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla:xla_proto", "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/core:lib", "@llvm//:support", "@local_config_mlir//:IR", "@local_config_mlir//:StandardOps", @@ -266,12 +308,9 @@ cc_library( cc_library( name = "xla_mlir_translate", - srcs = [ - "xla_mlir_translate.cc", - ], - hdrs = [ - "xla_mlir_translate.h", - ], + srcs = ["xla_mlir_translate.cc"], + hdrs = ["xla_mlir_translate.h"], + copts = ["-std=c++14"], deps = [ ":hlo_to_mlir_hlo", ":mlir_hlo_to_hlo", @@ -290,11 +329,8 @@ cc_library( tf_native_cc_binary( name = "operator_writer_gen", - srcs = [ - "operator_writer_gen.cc", - ], + srcs = ["operator_writer_gen.cc"], deps = [ - "@llvm//:config", "@llvm//:support", "@llvm//:tablegen", "@local_config_mlir//:TableGen", @@ -305,13 +341,13 @@ genrule( name = "operator_writer_inc", srcs = [ "@local_config_mlir//:include/mlir/IR/OpBase.td", - "//tensorflow/compiler/mlir/xla:ir/xla_ops.td", - ], - outs = [ - "operator_writers.inc", + "//tensorflow/compiler/mlir/xla:ir/hlo_ops.td", + "//tensorflow/compiler/mlir/xla:ir/hlo_ops_base.td", ], + outs = ["operator_writers.inc"], cmd = ("$(location :operator_writer_gen) " + "-I external/local_config_mlir/include " + - "$(location //tensorflow/compiler/mlir/xla:ir/xla_ops.td) " + " -o $@"), + "$(location //tensorflow/compiler/mlir/xla:ir/hlo_ops.td) " + + " -o $@"), tools = [":operator_writer_gen"], ) diff --git a/tensorflow/compiler/mlir/xla/hlo_function_importer.cc b/tensorflow/compiler/mlir/xla/hlo_function_importer.cc index b9ba5fcb9fb..8a69310ced9 100644 --- a/tensorflow/compiler/mlir/xla/hlo_function_importer.cc +++ b/tensorflow/compiler/mlir/xla/hlo_function_importer.cc @@ -19,14 +19,14 @@ limitations under the License. 
#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Support/raw_ostream.h" +#include "mlir/Dialect/StandardOps/Ops.h" // TF:local_config_mlir #include "mlir/IR/Attributes.h" // TF:local_config_mlir #include "mlir/IR/Builders.h" // TF:local_config_mlir #include "mlir/IR/Identifier.h" // TF:local_config_mlir #include "mlir/IR/Location.h" // TF:local_config_mlir #include "mlir/IR/StandardTypes.h" // TF:local_config_mlir -#include "mlir/StandardOps/Ops.h" // TF:local_config_mlir #include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" -#include "tensorflow/compiler/mlir/xla/ir/xla_ops.h" +#include "tensorflow/compiler/mlir/xla/ir/hlo_ops.h" #include "tensorflow/compiler/xla/protobuf_util.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" @@ -77,6 +77,11 @@ StatusOr CreateDenseAttrFromLiteral(ShapedType type, DENSE_ELEMENT_ATTR_BUILDER(PrimitiveType::S16, int16) DENSE_ELEMENT_ATTR_BUILDER(PrimitiveType::S32, int32) DENSE_ELEMENT_ATTR_BUILDER(PrimitiveType::S64, int64) + // TODO(b/130356985): Update once MLIR supports unsigned integers. + DENSE_ELEMENT_ATTR_BUILDER(PrimitiveType::U8, uint8) + DENSE_ELEMENT_ATTR_BUILDER(PrimitiveType::U16, uint16) + DENSE_ELEMENT_ATTR_BUILDER(PrimitiveType::U32, uint32) + DENSE_ELEMENT_ATTR_BUILDER(PrimitiveType::U64, uint64) default: return tensorflow::errors::Internal( absl::StrCat("Unsupported type: ", @@ -174,18 +179,19 @@ StatusOr HloFunctionImporter::ImportInstruction( } case HloOpcode::kIota: { return func_builder - ->create( + ->create( loc, result_type, func_builder->getI64IntegerAttr( static_cast(instruction) ->iota_dimension())) .getOperation(); } -#define MakeAndReturn(mlir_op) \ - { \ - mlir::Operation* new_operation = func_builder->create( \ - loc, result_type, operands, attributes); \ - return new_operation; \ +#define MakeAndReturn(mlir_op) \ + { \ + mlir::Operation* new_operation = \ + func_builder->create(loc, result_type, \ + operands, attributes); \ + return new_operation; \ } case HloOpcode::kBroadcast: { // Note that the HLO broadcast is more powerful than the XLA broadcast op. @@ -237,7 +243,7 @@ StatusOr HloFunctionImporter::ImportInstruction( // TODO(b/132057942): Change to explicitly passing an integer instead of // call getI64IntegerAttr here. return func_builder - ->create( + ->create( loc, result_type, operands[0], operands[1], func_builder->getI64IntegerAttr( gather_dimensions.index_vector_dim()), @@ -247,7 +253,7 @@ StatusOr HloFunctionImporter::ImportInstruction( } case HloOpcode::kDynamicUpdateSlice: { return func_builder - ->create( + ->create( loc, result_type, operands[0], operands[1], llvm::ArrayRef(operands.begin() + 2, operands.end())) .getOperation(); @@ -268,15 +274,15 @@ StatusOr HloFunctionImporter::ImportInstruction( } return func_builder - ->create(loc, result_type, operands[0], operands[1], - Convert(edge_padding_low), - Convert(edge_padding_high), - Convert(interior_padding)) + ->create(loc, result_type, operands[0], + operands[1], Convert(edge_padding_low), + Convert(edge_padding_high), + Convert(interior_padding)) .getOperation(); } case HloOpcode::kSlice: { return func_builder - ->create( + ->create( loc, result_type, operands[0], ConvertDimensions(instruction->slice_starts()), ConvertDimensions(instruction->slice_limits())) @@ -286,7 +292,7 @@ StatusOr HloFunctionImporter::ImportInstruction( // TODO(b/132057942): Support taking an uint64_t instead of an IntegerAttr // for concatenate dimension. 
return func_builder - ->create( + ->create( loc, result_type, operands, builder_->getI64IntegerAttr(instruction->concatenate_dimension())) .getOperation(); @@ -297,7 +303,7 @@ StatusOr HloFunctionImporter::ImportInstruction( // TODO(b/132057942): Make more convenient constructors, e.g. pass // mlir function pointer instead of a function attr. return func_builder - ->create( + ->create( loc, result_type, operands, func_builder->getSymbolRefAttr(reduction), ConvertDimensions(instruction->dimensions())) @@ -305,7 +311,7 @@ StatusOr HloFunctionImporter::ImportInstruction( } case HloOpcode::kReverse: { return func_builder - ->create( + ->create( loc, result_type, operands[0], ConvertDimensions(instruction->dimensions())) .getOperation(); @@ -324,7 +330,7 @@ StatusOr HloFunctionImporter::ImportInstruction( auto cond_attr = func_builder->getSymbolRefAttr(cond); auto body_attr = func_builder->getSymbolRefAttr(body); - Operation* op = func_builder->create( + Operation* op = func_builder->create( loc, types, operands, cond_attr, body_attr); return op; } @@ -350,10 +356,19 @@ StatusOr HloFunctionImporter::ImportInstruction( NoAttributeCase(kAdd, AddOp); NoAttributeCase(kAnd, AndOp); NoAttributeCase(kConvert, ConvertOp); + NoAttributeCase(kClamp, ClampOp); NoAttributeCase(kDivide, DivOp); + NoAttributeCase(kExp, ExpOp); + NoAttributeCase(kFloor, FloorOp); + NoAttributeCase(kLog, LogOp); NoAttributeCase(kMaximum, MaxOp); NoAttributeCase(kMinimum, MinOp); NoAttributeCase(kMultiply, MulOp); + // The dimensions attribute is not present on the HLO Reshape instruction. + // If dimensions are non-default, the XLA builder implementes it as a + // separate transpose. + NoAttributeCase(kReshape, ReshapeOp); + NoAttributeCase(kRsqrt, RsqrtOp); NoAttributeCase(kSelect, SelectOp); NoAttributeCase(kSubtract, SubOp); NoAttributeCase(kTanh, TanhOp); @@ -365,7 +380,6 @@ StatusOr HloFunctionImporter::ImportInstruction( NoAttributeCase(kCopy, CopyOp); // TODO(b/129422361) Ops below need additional work to handle attributes. NoAttributeCase(kConvolution, ConvOp); - NoAttributeCase(kReshape, ReshapeOp); #undef NoAttributeCase #undef MakeAndReturn case HloOpcode::kAddDependency: @@ -374,7 +388,7 @@ StatusOr HloFunctionImporter::ImportInstruction( // is not mentioned in xla client anywhere or in the hlo of our sample // models. default: { - mlir::OperationState result(loc, "xla.unknown"); + mlir::OperationState result(loc, "xla_hlo.unknown"); result.addOperands(operands); result.addTypes(result_type); for (auto attr : attributes) { @@ -429,6 +443,15 @@ StatusOr HloFunctionImporter::ConvertTensorType( return builder_->getTensorType(array, builder_->getIntegerType(32)); case PrimitiveType::S64: return builder_->getTensorType(array, builder_->getIntegerType(64)); + // TODO(b/130356985): Update once MLIR supports unsigned integers. 
+ case PrimitiveType::U8: + return builder_->getTensorType(array, builder_->getIntegerType(8)); + case PrimitiveType::U16: + return builder_->getTensorType(array, builder_->getIntegerType(16)); + case PrimitiveType::U32: + return builder_->getTensorType(array, builder_->getIntegerType(32)); + case PrimitiveType::U64: + return builder_->getTensorType(array, builder_->getIntegerType(64)); default: return tensorflow::errors::Internal( absl::StrCat("Unsupported type: ", PrimitiveType_Name(type))); diff --git a/tensorflow/compiler/mlir/xla/hlo_function_importer.h b/tensorflow/compiler/mlir/xla/hlo_function_importer.h index ee321432f4d..13671dd0310 100644 --- a/tensorflow/compiler/mlir/xla/hlo_function_importer.h +++ b/tensorflow/compiler/mlir/xla/hlo_function_importer.h @@ -28,6 +28,7 @@ limitations under the License. #include "tensorflow/compiler/xla/status.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/platform/types.h" namespace xla { @@ -88,7 +89,8 @@ class HloFunctionImporter { xla::HloInstruction* instruction); // Converts the dimensions of an HLO instruction into an MLIR attribute. - mlir::ElementsAttr ConvertDimensions(llvm::ArrayRef op_dimensions); + mlir::ElementsAttr ConvertDimensions( + llvm::ArrayRef op_dimensions); // Converts Array ref to an ElementsAttr. mlir::ElementsAttr Convert(llvm::ArrayRef op_dimensions); diff --git a/tensorflow/compiler/mlir/xla/hlo_module_importer.cc b/tensorflow/compiler/mlir/xla/hlo_module_importer.cc index f11e06a56f9..ba6519211ce 100644 --- a/tensorflow/compiler/mlir/xla/hlo_module_importer.cc +++ b/tensorflow/compiler/mlir/xla/hlo_module_importer.cc @@ -15,14 +15,14 @@ limitations under the License. #include "tensorflow/compiler/mlir/xla/hlo_module_importer.h" +#include "mlir/Dialect/StandardOps/Ops.h" // TF:local_config_mlir #include "mlir/IR/Attributes.h" // TF:local_config_mlir #include "mlir/IR/Location.h" // TF:local_config_mlir #include "mlir/IR/OperationSupport.h" // TF:local_config_mlir #include "mlir/IR/StandardTypes.h" // TF:local_config_mlir #include "mlir/IR/Types.h" // TF:local_config_mlir -#include "mlir/StandardOps/Ops.h" // TF:local_config_mlir #include "tensorflow/compiler/mlir/xla/hlo_function_importer.h" -#include "tensorflow/compiler/mlir/xla/ir/xla_ops.h" +#include "tensorflow/compiler/mlir/xla/ir/hlo_ops.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_module.h" diff --git a/tensorflow/compiler/mlir/xla/hlo_module_importer.h b/tensorflow/compiler/mlir/xla/hlo_module_importer.h index 6603ef8500f..5e8005f9489 100644 --- a/tensorflow/compiler/mlir/xla/hlo_module_importer.h +++ b/tensorflow/compiler/mlir/xla/hlo_module_importer.h @@ -23,7 +23,7 @@ limitations under the License. 
#include "mlir/IR/MLIRContext.h" // TF:local_config_mlir #include "mlir/IR/Module.h" // TF:local_config_mlir #include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" -#include "tensorflow/compiler/mlir/xla/ir/xla_ops.h" +#include "tensorflow/compiler/mlir/xla/ir/hlo_ops.h" #include "tensorflow/compiler/xla/status.h" #include "tensorflow/compiler/xla/xla_data.pb.h" diff --git a/tensorflow/compiler/mlir/xla/ir/dialect_registration.cc b/tensorflow/compiler/mlir/xla/ir/dialect_registration.cc index 79eda9cd278..f5e5b0ad257 100644 --- a/tensorflow/compiler/mlir/xla/ir/dialect_registration.cc +++ b/tensorflow/compiler/mlir/xla/ir/dialect_registration.cc @@ -13,8 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/mlir/xla/ir/xla_ops.h" -using namespace mlir; +#include "tensorflow/compiler/mlir/xla/ir/hlo_ops.h" +#include "tensorflow/compiler/mlir/xla/ir/lhlo_ops.h" // Static initialization for XLA dialect registration. -static DialectRegistration XlaOps; +static mlir::DialectRegistration xla_hlo_ops; +static mlir::DialectRegistration xla_lhlo_ops; diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc b/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc new file mode 100644 index 00000000000..a5df379d90b --- /dev/null +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc @@ -0,0 +1,239 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file defines the operations used in the XLA dialect. 
+ +#include "tensorflow/compiler/mlir/xla/ir/hlo_ops.h" + +#include +#include +#include + +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/FormatVariadic.h" +#include "mlir/IR/Attributes.h" // TF:local_config_mlir +#include "mlir/IR/Builders.h" // TF:local_config_mlir +#include "mlir/IR/Dialect.h" // TF:local_config_mlir +#include "mlir/IR/Location.h" // TF:local_config_mlir +#include "mlir/IR/MLIRContext.h" // TF:local_config_mlir +#include "mlir/IR/OpDefinition.h" // TF:local_config_mlir +#include "mlir/IR/OpImplementation.h" // TF:local_config_mlir +#include "mlir/IR/Operation.h" // TF:local_config_mlir +#include "mlir/IR/OperationSupport.h" // TF:local_config_mlir +#include "mlir/IR/PatternMatch.h" // TF:local_config_mlir +#include "mlir/IR/StandardTypes.h" // TF:local_config_mlir +#include "mlir/IR/TypeUtilities.h" // TF:local_config_mlir +#include "mlir/IR/Types.h" // TF:local_config_mlir +#include "mlir/IR/Value.h" // TF:local_config_mlir +#include "tensorflow/compiler/mlir/xla/ir/hlo_ops.h.inc" + +using namespace mlir; +using namespace mlir::xla_hlo; + +XlaHloDialect::XlaHloDialect(MLIRContext* context) + : Dialect(getDialectNamespace(), context) { + addOperations< +#define GET_OP_LIST +#include "tensorflow/compiler/mlir/xla/ir/hlo_ops.cc.inc" + >(); + + // Support unknown operations because not all XLA operations are registered. + // allowUnknownOperations(); +} + +Operation* XlaHloDialect::materializeConstant(OpBuilder& builder, + Attribute value, Type type, + Location loc) { + // If this is an opaque elements attribute, then generate an xla_hlo.constant. + if (value.isa()) + return builder.create(loc, type, + value.cast()); + return nullptr; +} + +#define GET_OP_CLASSES +#include "tensorflow/compiler/mlir/xla/ir/hlo_ops.cc.inc" + +//===----------------------------------------------------------------------===// +// ConstOp +//===----------------------------------------------------------------------===// + +OpFoldResult ConstOp::fold(ArrayRef operands) { + assert(operands.empty() && "constant has no operands"); + + // Return the held attribute value. + return value(); +} + +// Builds a constant op with the specified attribute `value`. +void ConstOp::build(Builder* builder, OperationState* result, Attribute value) { + Type type; + if (auto elemAttr = value.dyn_cast()) { + type = elemAttr.getType(); + } else if (value.isa() || value.isa() || + value.isa()) { + // All XLA types must be tensor types. In the build() method, we want to + // provide more flexiblity by allowing attributes of scalar types. But we + // need to wrap it up with ElementsAttr to construct valid XLA constants. + type = RankedTensorType::get(/*shape=*/{}, value.getType()); + value = DenseElementsAttr::get(type.cast(), value); + } + + // TODO: support other XLA specific types. + assert(type && "unsupported attribute type for building xla_hlo.constant"); + result->types.push_back(type); + result->addAttribute("value", value); +} + +//===----------------------------------------------------------------------===// +// ConvertOp +//===----------------------------------------------------------------------===// + +namespace { + +// Converts the values of an ElementsAttr into the corresponding type. 
+ElementsAttr ConvertElements(const ElementsAttr& elements, Type newType) { + auto oldType = getElementTypeOrSelf(elements); + size_t bitWidth = newType.isBF16() ? 64 : newType.getIntOrFloatBitWidth(); + + if (oldType.isa()) { + // mapValues always takes a function returning APInt, even when the output + // is actually float. + using func_type = APInt(const APFloat&); + if (auto newFloatType = newType.dyn_cast()) { + // Float -> Float + return elements.mapValues( + newType, llvm::function_ref([&newFloatType]( + const APFloat& floatVal) { + APFloat newDouble(FloatAttr::getValueAsDouble(floatVal)); + bool losesInfo = false; + newDouble.convert(newFloatType.getFloatSemantics(), + llvm::APFloat::rmNearestTiesToEven, &losesInfo); + return newDouble.bitcastToAPInt(); + })); + } + // Float -> Int + return elements.mapValues( + newType, + llvm::function_ref([&bitWidth](const APFloat& floatVal) { + return APInt(bitWidth, FloatAttr::getValueAsDouble(floatVal)); + })); + } + + // oldType is Integer + // mapValues always takes a function returning APInt, even when the output + // is actually float. + using func_type = APInt(const APInt&); + if (auto newFloatType = newType.dyn_cast()) { + // Int -> Float + return elements.mapValues( + newType, + llvm::function_ref([&newFloatType](const APInt& intVal) { + APFloat newDouble(static_cast(intVal.getLimitedValue())); + bool losesInfo = false; + newDouble.convert(newFloatType.getFloatSemantics(), + llvm::APFloat::rmNearestTiesToEven, &losesInfo); + return newDouble.bitcastToAPInt(); + })); + } + // newType is Integer + // Int -> Int + return elements.mapValues( + newType, llvm::function_ref([&bitWidth](const APInt& intVal) { + return APInt(bitWidth, intVal.getLimitedValue()); + })); +} + +} // namespace + +OpFoldResult ConvertOp::fold(ArrayRef operands) { + if (getOperand()->getType() == getResult()->getType()) return getOperand(); + + // If the operand is constant, we can do the conversion now. 
+ if (auto elementsAttr = operands.front().dyn_cast_or_null()) { + return ConvertElements(elementsAttr, getElementTypeOrSelf(getResult())); + } + + return {}; +} + +//===----------------------------------------------------------------------===// +// IotaOp +//===----------------------------------------------------------------------===// + +OpFoldResult IotaOp::fold(ArrayRef operands) { + const auto output_type = getResult()->getType().cast(); + const auto output_size = output_type.getNumElements(); + const auto dimension = iota_dimension().getLimitedValue(); + const auto max_dim_size = output_type.getDimSize(dimension); + int bitwidth = output_type.getElementType().getIntOrFloatBitWidth(); + + llvm::SmallVector values; + values.reserve(output_size); + + int64_t increase_stride = output_size; + for (int i = 0; i <= dimension; i++) { + increase_stride /= output_type.getDimSize(i); + } + + int64_t current_value = 0; + for (int i = 0; i < output_size; i++) { + int64_t value = (current_value / increase_stride) % max_dim_size; + values.push_back(APInt(bitwidth, value)); + ++current_value; + } + + return DenseIntElementsAttr::get(output_type, values); +} + +//===----------------------------------------------------------------------===// +// ReshapeOp +//===----------------------------------------------------------------------===// + +OpFoldResult ReshapeOp::fold(ArrayRef operands) { + if (getOperand()->getType() == getType()) { + return getOperand(); + } + + if (auto prev_op = + dyn_cast_or_null(getOperand()->getDefiningOp())) { + setOperand(prev_op.getOperand()); + return getResult(); + } + + if (auto elements = operands.front().dyn_cast_or_null()) { + return elements.reshape(getResult()->getType().cast()); + } + + return {}; +} + +//===----------------------------------------------------------------------===// +// TransposeOp +//===----------------------------------------------------------------------===// + +OpFoldResult TransposeOp::fold(ArrayRef operands) { + for (auto it : llvm::enumerate(permutation().getValues())) { + if (it.index() != it.value()) { + return {}; + } + } + return getOperand(); +} diff --git a/tensorflow/compiler/mlir/xla/ir/xla_ops.h b/tensorflow/compiler/mlir/xla/ir/hlo_ops.h similarity index 66% rename from tensorflow/compiler/mlir/xla/ir/xla_ops.h rename to tensorflow/compiler/mlir/xla/ir/hlo_ops.h index 2be8160d4ec..3260a829734 100644 --- a/tensorflow/compiler/mlir/xla/ir/xla_ops.h +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops.h @@ -15,24 +15,29 @@ limitations under the License. // This file defines the operations used in the XLA dialect. 
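// --- Illustrative sketch (editor's addition, not part of the diff) ---------
// The IotaOp::fold shown above materializes the iota values with a simple
// stride computation. The standalone function below reproduces the same
// arithmetic with plain standard-library types so it can be read and run in
// isolation; the names (FoldIota, shape, iota_dimension) are hypothetical and
// do not exist in the code base.
#include <cstdint>
#include <vector>

std::vector<int64_t> FoldIota(const std::vector<int64_t>& shape,
                              int64_t iota_dimension) {
  int64_t num_elements = 1;
  for (int64_t d : shape) num_elements *= d;

  // In row-major order, consecutive runs of `stride` elements share the same
  // coordinate along `iota_dimension`.
  int64_t stride = num_elements;
  for (int64_t i = 0; i <= iota_dimension; ++i) stride /= shape[i];

  std::vector<int64_t> values;
  values.reserve(num_elements);
  for (int64_t i = 0; i < num_elements; ++i)
    values.push_back((i / stride) % shape[iota_dimension]);
  return values;
}

// Example: FoldIota({2, 3, 4}, 1) yields
//   0 0 0 0 1 1 1 1 2 2 2 2 0 0 0 0 1 1 1 1 2 2 2 2,
// i.e. each element takes its coordinate along dimension 1, which is what the
// folder packs into the resulting DenseIntElementsAttr.
// ---------------------------------------------------------------------------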
-#ifndef TENSORFLOW_COMPILER_MLIR_XLA_IR_XLA_OPS_H_ -#define TENSORFLOW_COMPILER_MLIR_XLA_IR_XLA_OPS_H_ +#ifndef TENSORFLOW_COMPILER_MLIR_XLA_IR_HLO_OPS_H_ +#define TENSORFLOW_COMPILER_MLIR_XLA_IR_HLO_OPS_H_ +#include "llvm/ADT/StringRef.h" #include "mlir/IR/Attributes.h" // TF:local_config_mlir #include "mlir/IR/Dialect.h" // TF:local_config_mlir +#include "mlir/IR/Location.h" // TF:local_config_mlir +#include "mlir/IR/MLIRContext.h" // TF:local_config_mlir #include "mlir/IR/OpDefinition.h" // TF:local_config_mlir +#include "mlir/IR/Operation.h" // TF:local_config_mlir #include "mlir/IR/StandardTypes.h" // TF:local_config_mlir +#include "mlir/IR/Types.h" // TF:local_config_mlir #include "mlir/Support/Functional.h" // TF:local_config_mlir namespace mlir { -class Builder; +class OpBuilder; -namespace XLA { +namespace xla_hlo { -class XLADialect : public Dialect { +class XlaHloDialect : public Dialect { public: - XLADialect(MLIRContext *context); - static StringRef getDialectNamespace() { return "xla"; } + explicit XlaHloDialect(MLIRContext *context); + static StringRef getDialectNamespace() { return "xla_hlo"; } // Registered hook to materialize a constant operation from a given attribute // value with the desired resultant type. @@ -41,9 +46,9 @@ class XLADialect : public Dialect { }; #define GET_OP_CLASSES -#include "tensorflow/compiler/mlir/xla/ir/xla_ops.h.inc" +#include "tensorflow/compiler/mlir/xla/ir/hlo_ops.h.inc" -} // end namespace XLA +} // end namespace xla_hlo } // end namespace mlir -#endif // TENSORFLOW_COMPILER_MLIR_XLA_IR_XLA_OPS_H_ +#endif // TENSORFLOW_COMPILER_MLIR_XLA_IR_HLO_OPS_H_ diff --git a/tensorflow/compiler/mlir/xla/ir/xla_ops.td b/tensorflow/compiler/mlir/xla/ir/hlo_ops.td similarity index 50% rename from tensorflow/compiler/mlir/xla/ir/xla_ops.td rename to tensorflow/compiler/mlir/xla/ir/hlo_ops.td index a05dd9b3d1d..7775377c94b 100644 --- a/tensorflow/compiler/mlir/xla/ir/xla_ops.td +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops.td @@ -15,22 +15,27 @@ limitations under the License. // This is the operation definition file for XLA. -#ifdef XLA_OPS +#ifdef HLO_OPS #else -#define XLA_OPS +#define HLO_OPS #ifdef OP_BASE #else include "mlir/IR/OpBase.td" #endif // OP_BASE -def XLA_Dialect : Dialect { - let name = "xla"; - let cppNamespace = "XLA"; +#ifdef HLO_OPS_BASE +#else +include "tensorflow/compiler/mlir/xla/ir/hlo_ops_base.td" +#endif + +def HLO_Dialect : Dialect { + let name = "xla_hlo"; + let cppNamespace = "xla_hlo"; } -class XLA_Op traits> : - Op { +class HLO_Op traits> : + Op { // Whether this operation has a custom conversion to HLO or not. bit hasCustomHLOConverter = 0b0; } @@ -39,44 +44,34 @@ class XLA_Op traits> : // XLA type definitions. 
//===----------------------------------------------------------------------===// -def XLA_Int : IntOfWidths<[8, 16, 32, 64]>; - // Any integer tensor types -def XLA_IntTensor : StaticShapeTensorOf<[XLA_Int]>; +def HLO_IntTensor : StaticShapeTensorOf<[HLO_Int]>; // Any floating-point tensor types -def XLA_FpTensor : StaticShapeTensorOf<[AnyFloat]>; +def HLO_FpTensor : StaticShapeTensorOf<[AnyFloat]>; -def XLA_Pred : TypeAlias; - -def XLA_PredTensor : StaticShapeTensorOf<[XLA_Pred]>; +def HLO_PredTensor : StaticShapeTensorOf<[HLO_Pred]>; // Any integer or floating-point tensor types -def XLA_IntOrFpTensor : StaticShapeTensorOf<[XLA_Int, AnyFloat]>; +def HLO_IntOrFpTensor : StaticShapeTensorOf<[HLO_Int, AnyFloat]>; -def XLA_Tensor : StaticShapeTensorOf<[AnyFloat, AnyInteger]>; +def HLO_Tensor : StaticShapeTensorOf<[AnyFloat, AnyInteger]>; -def XLA_Tuple : NestedTupleOf<[XLA_Tensor]>; +def HLO_Tuple : NestedTupleOf<[HLO_Tensor]>; -def XLA_TensorOrTuple : AnyTypeOf<[XLA_Tensor, XLA_Tuple]>; +def HLO_TensorOrTuple : AnyTypeOf<[HLO_Tensor, HLO_Tuple]>; //===----------------------------------------------------------------------===// // XLA nullary op definitions. //===----------------------------------------------------------------------===// -def XLA_ConstOp : XLA_Op<"constant", [NoSideEffect]> { - let summary = "Constant operator"; - - let description = [{ - Represents a constant value. - }]; - +def HLO_ConstOp : BASE_HLO_ConstOp, HLO_Op<"constant", [NoSideEffect]> { let arguments = (ins ElementsAttr:$value ); let results = (outs - XLA_Tensor:$output + HLO_Tensor:$output ); let builders = [OpBuilder< @@ -89,16 +84,10 @@ def XLA_ConstOp : XLA_Op<"constant", [NoSideEffect]> { let hasCustomHLOConverter = 1; } -def XLA_IotaOp : XLA_Op<"iota", [NoSideEffect]> { - let summary = "Iota operator"; - - let description = [{ - Creates a rank 1 array of values starting at zero and incrementing by one. - }]; - +def HLO_IotaOp : BASE_HLO_IotaOp, HLO_Op<"iota", [NoSideEffect]> { let arguments = (ins I64Attr:$iota_dimension); - let results = (outs XLA_Tensor:$output); + let results = (outs HLO_Tensor:$output); let hasFolder = 1; @@ -110,32 +99,17 @@ def XLA_IotaOp : XLA_Op<"iota", [NoSideEffect]> { // XLA unary elementwise op definitions. //===----------------------------------------------------------------------===// // See https://www.tensorflow.org/xla/operation_semantics#element-wise_unary_functions -class XLA_UnaryElementwiseOp traits>: - XLA_Op, Arguments<(ins XLA_Tensor:$operand)>, - Results<(outs XLA_Tensor:$res)>; +class HLO_UnaryElementwiseOp traits>: + HLO_Op { -def XLA_AbsOp: XLA_UnaryElementwiseOp<"abs", [NoSideEffect, SameOperandsAndResultType]> { - let summary = "Absolute value operator"; - - let description = [{ - Returns `abs(operand)` element-wise. - - See - https://www.tensorflow.org/xla/operation_semantics#element-wise_unary_functions. - }]; + let arguments = (ins HLO_Tensor); + let results = (outs HLO_Tensor); } -def XLA_ConvertOp : XLA_UnaryElementwiseOp< - "convert", [NoSideEffect, SameOperandsAndResultShape]> { - let summary = "Convert operator"; - - let description = [{ - Performs element-wise conversion of values from one type to another, e.g. - float to int. - - See https://www.tensorflow.org/xla/operation_semantics#convertelementtype. 
- }]; +def HLO_AbsOp: HLO_UnaryElementwiseOp<"abs", [NoSideEffect, SameOperandsAndResultType]>, BASE_HLO_AbsOp; +def HLO_ConvertOp : HLO_UnaryElementwiseOp< + "convert", [NoSideEffect, SameOperandsAndResultShape]>, BASE_HLO_ConvertOp { let hasFolder = 1; // TODO(b/130357376) Convert has a special constructor. Use a custom @@ -143,153 +117,65 @@ def XLA_ConvertOp : XLA_UnaryElementwiseOp< let hasCustomHLOConverter = 1; } -def XLA_NegOp: XLA_UnaryElementwiseOp<"neg", [NoSideEffect, SameOperandsAndResultType]> { - let summary = "Negation operator"; +def HLO_ExpOp: HLO_UnaryElementwiseOp<"exp", [NoSideEffect, SameOperandsAndResultType]>, BASE_HLO_ExpOp; - let description = [{ - Returns `-operand` element-wise. +def HLO_FloorOp: HLO_UnaryElementwiseOp<"floor", [NoSideEffect, SameOperandsAndResultType]>, BASE_HLO_FloorOp; - See - https://www.tensorflow.org/xla/operation_semantics#element-wise_unary_functions. - }]; -} +def HLO_LogOp: HLO_UnaryElementwiseOp<"log", [NoSideEffect, SameOperandsAndResultType]>, BASE_HLO_LogOp; -def XLA_SignOp: XLA_UnaryElementwiseOp<"sign", [NoSideEffect, SameOperandsAndResultShape]> { - let summary = "Sign operator"; +def HLO_NegOp: HLO_UnaryElementwiseOp<"neg", [NoSideEffect, SameOperandsAndResultType]>, BASE_HLO_NegOp; - let description = [{ - Returns `sign(operand)` element-wise, where +def HLO_RsqrtOp: HLO_UnaryElementwiseOp<"rsqrt", [NoSideEffect, SameOperandsAndResultType]>, BASE_HLO_RsqrtOp; - ``` - sign(x) = -1 : x < 0 - = -0 : x = -0 - = NaN : x = NaN - = +0 : x = +0 - = 1 : x > 0 - ``` +def HLO_SignOp: HLO_UnaryElementwiseOp<"sign", [NoSideEffect, SameOperandsAndResultShape]>, BASE_HLO_SignOp; - See - https://www.tensorflow.org/xla/operation_semantics#element-wise_unary_functions. - }]; -} - -def XLA_TanhOp: XLA_UnaryElementwiseOp<"tanh", - [ResultsAreFloatLike, NoSideEffect, SameOperandsAndResultType]> { - let summary = "Tanh operator"; - - let description = [{ - Returns `tanh(operand)` element-wise. - - See - https://www.tensorflow.org/xla/operation_semantics#element-wise_unary_functions. - }]; -} +def HLO_TanhOp: HLO_UnaryElementwiseOp<"tanh", + [ResultsAreFloatLike, NoSideEffect, SameOperandsAndResultType]>, BASE_HLO_TanhOp; //===----------------------------------------------------------------------===// // XLA binary elementwise op definitions. //===----------------------------------------------------------------------===// -// The broadcasting dimensions correspond to a tuple that describes how a -// smaller rank shape is broadcast into a larger rank shape. For example, -// given a 2x3x4 cuboid and a 3x4 matrix, a broadcasting tuple (1,2) means -// matching the matrix to dimensions 1 and 2 of the cuboid. 
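// --- Illustrative sketch (editor's addition, not part of the diff) ---------
// The broadcast_dimensions comment above describes how a lower-rank operand is
// matched to the dimensions of a higher-rank result. A minimal check of that
// rule with plain standard-library types; the function name and parameters are
// hypothetical and only illustrate the documented semantics.
#include <cstdint>
#include <vector>

bool BroadcastDimsCompatible(const std::vector<int64_t>& operand_shape,
                             const std::vector<int64_t>& result_shape,
                             const std::vector<int64_t>& broadcast_dimensions) {
  // One mapping entry per operand dimension.
  if (broadcast_dimensions.size() != operand_shape.size()) return false;
  for (size_t i = 0; i < operand_shape.size(); ++i) {
    const int64_t result_dim = broadcast_dimensions[i];
    if (result_dim < 0 ||
        result_dim >= static_cast<int64_t>(result_shape.size()))
      return false;
    // A mapped dimension must match exactly or be 1 (degenerate broadcast).
    if (operand_shape[i] != result_shape[result_dim] && operand_shape[i] != 1)
      return false;
  }
  return true;
}

// The worked example from the comment above, a 3x4 matrix broadcast into a
// 2x3x4 cuboid with the tuple (1,2), corresponds to
//   BroadcastDimsCompatible({3, 4}, {2, 3, 4}, {1, 2}) == true.
// ---------------------------------------------------------------------------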
-def BroadcastDimAttr : OptionalAttr; - // See https://www.tensorflow.org/xla/operation_semantics#element-wise_binary_arithmetic_operations -class XLA_BinaryElementwiseOp traits, dag args = (ins)> : - XLA_Op, - Arguments<( - ins XLA_Tensor:$lhs, - XLA_Tensor:$rhs, - BroadcastDimAttr:$broadcast_dimensions - )>, - Results<(outs XLA_Tensor:$res)> { +class HLO_BinaryElementwiseOp traits> : + HLO_Op { + let arguments = (ins + HLO_Tensor:$lhs, + HLO_Tensor:$rhs, + BroadcastDimAttr:$broadcast_dimensions + ); + let results = (outs HLO_Tensor); let parser = [{ return mlir::impl::parseBinaryOp(parser, result); }]; let printer = [{ return mlir::impl::printBinaryOp(getOperation(), p); }]; } -def XLA_AddOp : XLA_BinaryElementwiseOp<"add", - [Commutative, NoSideEffect, SameOperandsAndResultElementType]> { - let summary = "Addition operator"; +def HLO_AddOp : HLO_BinaryElementwiseOp<"add", + [Commutative, NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_AddOp; - let description = [{ - Returns `lhs + rhs` element-wise. +def HLO_DivOp : HLO_BinaryElementwiseOp<"div", + [NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_DivOp; - See - https://www.tensorflow.org/xla/operation_semantics#element-wise_binary_arithmetic_operations. - }]; -} +def HLO_MaxOp : HLO_BinaryElementwiseOp<"max", + [Commutative, NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_MaxOp; -def XLA_DivOp : XLA_BinaryElementwiseOp<"div", - [NoSideEffect, SameOperandsAndResultElementType]> { - let summary = "Division operator"; +def HLO_MinOp : HLO_BinaryElementwiseOp<"min", + [Commutative, NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_MinOp; - let description = [{ - Returns `lhs / rhs` element-wise. +def HLO_MulOp : HLO_BinaryElementwiseOp<"mul", + [Commutative, NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_MulOp; - See - https://www.tensorflow.org/xla/operation_semantics#element-wise_binary_arithmetic_operations. - }]; -} +def HLO_SubOp : HLO_BinaryElementwiseOp<"sub", + [NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_SubOp; -def XLA_MaxOp : XLA_BinaryElementwiseOp<"max", - [Commutative, NoSideEffect, SameOperandsAndResultElementType]> { - let summary = "Maximum operator"; - - let description = [{ - Returns `max(lhs, rhs)` element-wise. - - See - https://www.tensorflow.org/xla/operation_semantics#element-wise_binary_arithmetic_operations. - }]; -} - -def XLA_MinOp : XLA_BinaryElementwiseOp<"min", - [Commutative, NoSideEffect, SameOperandsAndResultElementType]> { - let summary = "Minimum operator"; - - let description = [{ - Returns `min(lhs, rhs)` element-wise. - - See - https://www.tensorflow.org/xla/operation_semantics#element-wise_binary_arithmetic_operations. - }]; -} - -def XLA_MulOp : XLA_BinaryElementwiseOp<"mul", - [Commutative, NoSideEffect, SameOperandsAndResultElementType]> { - let summary = "Multiplication operator"; - - let description = [{ - Returns `lhs * rhs` element-wise. - - See - https://www.tensorflow.org/xla/operation_semantics#element-wise_binary_arithmetic_operations. - }]; -} - -def XLA_SubOp : XLA_BinaryElementwiseOp<"sub", - [NoSideEffect, SameOperandsAndResultElementType]> { - let summary = "Subtraction operator"; - - let description = [{ - Returns `lhs - rhs` element-wise. - - See - https://www.tensorflow.org/xla/operation_semantics#element-wise_binary_arithmetic_operations. 
- }]; -} - -def XLA_AndOp: XLA_BinaryElementwiseOp<"and", [Commutative, NoSideEffect]>; +def HLO_AndOp: HLO_BinaryElementwiseOp<"and", [Commutative, NoSideEffect]>, BASE_HLO_AndOp; //===----------------------------------------------------------------------===// // XLA control flow op definitions. //===----------------------------------------------------------------------===// -def XLA_WhileOp: XLA_Op<"while", [NoSideEffect, SameOperandsAndResultType]> { - let summary = "While operator"; +def HLO_WhileOp: HLO_Op<"while", [NoSideEffect, SameOperandsAndResultType]> { + string summary = "While operator"; - let description = [{ + string description = [{ Returns the result of executing a body function until the cond body returns true. @@ -297,34 +183,26 @@ def XLA_WhileOp: XLA_Op<"while", [NoSideEffect, SameOperandsAndResultType]> { }]; let arguments = (ins - Variadic:$val, + Variadic:$val, SymbolRefAttr:$cond, SymbolRefAttr:$body ); - let results = (outs Variadic:$res); + let results = (outs Variadic); // TODO(b/129422361): WhileOp has special conversion logic to HLO. let hasCustomHLOConverter = 1; } -def XLA_ReduceOp: XLA_Op<"reduce", [NoSideEffect]> { - let summary = "Reduce operator"; - - let description = [{ - Returns the result of executing a reduction function on one or more arrays - in parallel. - - See https://www.tensorflow.org/xla/operation_semantics#reduce. - }]; +def HLO_ReduceOp: HLO_Op<"reduce", [NoSideEffect]>, BASE_HLO_ReduceOp { let arguments = (ins - Variadic:$operands_and_init, + Variadic:$operands_and_init, SymbolRefAttr:$computation, ElementsAttr:$dimensions ); - let results = (outs Variadic:$res); + let results = (outs Variadic); // TODO(b/129422361): ReduceOp has special conversion logic to HLO. let hasCustomHLOConverter = 1; @@ -333,147 +211,67 @@ def XLA_ReduceOp: XLA_Op<"reduce", [NoSideEffect]> { //===----------------------------------------------------------------------===// // XLA tuple op definitions. //===----------------------------------------------------------------------===// -def XLA_GetTupleElementOp: XLA_Op<"get_tuple_element", [NoSideEffect]> { - let summary = "GetTupleElement operator"; - - let description = [{ - Returns a member of a tuple specified by an index. - - See https://www.tensorflow.org/xla/operation_semantics#gettupleelement. - }]; - +def HLO_GetTupleElementOp: HLO_Op<"get_tuple_element", [NoSideEffect]>, BASE_HLO_GetTupleElementOp { let arguments = (ins - XLA_Tuple, + HLO_Tuple, I32Attr:$index ); - let results = (outs XLA_TensorOrTuple); + let results = (outs HLO_TensorOrTuple); // GetTupleElementOp has special conversion logic to HLO. let hasCustomHLOConverter = 1; } -def XLA_TupleOp : XLA_Op<"tuple", [NoSideEffect]> { - let summary = "XLA's tuple op"; - - let description = [{ - Groups a set of tensor inputs into a single tuple object. - - See https://www.tensorflow.org/xla/operation_semantics#tuple. - }]; - - let arguments = (ins Variadic:$val); - let results = (outs XLA_Tuple:$res); +def HLO_TupleOp : HLO_Op<"tuple", [NoSideEffect]>, BASE_HLO_TupleOp { + let arguments = (ins Variadic:$val); + let results = (outs HLO_Tuple); // TupleOp has special conversion logic to HLO. let hasCustomHLOConverter = 1; } -//===----------------------------------------------------------------------===// -// Precision Config enum definitions. -//===----------------------------------------------------------------------===// - -// These mirror the XLA PrecisionConfig proto enum. 
-def XLA_PRECISION_DEFAULT : StrEnumAttrCase<"DEFAULT">; -def XLA_PRECISION_HIGH : StrEnumAttrCase<"HIGH">; -def XLA_PRECISION_HIGHEST : StrEnumAttrCase<"HIGHEST">; - -def XLA_PrecisionAttr : StrEnumAttr<"Precision", - "XLA precision for an operand. Has backend specific meaning.", - [XLA_PRECISION_DEFAULT, XLA_PRECISION_HIGH, XLA_PRECISION_HIGHEST]>; - -// TODO(b/129153247) See if it's possible to also validate the size. -def XLA_PrecisionConfigAttr: - OptionalAttr< - TypedArrayAttrBase>; - -//===----------------------------------------------------------------------===// -// Comparison op definitions. -//===----------------------------------------------------------------------===// - -// These mirror the XLA ComparisonDirection enum. -def XLA_COMPARISON_DIRECTION_EQ : StrEnumAttrCase<"EQ">; -def XLA_COMPARISON_DIRECTION_NE : StrEnumAttrCase<"NE">; -def XLA_COMPARISON_DIRECTION_GE : StrEnumAttrCase<"GE">; -def XLA_COMPARISON_DIRECTION_GT : StrEnumAttrCase<"GT">; -def XLA_COMPARISON_DIRECTION_LE : StrEnumAttrCase<"LE">; -def XLA_COMPARISON_DIRECTION_LT : StrEnumAttrCase<"LT">; - -def XLA_ComparisonDirectionAttr : StrEnumAttr<"ComparisonDirection", - "Which comparison operation to perform.", - [ - XLA_COMPARISON_DIRECTION_EQ, - XLA_COMPARISON_DIRECTION_NE, - XLA_COMPARISON_DIRECTION_GE, - XLA_COMPARISON_DIRECTION_GT, - XLA_COMPARISON_DIRECTION_LE, - XLA_COMPARISON_DIRECTION_LT - ]>; - -def XLA_CompareOp: XLA_Op<"compare", - [NoSideEffect, SameOperandsAndResultShape]> { +def HLO_CompareOp: HLO_Op<"compare", + [NoSideEffect, SameOperandsAndResultShape]>, BASE_HLO_CompareOp { let arguments = (ins - XLA_Tensor:$lhs, - XLA_Tensor:$rhs, + HLO_Tensor:$lhs, + HLO_Tensor:$rhs, BroadcastDimAttr:$broadcast_dimensions, - XLA_ComparisonDirectionAttr:$comparison_direction + HLO_ComparisonDirectionAttr:$comparison_direction ); - let results = (outs I1Tensor:$res); - let summary = "Comparison operator"; - - let description = [{ - Compares `lhs` and `rhs` elementwise according to `comparison_direction`. - - See - https://www.tensorflow.org/xla/operation_semantics#element-wise_comparison_operations. - }]; + let results = (outs HLO_PredTensor); } //===----------------------------------------------------------------------===// // XLA Slice definitions. //===----------------------------------------------------------------------===// -def XLA_SliceOp: XLA_UnaryElementwiseOp<"slice", - [NoSideEffect, SameOperandsAndResultElementType]> { +def HLO_SliceOp: HLO_Op< + "slice", + [NoSideEffect, SameOperandsAndResultElementType, + AllTypesMatch<["start_indices", "limit_indices"]>]> { let arguments = ( - ins XLA_Tensor:$operand, + ins HLO_Tensor:$operand, ElementsAttr:$start_indices, ElementsAttr:$limit_indices ); - let results = (outs XLA_Tensor:$res); - - let summary = "Slice operator"; - - let description = [{ - Slices a portion of the `operand` into a new configuration. - - See https://www.tensorflow.org/xla/operation_semantics#slice. - }]; + let results = (outs HLO_Tensor); // TODO(b/129422361) Two of the required arguments comes from the start and // limit indices which aren't handled by the codegen. 
let hasCustomHLOConverter = 1; } -def XLA_DynamicUpdateSliceOp: XLA_UnaryElementwiseOp<"dynamic-update-slice", - [NoSideEffect, AllElementTypesMatch<["operand", "res"]>]> { +def HLO_DynamicUpdateSliceOp: HLO_Op<"dynamic-update-slice", + [NoSideEffect, AllElementTypesMatch<["operand", "result"]>]> { let arguments = (ins - XLA_Tensor:$operand, - XLA_Tensor:$update, - Variadic:$start_indices + HLO_Tensor:$operand, + HLO_Tensor:$update, + Variadic:$start_indices ); - let results = (outs XLA_Tensor:$res); - - let summary = "Dynamic Update Slice operator"; - - let description = [{ - DynamicUpdateSlice generates a result which is the value of the input array - operand, with a slice update overwritten at start_indices. - - See https://www.tensorflow.org/xla/operation_semantics#dynamicupdateslice. - }]; + let results = (outs HLO_Tensor:$result); // TODO(b/129422361) Requires a custom constructor. let hasCustomHLOConverter = 1; @@ -484,52 +282,30 @@ def XLA_DynamicUpdateSliceOp: XLA_UnaryElementwiseOp<"dynamic-update-slice", // XLA Other op definitions. //===----------------------------------------------------------------------===// -def XLA_BatchNormInferenceOp : XLA_Op<"batch_norm_inference", [NoSideEffect]> { - let summary = "Batch Normalization for Inference"; - - let description = [{ - Normalizes an array across batch and spatial dimensions. - - See https://www.tensorflow.org/xla/operation_semantics#batchnorminference - }]; +def HLO_BatchNormInferenceOp : HLO_Op<"batch_norm_inference", [NoSideEffect]>, + BASE_HLO_BatchNormInferenceOp { let arguments = (ins - XLA_Tensor:$operand, - XLA_Tensor:$scale, - XLA_Tensor:$offset, - XLA_Tensor:$mean, - XLA_Tensor:$variance, + HLO_Tensor:$operand, + HLO_Tensor:$scale, + HLO_Tensor:$offset, + HLO_Tensor:$mean, + HLO_Tensor:$variance, F32Attr:$epsilon, I64Attr:$feature_index ); - let results = (outs - XLA_Tensor:$res - ); + let results = (outs HLO_Tensor); } -def XLA_BroadcastOp : XLA_Op<"broadcast", - [NoSideEffect, SameOperandsAndResultElementType]> { - let summary = "Broadcast a tensor to a higher rank by prepending dimensions"; - - let description = [{ - Broadcasts the operand tensor to a higher rank by prepending - `broadcast_sizes` to the dimensions. The current values of the operand are - copied into the other dimensions. - - This is a more limited form of broadcasting, that corresponds to the XLA - client Broadcast method. For a more general form of broadcasting, see the - BroadcastInDimOp. - - See https://www.tensorflow.org/xla/operation_semantics#broadcast. - }]; - +def HLO_BroadcastOp : HLO_Op<"broadcast", + [NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_BroadcastOp { let arguments = (ins - XLA_Tensor:$operand, + HLO_Tensor:$operand, ElementsAttr:$broadcast_sizes ); - let results = (outs XLA_Tensor:$res); + let results = (outs HLO_Tensor); // TODO(b/129012527) These should be expressed as type constraints. 
let verifier = [{ @@ -546,7 +322,7 @@ def XLA_BroadcastOp : XLA_Op<"broadcast", "broadcast_sizes has rank {0} instead of rank 1", sizesRank)); } - auto resultType = res()->getType().cast(); + auto resultType = getResult()->getType().cast(); auto resultRank = resultType.getRank(); auto operandType = operand()->getType().cast(); auto operandRank = operandType.getRank(); @@ -560,12 +336,7 @@ def XLA_BroadcastOp : XLA_Op<"broadcast", resultRank, operandRank, sizesSize)); } - auto raw_sizes = sizes.getValues(); - llvm::SmallVector expectedShape(raw_sizes.begin(), - raw_sizes.end()); - if (sizes.isSplat()) { - expectedShape.resize(sizesSize, raw_sizes.front()); - } + llvm::SmallVector expectedShape(sizes.getValues()); auto operandShape = operandType.getShape(); expectedShape.insert(expectedShape.end(), operandShape.begin(), @@ -585,33 +356,14 @@ def XLA_BroadcastOp : XLA_Op<"broadcast", }]; } -def XLA_BroadcastInDimOp : XLA_Op<"broadcast_in_dim", - [NoSideEffect, SameOperandsAndResultElementType]> { - let summary = "Broadcast a tensor into the given shape by adding dimensions."; - - let description = [{ - Broadcasts the `operand` tensor to a higher rank. This is not the limited - form of broadcasting exposed as the XLA client broadcast op, but rather the - more powerful "InDim" broadcasting, which is closer to the HLO broadcast op - and exposed in the XLA client BroadcastInDim method. - - `broadcast_dimensions` maps the operand dimension number to the target shape - dimension number. It must have the same size as the rank of the operand. The - mapped dimensions must either be the same size or the dimension being - broadcast from must be size 1 (degenerate broadcasting). - - For a scalar (0D tensor) operand, `broadcast_dimensions` must be empty. The - The scalar value will be broadcast to every element in the target shape. - - See https://www.tensorflow.org/xla/broadcasting. - }]; - +def HLO_BroadcastInDimOp : HLO_Op<"broadcast_in_dim", + [NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_BroadcastInDimOp { let arguments = (ins - XLA_Tensor:$operand, + HLO_Tensor:$operand, BroadcastDimAttr:$broadcast_dimensions ); - let results = (outs XLA_Tensor:$res); + let results = (outs HLO_Tensor); // TODO(b/129012527) These should be expressed as type constraints. let verifier = [{ @@ -649,7 +401,7 @@ def XLA_BroadcastInDimOp : XLA_Op<"broadcast_in_dim", dimensionsSize, operandRank)); } - auto resultType = res()->getType().cast(); + auto resultType = getResult()->getType().cast(); auto resultRank = resultType.getRank(); if (resultRank < operandRank) { return emitOpError( @@ -658,7 +410,7 @@ def XLA_BroadcastInDimOp : XLA_Op<"broadcast_in_dim", } for (int i = 0; i != dimensionsSize; ++i) { - auto dimIndex = dimensions.getValue(i).cast().getInt(); + auto dimIndex = dimensions.getValue(i); if (dimIndex >= resultRank) { return emitOpError( llvm::formatv("broadcast_dimensions contains invalid value {0} for " @@ -684,29 +436,15 @@ def XLA_BroadcastInDimOp : XLA_Op<"broadcast_in_dim", let hasCustomHLOConverter = 1; } -def XLA_ClampOp : XLA_Op<"clamp", - [NoSideEffect, SameOperandsAndResultElementType]> { - let summary = "Clamp operator"; - - let description = [{ - Clamps an operand to within the range between a minimum and maximum value. - - Note: All three arrays must be the same shape. Alternatively, as a - restricted form of broadcasting, min and/or max can be a scalar (0D - tensor) of the element type of the tensor operand. - - See https://www.tensorflow.org/xla/operation_semantics#clamp. 
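// --- Illustrative sketch (editor's addition, not part of the diff) ---------
// Element-wise semantics of the clamp op described above: the operand is
// limited to the range [min, max]. A single-element version with hypothetical
// names, ignoring the scalar-broadcast variant mentioned in the description.
#include <algorithm>

float ClampSemantics(float min_value, float operand, float max_value) {
  return std::min(std::max(operand, min_value), max_value);
}

// Example: ClampSemantics(0.0f, 3.7f, 1.0f) == 1.0f.
// ---------------------------------------------------------------------------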
- }]; - +def HLO_ClampOp : HLO_Op<"clamp", + [NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_ClampOp { let arguments = (ins - XLA_Tensor:$min, - XLA_Tensor:$operand, - XLA_Tensor:$max + HLO_Tensor:$min, + HLO_Tensor:$operand, + HLO_Tensor:$max ); - let results = (outs - XLA_Tensor:$res - ); + let results = (outs HLO_Tensor); // TODO(b/129012527) These should be expressed as type constraints. let verifier = [{ @@ -739,18 +477,11 @@ def XLA_ClampOp : XLA_Op<"clamp", }]; } -def XLA_ConcatenateOp : XLA_Op<"concatenate", - [NoSideEffect, SameOperandsAndResultElementType]> { - let summary = "XLA's concantenate op"; - - let description = [{ - Concatenates a set of tensors along the specified dimension. - - See https://www.tensorflow.org/xla/operation_semantics#concatenate. - }]; +def HLO_ConcatenateOp : HLO_Op<"concatenate", + [NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_ConcatenateOp { let arguments = ( - ins Variadic:$val, + ins Variadic:$val, I64Attr: $dimension ); @@ -781,101 +512,72 @@ def XLA_ConcatenateOp : XLA_Op<"concatenate", return success(); }]; - let results = (outs XLA_Tensor:$res); + let results = (outs HLO_Tensor); // TODO(b/129422361) ConcatOp has special conversion logic to HLO. let hasCustomHLOConverter = 1; } -def XLA_ConvOp : XLA_Op<"conv", [NoSideEffect]> { - let summary = "Convolution operator"; - - let description = [{ - Computes a convolution of the kind used in neural networks. - - See https://www.tensorflow.org/xla/operation_semantics#conv_convolution. - }]; - +def HLO_ConvOp : HLO_Op<"conv", [NoSideEffect]>, BASE_HLO_ConvOp { let arguments = (ins - XLA_Tensor:$lhs, - XLA_Tensor:$rhs + HLO_Tensor:$lhs, + HLO_Tensor:$rhs ); - let results = (outs XLA_Tensor:$res); + let results = (outs HLO_Tensor); // TODO(b/129422361) Needs additional work to handle attributes. // Conv has custom handling because its other args are passed as attributes let hasCustomHLOConverter = 1; } -def XLA_CopyOp: XLA_UnaryElementwiseOp<"copy", [NoSideEffect, SameOperandsAndResultType]> { - let summary = "Copy operator"; +def HLO_CopyOp: HLO_Op<"copy", [NoSideEffect, SameOperandsAndResultType]> { + string summary = "Copy operator"; - let description = [{ + string description = [{ Returns a copy of `operand`. }]; + let arguments = (ins HLO_Tensor); + let results = (outs HLO_Tensor); + // TODO(b/129422361) Implement special handling. // Copy has an HloOpcode, but is not one of the ops defined in xla_builder. let hasCustomHLOConverter = 1; } -def XLA_DotOp: XLA_Op<"dot", [NoSideEffect]> { +def HLO_DotOp: HLO_Op<"dot", [NoSideEffect]>, BASE_HLO_DotOp { let arguments = ( - ins XLA_Tensor:$lhs, - XLA_Tensor:$rhs, - XLA_PrecisionConfigAttr:$precision_config + ins HLO_Tensor:$lhs, + HLO_Tensor:$rhs, + HLO_PrecisionConfigAttr:$precision_config ); - let results = (outs XLA_Tensor:$res); - - let description = [{ - Performs dot products between vectors, vector/matrix and matrix/matrix - multiplication. - - See https://www.tensorflow.org/xla/operation_semantics#dot. 
- }]; + let results = (outs HLO_Tensor); } -def XLA_GatherOp: XLA_Op<"gather", [NoSideEffect]> { +def HLO_GatherOp: HLO_Op<"gather", [NoSideEffect]>, BASE_HLO_GatherOp { let arguments = ( - ins XLA_Tensor:$operand, - XLA_IntTensor:$start_indices, + ins HLO_Tensor:$operand, + HLO_IntTensor:$start_indices, I64Attr: $index_vector_dim, - ElementsAttr: $offsets_dim, + ElementsAttr: $offset_dims, ElementsAttr: $slice_sizes, - ElementsAttr: $collapsed_slice_sizes, + ElementsAttr: $collapsed_slice_dims, ElementsAttr: $start_index_map ); - let results = (outs XLA_Tensor:$res); - - let summary = "Gather operator"; - - let description = [{ - Stitches together several slices of an input array. - - See https://www.tensorflow.org/xla/operation_semantics#gather. - }]; + let results = (outs HLO_Tensor); // TODO(b/129422361) Attributes are not by the codegen. The optional argument // (dimensions) needs to be added as an attribute. let hasCustomHLOConverter = 1; } -def XLA_ReshapeOp: XLA_Op<"reshape", - [NoSideEffect, SameOperandsAndResultElementType]> { - let arguments = (ins XLA_Tensor:$operand); - - let results = (outs XLA_Tensor:$res); - - let summary = "Reshape operator"; - - let description = [{ - Reshapes the dimensions of `operand` into a new configuration. - - See https://www.tensorflow.org/xla/operation_semantics#reshape. - }]; +def HLO_ReshapeOp: HLO_Op<"reshape", + [NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_ReshapeOp { + let arguments = (ins HLO_Tensor:$operand); + let results = (outs HLO_Tensor); let hasFolder = 1; // TODO(b/129422361) One of the required arguments comes from the new shape, @@ -885,29 +587,14 @@ def XLA_ReshapeOp: XLA_Op<"reshape", } -def XLA_SelectOp: XLA_Op<"select", [NoSideEffect]> { - let summary = "Select operator"; - - let description = [{ - Constructs an output tensor from the elements of `on_true` and `on_false` - based on the values of `pred`. - - `on_true` and `on_false` must be the same shape. For each element of `pred`, - `res` has the corresponding element of `on_true` or `on_false` depending on - the value in `pred`. `pred` must be the same shape as `on_true` and - `on_false` or a scalar, in which case `res` is equal to either `on_true` or - `on_false`. - - See https://www.tensorflow.org/xla/operation_semantics#select. - }]; - +def HLO_SelectOp: HLO_Op<"select", [NoSideEffect]>, BASE_HLO_SelectOp { let arguments = (ins - XLA_PredTensor:$pred, - XLA_Tensor:$on_true, - XLA_Tensor:$on_false + HLO_PredTensor:$pred, + HLO_Tensor:$on_true, + HLO_Tensor:$on_false ); - let results = (outs XLA_Tensor:$res); + let results = (outs HLO_Tensor); // TODO(b/129012527) These should be expressed as type constraints. let verifier = [{ @@ -938,48 +625,30 @@ def XLA_SelectOp: XLA_Op<"select", [NoSideEffect]> { }]; } -def XLA_ReverseOp: XLA_Op<"reverse", - [NoSideEffect, SameOperandsAndResultElementType]> { - let summary = "Reverse operator"; - - let description = [{ - Reverses the specified dimensions of `operand` according to the given - `dimensions`. - - See https://www.tensorflow.org/xla/operation_semantics#rev_reverse. - }]; - +def HLO_ReverseOp: HLO_Op<"reverse", + [NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_ReverseOp { let arguments = (ins - XLA_Tensor:$operand, + HLO_Tensor:$operand, ElementsAttr:$dimensions ); - let results = (outs XLA_Tensor:$res); + let results = (outs HLO_Tensor); // TODO(b/129422361): ReverseOp has a custom constructor for HLO. 
let hasCustomHLOConverter = 1; } -def XLA_PadOp: XLA_Op<"pad", - [NoSideEffect, SameOperandsAndResultElementType]> { - let summary = "Pad operator"; - - let description = [{ - Pads the edges of `operand` with the `padding_value` and according to - the passed configuration. - - See https://www.tensorflow.org/xla/operation_semantics#pad. - }]; - +def HLO_PadOp: HLO_Op<"pad", + [NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_PadOp { let arguments = (ins - XLA_Tensor:$operand, - XLA_Tensor:$padding_value, + HLO_Tensor:$operand, + HLO_Tensor:$padding_value, ElementsAttr: $edge_padding_low, ElementsAttr: $edge_padding_high, ElementsAttr: $interior_padding ); - let results = (outs XLA_Tensor:$res); + let results = (outs HLO_Tensor); let description = [{ Pads the `operand` according to TBD. @@ -1018,8 +687,8 @@ def XLA_PadOp: XLA_Op<"pad", for (int i = 0, e = input_shape.size(); i < e; i++) { int expected_output = input_shape[i] - + padding_low.getValue(i).cast().getInt() - + padding_high.getValue(i).cast().getInt(); + + padding_low.getValue(i).getInt() + + padding_high.getValue(i).getInt(); if (expected_output != output_shape[i]) { return emitOpError(llvm::formatv("Expected output shape ({0}) and " "output shape ({1}) should match.", @@ -1034,23 +703,15 @@ def XLA_PadOp: XLA_Op<"pad", let hasCustomHLOConverter = 1; } -def XLA_TransposeOp: XLA_Op<"transpose", - [NoSideEffect, SameOperandsAndResultElementType]> { - let summary = "Transpose operator"; - - let description = [{ - Permutes the dimensions of `operand` according to the given `permutation`. - - `res_dimensions[i] = operand_dimensions[permutation[i]]` - - See https://www.tensorflow.org/xla/operation_semantics#transpose. - }]; - +def HLO_TransposeOp: HLO_Op<"transpose", + [NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_TransposeOp { let arguments = (ins - XLA_Tensor:$operand, + HLO_Tensor:$operand, ElementsAttr:$permutation ); - let results = (outs XLA_Tensor:$res); + let results = (outs HLO_Tensor); + + let hasFolder = 1; // TODO(b/129012527) These should be expressed as type constraints. let verifier = [{ @@ -1076,7 +737,7 @@ def XLA_TransposeOp: XLA_Op<"transpose", permutationSize, operandRank)); } - auto resultType = res()->getType().cast(); + auto resultType = getResult()->getType().cast(); auto resultRank = resultType.getRank(); if (resultRank != operandRank) { return emitOpError( @@ -1088,7 +749,7 @@ def XLA_TransposeOp: XLA_Op<"transpose", auto expectedShape = SmallVector(operandRank); for (int i = 0; i != operandRank; ++i) { - auto permutedDim = permutation().getValue(i).cast().getInt(); + auto permutedDim = permutation().getValue(i).getInt(); expectedShape[i] = operandType.getDimSize(permutedDim); } @@ -1105,4 +766,4 @@ def XLA_TransposeOp: XLA_Op<"transpose", }]; } -#endif // XLA_OPS +#endif // HLO_OPS diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops_base.td b/tensorflow/compiler/mlir/xla/ir/hlo_ops_base.td new file mode 100644 index 00000000000..28d6efd0aad --- /dev/null +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops_base.td @@ -0,0 +1,528 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifdef HLO_OPS_BASE +#else +#define HLO_OPS_BASE + +#ifdef OP_BASE +#else +include "mlir/IR/OpBase.td" +#endif // OP_BASE + +def HLO_Int : IntOfWidths<[8, 16, 32, 64]>; +def HLO_Pred : TypeAlias; + +//===----------------------------------------------------------------------===// +// XLA nullary op definitions. +//===----------------------------------------------------------------------===// + +class BASE_HLO_ConstOp { + string summary = "Constant operator"; + + string description = [{ + Represents a constant value. + }]; +} + +class BASE_HLO_IotaOp { + string summary = "Iota operator"; + + string description = [{ + Creates a rank 1 array of values starting at zero and incrementing by one. + }]; +} + +//===----------------------------------------------------------------------===// +// XLA unary elementwise op definitions. +//===----------------------------------------------------------------------===// +// See https://www.tensorflow.org/xla/operation_semantics#element-wise_unary_functions + +class BASE_HLO_AbsOp { + string summary = "Absolute value operator"; + + string description = [{ + Returns `abs(operand)` element-wise. + + See + https://www.tensorflow.org/xla/operation_semantics#element-wise_unary_functions. + }]; +} + +class BASE_HLO_ConvertOp { + string summary = "Convert operator"; + + string description = [{ + Performs element-wise conversion of values from one type to another, e.g. + float to int. + + See https://www.tensorflow.org/xla/operation_semantics#convertelementtype. + }]; +} + +class BASE_HLO_ExpOp { + string summary = "Exponential operator"; + + string description = [{ + Returns `e^(operand)` element-wise. + + See + https://www.tensorflow.org/xla/operation_semantics#element-wise_unary_functions. + }]; +} + +class BASE_HLO_FloorOp { + string summary = "Floor operator"; + + string description = [{ + Returns `Floor(operand)` element-wise. + + See + https://www.tensorflow.org/xla/operation_semantics#element-wise_unary_functions. + }]; +} + +class BASE_HLO_LogOp { + string summary = "Logarithm operator"; + + string description = [{ + Returns `log(operand)` element-wise. + + See + https://www.tensorflow.org/xla/operation_semantics#element-wise_unary_functions. + }]; +} + +class BASE_HLO_NegOp { + string summary = "Negation operator"; + + string description = [{ + Returns `-operand` element-wise. + + See + https://www.tensorflow.org/xla/operation_semantics#element-wise_unary_functions. + }]; +} + +class BASE_HLO_RsqrtOp { + string summary = "Reciprocal Square-root operator"; + + string description = [{ + Returns `1.0 / sqrt(operand)` element-wise. + + See + https://www.tensorflow.org/xla/operation_semantics#element-wise_unary_functions. + }]; +} + +class BASE_HLO_SignOp { + string summary = "Sign operator"; + + string description = [{ + Returns `sign(operand)` element-wise, where + + ``` + sign(x) = -1 : x < 0 + = -0 : x = -0 + = NaN : x = NaN + = +0 : x = +0 + = 1 : x > 0 + ``` + + See + https://www.tensorflow.org/xla/operation_semantics#element-wise_unary_functions. 
+ }]; +} + +class BASE_HLO_TanhOp { + string summary = "Tanh operator"; + + string description = [{ + Returns `tanh(operand)` element-wise. + + See + https://www.tensorflow.org/xla/operation_semantics#element-wise_unary_functions. + }]; +} + +//===----------------------------------------------------------------------===// +// XLA binary elementwise op definitions. +//===----------------------------------------------------------------------===// + +// The broadcasting dimensions correspond to a tuple that describes how a +// smaller rank shape is broadcast into a larger rank shape. For example, +// given a 2x3x4 cuboid and a 3x4 matrix, a broadcasting tuple (1,2) means +// matching the matrix to dimensions 1 and 2 of the cuboid. +def BroadcastDimAttr : OptionalAttr; + +class BASE_HLO_AddOp { + string summary = "Addition operator"; + + string description = [{ + Returns `lhs + rhs` element-wise. + + See + https://www.tensorflow.org/xla/operation_semantics#element-wise_binary_arithmetic_operations. + }]; +} + +class BASE_HLO_DivOp { + string summary = "Division operator"; + + string description = [{ + Returns `lhs / rhs` element-wise. + + See + https://www.tensorflow.org/xla/operation_semantics#element-wise_binary_arithmetic_operations. + }]; +} + +class BASE_HLO_MaxOp { + string summary = "Maximum operator"; + + string description = [{ + Returns `max(lhs, rhs)` element-wise. + + See + https://www.tensorflow.org/xla/operation_semantics#element-wise_binary_arithmetic_operations. + }]; +} + +class BASE_HLO_MinOp { + string summary = "Minimum operator"; + + string description = [{ + Returns `min(lhs, rhs)` element-wise. + + See + https://www.tensorflow.org/xla/operation_semantics#element-wise_binary_arithmetic_operations. + }]; +} + +class BASE_HLO_MulOp { + string summary = "Multiplication operator"; + + string description = [{ + Returns `lhs * rhs` element-wise. + + See + https://www.tensorflow.org/xla/operation_semantics#element-wise_binary_arithmetic_operations. + }]; +} + +class BASE_HLO_SubOp { + string summary = "Subtraction operator"; + + string description = [{ + Returns `lhs - rhs` element-wise. + + See + https://www.tensorflow.org/xla/operation_semantics#element-wise_binary_arithmetic_operations. + }]; +} + +class BASE_HLO_AndOp { + string summary = "Logical and"; + + string description = [{ + Returns `lhs /\ rhs` element-wise. + + See + https://www.tensorflow.org/xla/operation_semantics#element-wise_binary_arithmetic_operations. + }]; +} + +//===----------------------------------------------------------------------===// +// XLA control flow op definitions. +//===----------------------------------------------------------------------===// + +class BASE_HLO_ReduceOp { + string summary = "Reduce operator"; + + string description = [{ + Returns the result of executing a reduction function on one or more arrays + in parallel. + + See https://www.tensorflow.org/xla/operation_semantics#reduce. + }]; +} + +//===----------------------------------------------------------------------===// +// XLA tuple op definitions. +//===----------------------------------------------------------------------===// +class BASE_HLO_GetTupleElementOp { + string summary = "GetTupleElement operator"; + + string description = [{ + Returns a member of a tuple specified by an index. + + See https://www.tensorflow.org/xla/operation_semantics#gettupleelement. + }]; +} + +class BASE_HLO_TupleOp { + string summary = "XLA's tuple op"; + + string description = [{ + Groups a set of tensor inputs into a single tuple object. 
+ + See https://www.tensorflow.org/xla/operation_semantics#tuple. + }]; +} + +//===----------------------------------------------------------------------===// +// Precision Config enum definitions. +//===----------------------------------------------------------------------===// + +// These mirror the XLA PrecisionConfig proto enum. +def HLO_PRECISION_DEFAULT : StrEnumAttrCase<"DEFAULT">; +def HLO_PRECISION_HIGH : StrEnumAttrCase<"HIGH">; +def HLO_PRECISION_HIGHEST : StrEnumAttrCase<"HIGHEST">; + +def HLO_PrecisionAttr : StrEnumAttr<"Precision", + "XLA precision for an operand. Has backend specific meaning.", + [HLO_PRECISION_DEFAULT, HLO_PRECISION_HIGH, HLO_PRECISION_HIGHEST]>; + +// TODO(b/129153247) See if it's possible to also validate the size. +def HLO_PrecisionConfigAttr: + OptionalAttr< + TypedArrayAttrBase>; + +//===----------------------------------------------------------------------===// +// Comparison op definitions. +//===----------------------------------------------------------------------===// + +// These mirror the XLA ComparisonDirection enum. +def HLO_COMPARISON_DIRECTION_EQ : StrEnumAttrCase<"EQ">; +def HLO_COMPARISON_DIRECTION_NE : StrEnumAttrCase<"NE">; +def HLO_COMPARISON_DIRECTION_GE : StrEnumAttrCase<"GE">; +def HLO_COMPARISON_DIRECTION_GT : StrEnumAttrCase<"GT">; +def HLO_COMPARISON_DIRECTION_LE : StrEnumAttrCase<"LE">; +def HLO_COMPARISON_DIRECTION_LT : StrEnumAttrCase<"LT">; + +def HLO_ComparisonDirectionAttr : StrEnumAttr<"ComparisonDirection", + "Which comparison operation to perform.", + [ + HLO_COMPARISON_DIRECTION_EQ, + HLO_COMPARISON_DIRECTION_NE, + HLO_COMPARISON_DIRECTION_GE, + HLO_COMPARISON_DIRECTION_GT, + HLO_COMPARISON_DIRECTION_LE, + HLO_COMPARISON_DIRECTION_LT + ]>; + +class BASE_HLO_CompareOp { + string summary = "Comparison operator"; + + string description = [{ + Compares `lhs` and `rhs` elementwise according to `comparison_direction`. + + See + https://www.tensorflow.org/xla/operation_semantics#element-wise_comparison_operations. + }]; +} + +//===----------------------------------------------------------------------===// +// XLA Slice definitions. +//===----------------------------------------------------------------------===// + +class BASE_HLO_SliceOp { + string summary = "Slice operator"; + + string description = [{ + Slices a portion of the `operand` into a new configuration. + + See https://www.tensorflow.org/xla/operation_semantics#slice. + }]; +} + +class BASE_HLO_DynamicUpdateSliceOp { + string summary = "Dynamic Update Slice operator"; + + string description = [{ + DynamicUpdateSlice generates a result which is the value of the input array + operand, with a slice update overwritten at start_indices. + + See https://www.tensorflow.org/xla/operation_semantics#dynamicupdateslice. + }]; +} + +//===----------------------------------------------------------------------===// +// XLA Other op definitions. +//===----------------------------------------------------------------------===// + +class BASE_HLO_BatchNormInferenceOp { + string summary = "Batch Normalization for Inference"; + + string description = [{ + Normalizes an array across batch and spatial dimensions. + + See https://www.tensorflow.org/xla/operation_semantics#batchnorminference + }]; +} + +class BASE_HLO_BroadcastOp { + string summary = "Broadcast a tensor to a higher rank by prepending dimensions"; + + string description = [{ + Broadcasts the operand tensor to a higher rank by prepending + `broadcast_sizes` to the dimensions. 
The current values of the operand are + copied into the other dimensions. + + This is a more limited form of broadcasting, that corresponds to the XLA + client Broadcast method. For a more general form of broadcasting, see the + BroadcastInDimOp. + + See https://www.tensorflow.org/xla/operation_semantics#broadcast. + }]; +} + +class BASE_HLO_BroadcastInDimOp { + string summary = "Broadcast a tensor into the given shape by adding dimensions."; + + string description = [{ + Broadcasts the `operand` tensor to a higher rank. This is not the limited + form of broadcasting exposed as the XLA client broadcast op, but rather the + more powerful "InDim" broadcasting, which is closer to the HLO broadcast op + and exposed in the XLA client BroadcastInDim method. + + `broadcast_dimensions` maps the operand dimension number to the target shape + dimension number. It must have the same size as the rank of the operand. The + mapped dimensions must either be the same size or the dimension being + broadcast from must be size 1 (degenerate broadcasting). + + For a scalar (0D tensor) operand, `broadcast_dimensions` must be empty. The + The scalar value will be broadcast to every element in the target shape. + + See https://www.tensorflow.org/xla/broadcasting. + }]; +} + +class BASE_HLO_ClampOp { + string summary = "Clamp operator"; + + string description = [{ + Clamps an operand to within the range between a minimum and maximum value. + + Note: All three arrays must be the same shape. Alternatively, as a + restricted form of broadcasting, min and/or max can be a scalar (0D + tensor) of the element type of the tensor operand. + + See https://www.tensorflow.org/xla/operation_semantics#clamp. + }]; +} + +class BASE_HLO_ConcatenateOp { + string summary = "XLA's concantenate op"; + + string description = [{ + Concatenates a set of tensors along the specified dimension. + + See https://www.tensorflow.org/xla/operation_semantics#concatenate. + }]; +} + +class BASE_HLO_ConvOp { + string summary = "Convolution operator"; + + string description = [{ + Computes a convolution of the kind used in neural networks. + + See https://www.tensorflow.org/xla/operation_semantics#conv_convolution. + }]; +} + +class BASE_HLO_DotOp { + string summary = "Dot operator"; + string description = [{ + Performs dot products between vectors, vector/matrix and matrix/matrix + multiplication. + + See https://www.tensorflow.org/xla/operation_semantics#dot. + }]; +} + +class BASE_HLO_GatherOp{ + string summary = "Gather operator"; + + string description = [{ + Stitches together several slices of an input array. + + See https://www.tensorflow.org/xla/operation_semantics#gather. + }]; +} + +class BASE_HLO_ReshapeOp { + string summary = "Reshape operator"; + + string description = [{ + Reshapes the dimensions of `operand` into a new configuration. + + See https://www.tensorflow.org/xla/operation_semantics#reshape. + }]; +} + +class BASE_HLO_SelectOp { + string summary = "Select operator"; + + string description = [{ + Constructs an output tensor from the elements of `on_true` and `on_false` + based on the values of `pred`. + + `on_true` and `on_false` must be the same shape. For each element of `pred`, + `res` has the corresponding element of `on_true` or `on_false` depending on + the value in `pred`. `pred` must be the same shape as `on_true` and + `on_false` or a scalar, in which case `res` is equal to either `on_true` or + `on_false`. + + See https://www.tensorflow.org/xla/operation_semantics#select. 
+ }]; +} + +class BASE_HLO_ReverseOp { + string summary = "Reverse operator"; + + string description = [{ + Reverses the specified dimensions of `operand` according to the given + `dimensions`. + + See https://www.tensorflow.org/xla/operation_semantics#rev_reverse. + }]; +} + +class BASE_HLO_PadOp { + string summary = "Pad operator"; + + string description = [{ + Pads the edges of `operand` with the `padding_value` and according to + the passed configuration. + + See https://www.tensorflow.org/xla/operation_semantics#pad. + }]; +} + +class BASE_HLO_TransposeOp { + string summary = "Transpose operator"; + + string description = [{ + Permutes the dimensions of `operand` according to the given `permutation`. + + `res_dimensions[i] = operand_dimensions[permutation[i]]` + + See https://www.tensorflow.org/xla/operation_semantics#transpose. + }]; +} + +#endif // HLO_OPS_BASE diff --git a/tensorflow/compiler/mlir/xla/ir/lhlo_ops.cc b/tensorflow/compiler/mlir/xla/ir/lhlo_ops.cc new file mode 100644 index 00000000000..312654ef320 --- /dev/null +++ b/tensorflow/compiler/mlir/xla/ir/lhlo_ops.cc @@ -0,0 +1,64 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file defines the operations used in the XLA dialect. + +#include "tensorflow/compiler/mlir/xla/ir/lhlo_ops.h" + +#include +#include +#include + +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/FormatVariadic.h" +#include "mlir/IR/Attributes.h" // TF:local_config_mlir +#include "mlir/IR/Builders.h" // TF:local_config_mlir +#include "mlir/IR/Dialect.h" // TF:local_config_mlir +#include "mlir/IR/Location.h" // TF:local_config_mlir +#include "mlir/IR/MLIRContext.h" // TF:local_config_mlir +#include "mlir/IR/OpDefinition.h" // TF:local_config_mlir +#include "mlir/IR/OpImplementation.h" // TF:local_config_mlir +#include "mlir/IR/Operation.h" // TF:local_config_mlir +#include "mlir/IR/OperationSupport.h" // TF:local_config_mlir +#include "mlir/IR/PatternMatch.h" // TF:local_config_mlir +#include "mlir/IR/StandardTypes.h" // TF:local_config_mlir +#include "mlir/IR/TypeUtilities.h" // TF:local_config_mlir +#include "mlir/IR/Types.h" // TF:local_config_mlir +#include "mlir/IR/Value.h" // TF:local_config_mlir +#include "tensorflow/compiler/mlir/xla/ir/lhlo_ops.h.inc" + +namespace mlir { +namespace xla_lhlo { + +XlaLhloDialect::XlaLhloDialect(MLIRContext* context) + : Dialect(getDialectNamespace(), context) { + addOperations< +#define GET_OP_LIST +#include "tensorflow/compiler/mlir/xla/ir/lhlo_ops.cc.inc" + >(); +} + +#define GET_OP_CLASSES +#include "tensorflow/compiler/mlir/xla/ir/lhlo_ops.cc.inc" + +// TODO(cheshire): Support folding, reuse code from hlo_ops.cc. 
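+// For orientation: unlike the tensor-based xla_hlo ops, the xla_lhlo ops
+// declared in lhlo_ops.td take explicit output buffers as operands and return
+// no results. A hypothetical textual form (syntax assumed for illustration):
+//   "xla_lhlo.add"(%lhs, %rhs, %out)
+//       : (memref<4xf32>, memref<4xf32>, memref<4xf32>) -> ()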
+ +} // namespace xla_lhlo +} // namespace mlir diff --git a/tensorflow/compiler/mlir/xla/ir/lhlo_ops.h b/tensorflow/compiler/mlir/xla/ir/lhlo_ops.h new file mode 100644 index 00000000000..f73e5026541 --- /dev/null +++ b/tensorflow/compiler/mlir/xla/ir/lhlo_ops.h @@ -0,0 +1,49 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file defines the operations used in the LXLA dialect. + +#ifndef TENSORFLOW_COMPILER_MLIR_XLA_IR_LHLO_OPS_H_ +#define TENSORFLOW_COMPILER_MLIR_XLA_IR_LHLO_OPS_H_ + +#include "llvm/ADT/StringRef.h" +#include "mlir/IR/Attributes.h" // TF:local_config_mlir +#include "mlir/IR/Dialect.h" // TF:local_config_mlir +#include "mlir/IR/Location.h" // TF:local_config_mlir +#include "mlir/IR/MLIRContext.h" // TF:local_config_mlir +#include "mlir/IR/OpDefinition.h" // TF:local_config_mlir +#include "mlir/IR/Operation.h" // TF:local_config_mlir +#include "mlir/IR/StandardTypes.h" // TF:local_config_mlir +#include "mlir/IR/Types.h" // TF:local_config_mlir +#include "mlir/Support/Functional.h" // TF:local_config_mlir + +namespace mlir { +class OpBuilder; + +namespace xla_lhlo { + +class XlaLhloDialect : public Dialect { + public: + explicit XlaLhloDialect(MLIRContext *context); + static StringRef getDialectNamespace() { return "xla_lhlo"; } +}; + +#define GET_OP_CLASSES +#include "tensorflow/compiler/mlir/xla/ir/lhlo_ops.h.inc" + +} // namespace xla_lhlo +} // end namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_XLA_IR_LHLO_OPS_H_ diff --git a/tensorflow/compiler/mlir/xla/ir/lhlo_ops.td b/tensorflow/compiler/mlir/xla/ir/lhlo_ops.td new file mode 100644 index 00000000000..003247cca8c --- /dev/null +++ b/tensorflow/compiler/mlir/xla/ir/lhlo_ops.td @@ -0,0 +1,323 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This is the operation definition file for LXLA. + +#ifdef LHLO_OPS +#else +#define LHLO_OPS + +#ifdef OP_BASE +#else +include "mlir/IR/OpBase.td" +#endif // OP_BASE + +#ifdef HLO_OPS_BASE +#else +include "tensorflow/compiler/mlir/xla/ir/hlo_ops_base.td" +#endif + +def LHLO_Dialect : Dialect { + let name = "xla_lhlo"; + let cppNamespace = "xla_lhlo"; +} + +//===----------------------------------------------------------------------===// +// XLA type definitions. 
+//===----------------------------------------------------------------------===// + +// Any integer tensor types +def LHLO_IntBuffer : StaticShapeMemRefOf<[HLO_Int]>; + +// Any floating-point tensor types +def LHLO_FpBuffer : StaticShapeMemRefOf<[AnyFloat]>; + + +def LHLO_PredBuffer : StaticShapeMemRefOf<[HLO_Pred]>; + +// Any integer or floating-point tensor types +def LHLO_IntOrFpBuffer : StaticShapeMemRefOf<[HLO_Int, AnyFloat]>; + +def LHLO_Buffer : StaticShapeMemRefOf<[AnyFloat, AnyInteger]>; + +def LHLO_TupleBuffer : NestedTupleOf<[LHLO_Buffer]>; + +def LHLO_BufferOrTuple : AnyTypeOf<[LHLO_Buffer, LHLO_TupleBuffer]>; + +//===----------------------------------------------------------------------===// +// XLA nullary op definitions. +//===----------------------------------------------------------------------===// + +class LHLO_Op traits> : Op; + +def LHLO_ConstOp : BASE_HLO_ConstOp, LHLO_Op<"constant", []> { + let arguments = (ins + ElementsAttr:$value, + LHLO_Buffer:$output + ); +} + +def LHLO_IotaOp : BASE_HLO_IotaOp, LHLO_Op<"iota", []> { + let arguments = (ins I64Attr:$iota_dimension, + LHLO_Buffer:$output); +} + +//===----------------------------------------------------------------------===// +// XLA unary elementwise op definitions. +//===----------------------------------------------------------------------===// +// See https://www.tensorflow.org/xla/operation_semantics#element-wise_unary_functions + +class LHLO_UnaryElementwiseOp : + LHLO_Op { + let arguments = (ins LHLO_Buffer:$input, + LHLO_Buffer:$output); +} + +def LHLO_AbsOp: LHLO_UnaryElementwiseOp<"abs">, BASE_HLO_AbsOp; + +def LHLO_ConvertOp : LHLO_UnaryElementwiseOp<"convert">, BASE_HLO_ConvertOp; + +def LHLO_ExpOp: LHLO_UnaryElementwiseOp<"exp">, BASE_HLO_ExpOp; + +def LHLO_NegOp: LHLO_UnaryElementwiseOp<"neg">, BASE_HLO_NegOp; + +def LHLO_SignOp: LHLO_UnaryElementwiseOp<"sign">, BASE_HLO_SignOp; + +def LHLO_TanhOp: LHLO_UnaryElementwiseOp<"tanh">, BASE_HLO_TanhOp; + +//===----------------------------------------------------------------------===// +// XLA binary elementwise op definitions. +//===----------------------------------------------------------------------===// + +class LHLO_BinaryElementwiseOp traits> : + LHLO_Op { + let arguments = (ins + LHLO_Buffer:$lhs, + LHLO_Buffer:$rhs, + LHLO_Buffer:$out, + BroadcastDimAttr:$broadcast_dimensions + ); +} + +def LHLO_AddOp : LHLO_BinaryElementwiseOp<"add", []>, BASE_HLO_AddOp; + +def LHLO_DivOp : LHLO_BinaryElementwiseOp<"div", []>, BASE_HLO_DivOp; + +def LHLO_MaxOp : LHLO_BinaryElementwiseOp<"max", []>, BASE_HLO_MaxOp; + +def LHLO_MinOp : LHLO_BinaryElementwiseOp<"min", []>, BASE_HLO_MinOp; + +def LHLO_MulOp : LHLO_BinaryElementwiseOp<"mul", []>, BASE_HLO_MulOp; + +def LHLO_SubOp : LHLO_BinaryElementwiseOp<"sub", []>, BASE_HLO_SubOp; + +def LHLO_AndOp: LHLO_BinaryElementwiseOp<"and", []>, BASE_HLO_AndOp; + +//===----------------------------------------------------------------------===// +// XLA control flow op definitions. +//===----------------------------------------------------------------------===// + +// TODO(b/139813999): specify required function signature in a type-safe way. +def LHLO_ReduceOp: LHLO_Op<"reduce", [SameVariadicOperandSize]>, BASE_HLO_ReduceOp { + let arguments = (ins + Variadic:$operands_and_init, + Variadic:$out, + SymbolRefAttr:$computation, + ElementsAttr:$dimensions + ); +} +//===----------------------------------------------------------------------===// +// XLA tuple op definitions. 
+//===----------------------------------------------------------------------===// + +def LHLO_GetTupleElementOp: LHLO_Op<"get_tuple_element", []>, BASE_HLO_GetTupleElementOp { + let arguments = (ins + LHLO_TupleBuffer:$input, + LHLO_BufferOrTuple:$out, + I32Attr:$index + ); +} + +def LHLO_TupleOp : LHLO_Op<"tuple", []>, BASE_HLO_TupleOp { + let arguments = (ins + Variadic:$val, + LHLO_TupleBuffer:$out); +} + +def LHLO_CompareOp: LHLO_Op<"compare", []>, BASE_HLO_CompareOp { + let arguments = (ins + LHLO_Buffer:$lhs, + LHLO_Buffer:$rhs, + LHLO_PredBuffer:$out, + BroadcastDimAttr:$broadcast_dimensions, + HLO_ComparisonDirectionAttr:$comparison_direction + ); +} + +//===----------------------------------------------------------------------===// +// XLA Slice definitions. +//===----------------------------------------------------------------------===// + +def LHLO_SliceOp: LHLO_Op< + "slice", + [AllTypesMatch<["start_indices", "limit_indices"]>]> { + let arguments = (ins + LHLO_Buffer:$operand, + LHLO_Buffer:$output, + ElementsAttr:$start_indices, + ElementsAttr:$limit_indices + ); +} + +def HLO_DynamicUpdateSliceOp: LHLO_Op<"dynamic-update-slice", []> { + let arguments = (ins + LHLO_Buffer:$operand, + LHLO_Buffer:$update, + LHLO_Buffer:$output, + Variadic:$start_indices + ); +} + +//===----------------------------------------------------------------------===// +// XLA Other op definitions. +//===----------------------------------------------------------------------===// + +def HLO_BatchNormInferenceOp : LHLO_Op<"batch_norm_inference", []>, + BASE_HLO_BatchNormInferenceOp { + + let arguments = (ins + LHLO_Buffer:$operand, + LHLO_Buffer:$scale, + LHLO_Buffer:$offset, + LHLO_Buffer:$mean, + LHLO_Buffer:$variance, + LHLO_Buffer:$output, + F32Attr:$epsilon, + I64Attr:$feature_index + ); +} + +def LHLO_BroadcastOp : LHLO_Op<"broadcast", + []>, BASE_HLO_BroadcastOp { + let arguments = (ins + LHLO_Buffer:$operand, + LHLO_Buffer:$output, + ElementsAttr:$broadcast_sizes + ); +} + +def LHLO_BroadcastInDimOp : LHLO_Op<"broadcast_in_dim", + []>, BASE_HLO_BroadcastInDimOp { + let arguments = (ins + LHLO_Buffer:$operand, + LHLO_Buffer:$output, + BroadcastDimAttr:$broadcast_dimensions + ); +} + +def LHLO_ClampOp : LHLO_Op<"clamp", []>, BASE_HLO_ClampOp { + let arguments = (ins + LHLO_Buffer:$min, + LHLO_Buffer:$operand, + LHLO_Buffer:$max, + LHLO_Buffer:$output + ); +} + +def LHLO_ConcatenateOp : LHLO_Op<"concatenate", []>, BASE_HLO_ConcatenateOp { + let arguments = (ins + Variadic:$val, + LHLO_Buffer:$output, + I64Attr: $dimension + ); +} + +def LHLO_ConvOp : LHLO_Op<"conv", []>, BASE_HLO_ConvOp { + let arguments = (ins + LHLO_Buffer:$lhs, + LHLO_Buffer:$rhs, + LHLO_Buffer:$output + ); +} + +def LHLO_DotOp: LHLO_Op<"dot", []>, BASE_HLO_DotOp { + let arguments = (ins + LHLO_Buffer:$lhs, + LHLO_Buffer:$rhs, + HLO_PrecisionConfigAttr:$precision_config, + LHLO_Buffer:$output + ); +} + +def LHLO_GatherOp: LHLO_Op<"gather", []>, BASE_HLO_GatherOp { + let arguments = (ins + LHLO_Buffer:$operand, + LHLO_IntBuffer:$start_indices, + I64Attr: $index_vector_dim, + ElementsAttr: $offset_dims, + ElementsAttr: $slice_sizes, + ElementsAttr: $collapsed_slice_dims, + ElementsAttr: $start_index_map, + LHLO_Buffer:$output + ); +} + +def LHLO_ReshapeOp: LHLO_Op<"reshape", []>, BASE_HLO_ReshapeOp { + let arguments = (ins + LHLO_Buffer:$operand, + LHLO_Buffer:$output + ); +} + + +def LHLO_SelectOp: LHLO_Op<"select", []>, BASE_HLO_SelectOp { + let arguments = (ins + LHLO_PredBuffer:$pred, + LHLO_Buffer:$on_true, + 
LHLO_Buffer:$on_false, + LHLO_Buffer:$output + ); +} + +def LHLO_ReverseOp: LHLO_Op<"reverse", []>, BASE_HLO_ReverseOp { + let arguments = (ins + LHLO_Buffer:$operand, + ElementsAttr:$dimensions, + LHLO_Buffer:$output + ); +} + +def LHLO_PadOp: LHLO_Op<"pad", []>, BASE_HLO_PadOp { + let arguments = (ins + LHLO_Buffer:$operand, + LHLO_Buffer:$padding_value, + ElementsAttr: $edge_padding_low, + ElementsAttr: $edge_padding_high, + ElementsAttr: $interior_padding, + LHLO_Buffer: $output + ); +} + +def LHLO_TransposeOp: LHLO_Op<"transpose", []>, BASE_HLO_TransposeOp { + let arguments = (ins + LHLO_Buffer:$operand, + ElementsAttr:$permutation, + LHLO_Buffer:$output + ); +} + + +#endif // LHLO_OPS diff --git a/tensorflow/compiler/mlir/xla/ir/xla_ops.cc b/tensorflow/compiler/mlir/xla/ir/xla_ops.cc deleted file mode 100644 index 25da9da3d1d..00000000000 --- a/tensorflow/compiler/mlir/xla/ir/xla_ops.cc +++ /dev/null @@ -1,198 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// This file defines the operations used in the XLA dialect. - -#include "tensorflow/compiler/mlir/xla/ir/xla_ops.h" - -#include "llvm/Support/FormatVariadic.h" -#include "mlir/IR/Attributes.h" // TF:local_config_mlir -#include "mlir/IR/Builders.h" // TF:local_config_mlir -#include "mlir/IR/OpImplementation.h" // TF:local_config_mlir -#include "mlir/Support/TypeUtilities.h" // TF:local_config_mlir - -using namespace mlir; -using namespace mlir::XLA; - -XLADialect::XLADialect(MLIRContext* context) - : Dialect(getDialectNamespace(), context) { - addOperations< -#define GET_OP_LIST -#include "tensorflow/compiler/mlir/xla/ir/xla_ops.cc.inc" - >(); - - // Support unknown operations because not all XLA operations are registered. - allowUnknownOperations(); -} - -Operation* XLADialect::materializeConstant(OpBuilder& builder, Attribute value, - Type type, Location loc) { - // If this is an opaque elements attribute, then generate an xla.constant. - if (value.isa()) - return builder.create(loc, type, value.cast()); - return nullptr; -} - -#define GET_OP_CLASSES -#include "tensorflow/compiler/mlir/xla/ir/xla_ops.cc.inc" - -//===----------------------------------------------------------------------===// -// ConstOp -//===----------------------------------------------------------------------===// - -OpFoldResult ConstOp::fold(ArrayRef operands) { - assert(operands.empty() && "constant has no operands"); - - // Return the held attribute value. - return value(); -} - -// Builds a constant op with the specified attribute `value`. -void ConstOp::build(Builder* builder, OperationState* result, Attribute value) { - Type type; - if (auto elemAttr = value.dyn_cast()) { - type = elemAttr.getType(); - } else if (value.isa() || value.isa() || - value.isa()) { - // All XLA types must be tensor types. In the build() method, we want to - // provide more flexiblity by allowing attributes of scalar types. 
But we - // need to wrap it up with ElementsAttr to construct valid XLA constants. - type = RankedTensorType::get(/*shape=*/{}, value.getType()); - value = DenseElementsAttr::get(type.cast(), value); - } - - // TODO: support other XLA specific types. - assert(type && "unsupported attribute type for building xla.constant"); - result->types.push_back(type); - result->addAttribute("value", value); -} - -//===----------------------------------------------------------------------===// -// ConvertOp -//===----------------------------------------------------------------------===// - -OpFoldResult ConvertOp::fold(ArrayRef operands) { - assert(operands.size() == 1 && "convert must take one operand"); - auto operand = operands[0]; - - if (!operand) return {}; - - if (auto elementsAttr = operand.dyn_cast()) { - auto inType = elementsAttr.getType(); - auto outType = getResult()->getType().cast(); - - if (inType == outType) { - return operand; - } - - auto inElement = inType.getElementType(); - auto outElement = outType.getElementType(); - size_t bitWidth = - outElement.isBF16() ? 64 : outElement.getIntOrFloatBitWidth(); - - if (inElement.isa()) { - if (outElement.isa()) { - auto func = [&](const APFloat& floatValue) -> APInt { - return APInt(bitWidth, FloatAttr::getValueAsDouble(floatValue)); - }; - llvm::function_ref func_ref = func; - return elementsAttr.mapValues(outType.getElementType(), func_ref); - } - - if (outElement.isa()) { - auto& semantics = outElement.cast().getFloatSemantics(); - auto func = [&](const APFloat& floatValue) -> APInt { - APFloat newDouble(FloatAttr::getValueAsDouble(floatValue)); - bool losesInfo = false; - newDouble.convert(semantics, llvm::APFloat::rmNearestTiesToEven, - &losesInfo); - return newDouble.bitcastToAPInt(); - }; - llvm::function_ref func_ref = func; - return elementsAttr.mapValues(outType.getElementType(), func_ref); - } - } - - if (inElement.isa()) { - if (outElement.isa()) { - auto func = [&](const APInt& val) -> APInt { - return APInt(bitWidth, val.getLimitedValue()); - }; - llvm::function_ref func_ref = func; - return elementsAttr.mapValues(outType.getElementType(), func_ref); - } - - if (outElement.isa()) { - auto& semantics = outElement.cast().getFloatSemantics(); - auto func = [&](const APInt& val) -> APInt { - APFloat newDouble(static_cast(val.getLimitedValue())); - bool losesInfo = false; - newDouble.convert(semantics, llvm::APFloat::rmNearestTiesToEven, - &losesInfo); - return newDouble.bitcastToAPInt(); - }; - llvm::function_ref func_ref = func; - return elementsAttr.mapValues(outType.getElementType(), func_ref); - } - } - } - - return {}; -} - -//===----------------------------------------------------------------------===// -// IotaOp -//===----------------------------------------------------------------------===// - -OpFoldResult IotaOp::fold(ArrayRef operands) { - const auto output_type = getResult()->getType().cast(); - const auto output_size = output_type.getNumElements(); - const auto dimension = iota_dimension().getLimitedValue(); - const auto max_dim_size = output_type.getDimSize(dimension); - int bitwidth = output_type.getElementType().getIntOrFloatBitWidth(); - - llvm::SmallVector values; - values.reserve(output_size); - - int64_t increase_stride = output_size; - for (int i = 0; i <= dimension; i++) { - increase_stride /= output_type.getDimSize(i); - } - - int64_t current_value = 0; - for (int i = 0; i < output_size; i++) { - int64_t value = (current_value / increase_stride) % max_dim_size; - values.push_back(APInt(bitwidth, value)); - 
++current_value; - } - - return DenseIntElementsAttr::get(output_type, values); -} - -//===----------------------------------------------------------------------===// -// ReshapeOp -//===----------------------------------------------------------------------===// - -OpFoldResult ReshapeOp::fold(ArrayRef operands) { - assert(operands.size() == 1 && "convert must take one operand"); - auto operand = operands[0]; - if (!operand) return {}; - - if (auto elements = operand.dyn_cast()) { - return elements.reshape(getResult()->getType().cast()); - } - - return {}; -} diff --git a/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc b/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc index 2ec1324a1cf..230044d538b 100644 --- a/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc +++ b/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc @@ -22,14 +22,14 @@ limitations under the License. #include "llvm/Support/SMLoc.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/raw_ostream.h" +#include "mlir/Dialect/StandardOps/Ops.h" // TF:local_config_mlir #include "mlir/IR/Attributes.h" // TF:local_config_mlir #include "mlir/IR/Function.h" // TF:local_config_mlir #include "mlir/IR/Location.h" // TF:local_config_mlir #include "mlir/IR/MLIRContext.h" // TF:local_config_mlir #include "mlir/IR/Module.h" // TF:local_config_mlir #include "mlir/IR/Operation.h" // TF:local_config_mlir -#include "mlir/StandardOps/Ops.h" // TF:local_config_mlir -#include "tensorflow/compiler/mlir/xla/ir/xla_ops.h" +#include "tensorflow/compiler/mlir/xla/ir/hlo_ops.h" #include "tensorflow/compiler/mlir/xla/type_to_shape.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/comparison_util.h" @@ -37,11 +37,11 @@ limitations under the License. #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/xla_data.pb.h" +using tensorflow::int64; + static std::vector ConvertDenseIntAttr(mlir::DenseIntElementsAttr attr) { - llvm::ArrayRef raw_data = attr.getValues(); - if (attr.isSplat()) - return std::vector(attr.getType().getNumElements(), raw_data[0]); - return raw_data; + auto values = attr.getValues(); + return {values.begin(), values.end()}; } // Converts the broadcast_dimensions attribute into a span of dimension numbers @@ -154,7 +154,7 @@ class ConvertToHloModule { // if an error was encountered. LogicalResult RunOnFunction(mlir::FuncOp f); - xla::HloModuleProto ConsumeMainProto() { + ::xla::HloModuleProto ConsumeMainProto() { return lowered_computation_[module_.lookupSymbol("main")] .proto(); } @@ -176,8 +176,8 @@ LogicalResult Lower(mlir::Operation* inst, xla::XlaBuilder* builder, if (auto xla_op = CreateXlaOperator(inst, value_lowering)) return success(); // TODO(riverriddle) We currently don't support lowering constant operations. - if (isa(inst)) { - inst->emitError("unable to lower 'xla.constant' operation"); + if (isa(inst)) { + inst->emitError("unable to lower 'xla_hlo.constant' operation"); return failure(); } diff --git a/tensorflow/compiler/mlir/xla/operator_writer_gen.cc b/tensorflow/compiler/mlir/xla/operator_writer_gen.cc index 0fb315b90f9..6aecf70b385 100644 --- a/tensorflow/compiler/mlir/xla/operator_writer_gen.cc +++ b/tensorflow/compiler/mlir/xla/operator_writer_gen.cc @@ -18,7 +18,7 @@ limitations under the License. 
#include "llvm/ADT/StringExtras.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/FormatVariadic.h" -#include "llvm/Support/PrettyStackTrace.h" +#include "llvm/Support/InitLLVM.h" #include "llvm/Support/Signals.h" #include "llvm/TableGen/Error.h" #include "llvm/TableGen/Main.h" @@ -51,8 +51,8 @@ static std::string GetConversionFunction( return "Convert_" + named_attr.name.str(); } -using ArgumentName = string; -using ArgumentDeclaration = string; +using ArgumentName = std::string; +using ArgumentDeclaration = std::string; using Argument = std::pair; using ArgumentList = std::vector; @@ -63,7 +63,7 @@ static std::string BuildOperator(const Operator& op) { // Signature. os << "static xla::XlaOp " << GetOperatorBuilderName(op_name) - << "(mlir::XLA::" << op_name.str() << " xla_op, " + << "(mlir::xla_hlo::" << op_name.str() << " xla_op, " << "llvm::DenseMap* " "value_lowering) {\n"; @@ -148,7 +148,7 @@ static void EmitBuilder(const std::vector& defs, StringRef op_name = def->getName().drop_front(4); // Try to cast to each op and call the corresponding op builder. - os << " if (auto xla_op = llvm::dyn_cast(op))\n return " << GetOperatorBuilderName(op_name) << "(xla_op, value_lowering);\n"; } @@ -163,17 +163,17 @@ static void EmitBuilder(const std::vector& defs, static bool OperatorWritersMain(raw_ostream& os, RecordKeeper& records) { emitSourceFileHeader("MLIR XLA Builders", os); - // Retrieve all the definitions derived from XLA_Op and sort by record name. - std::vector defs = records.getAllDerivedDefinitions("XLA_Op"); + // Retrieve all the definitions derived from HLO_Op and sort by record name. + std::vector defs = records.getAllDerivedDefinitions("HLO_Op"); llvm::sort(defs, LessRecord()); for (const auto* def : defs) { // XLA ops in the .td file are expected to follow the naming convention: - // XLA_Op. - // The generated XLA op C++ class should be XLA::Op. - if (!def->getName().startswith("XLA_")) + // HLO_Op. + // The generated XLA op C++ class should be HLO::Op. 
+ if (!def->getName().startswith("HLO_")) PrintFatalError(def->getLoc(), - "unexpected op name format: 'XLA_' prefix missing"); + "unexpected op name format: 'HLO_' prefix missing"); if (!def->getName().endswith("Op")) PrintFatalError(def->getLoc(), "unexpected op name format: 'Op' suffix missing"); @@ -187,10 +187,7 @@ static bool OperatorWritersMain(raw_ostream& os, RecordKeeper& records) { } int main(int argc, char** argv) { - llvm::sys::PrintStackTraceOnErrorSignal(argv[0]); - llvm::PrettyStackTraceProgram X(argc, argv); - - llvm::llvm_shutdown_obj Y; + llvm::InitLLVM y(argc, argv); llvm::cl::ParseCommandLineOptions(argc, argv); return TableGenMain(argv[0], &OperatorWritersMain); } diff --git a/tensorflow/compiler/mlir/xla/tests/convert.mlir b/tensorflow/compiler/mlir/xla/tests/convert.mlir index 93de3b30ec0..76cdab37a4e 100644 --- a/tensorflow/compiler/mlir/xla/tests/convert.mlir +++ b/tensorflow/compiler/mlir/xla/tests/convert.mlir @@ -1,218 +1,203 @@ -// RUN: tf-opt %s -split-input-file -xla-legalize-to-std | FileCheck %s +// RUN: tf-opt %s -split-input-file -canonicalize | FileCheck %s // ----- -// CHECK-LABEL: func @convert.1(%arg0: tensor) -> tensor { -func @convert.1(%arg0: tensor) -> tensor { - // CHECK-NEXT: %0 = "xla.convert"(%arg0) : (tensor) -> tensor - %0 = "xla.convert"(%arg0) : (tensor) -> tensor - // CHECK-NEXT: return %0 : tensor +// CHECK-LABEL: func @same_type +// CHECK-SAME: [[ARG:%[a-zA-Z0-9]+]] +func @same_type(%arg: tensor) -> tensor { + %0 = "xla_hlo.convert"(%arg) : (tensor) -> tensor + // CHECK-NEXT: return [[ARG]] return %0 : tensor } // ----- -// CHECK-LABEL: func @convert.2(%arg0: tensor) -> tensor { -func @convert.2(%arg0: tensor) -> tensor { - // CHECK-NEXT: %0 = "xla.convert"(%arg0) : (tensor) -> tensor - %0 = "xla.convert"(%arg0) : (tensor) -> tensor - // CHECK-NEXT: return %0 : tensor - return %0 : tensor -} - -// ----- - -// CHECK-LABEL: func @convert.3(%arg0: tensor) -> tensor { -func @convert.3(%arg0: tensor) -> tensor { - // CHECK-NEXT: %0 = "xla.convert"(%arg0) : (tensor) -> tensor - %0 = "xla.convert"(%arg0) : (tensor) -> tensor - // CHECK-NEXT: return %0 : tensor +// CHECK-LABEL: func @int_widening +// CHECK-SAME: [[ARG:%[a-zA-Z0-9]+]] +func @int_widening(%arg: tensor) -> tensor { + // CHECK-NEXT: [[RES:%.+]] = "xla_hlo.convert"([[ARG]]) : (tensor) -> tensor + %0 = "xla_hlo.convert"(%arg) : (tensor) -> tensor + // CHECK-NEXT: return [[RES]] return %0 : tensor } // ----- -// CHECK-LABEL: func @convert.4(%arg0: tensor) -> tensor { -func @convert.4(%arg0: tensor) -> tensor { - // CHECK-NEXT: %0 = "xla.convert"(%arg0) : (tensor) -> tensor - %0 = "xla.convert"(%arg0) : (tensor) -> tensor - // CHECK-NEXT: return %0 : tensor - return %0 : tensor -} - -// ----- - -// CHECK-LABEL: func @convert.5(%arg0: tensor) -> tensor { -func @convert.5(%arg0: tensor) -> tensor { - // CHECK-NEXT: %0 = "xla.convert"(%arg0) : (tensor) -> tensor - %0 = "xla.convert"(%arg0) : (tensor) -> tensor - // CHECK-NEXT: return %0 : tensor - return %0 : tensor -} - -// ----- - - -// CHECK-LABEL: func @convert.const.1() -> tensor { -func @convert.const.1() -> tensor { - // CHECK-NEXT: %cst = constant dense<4.200000e+01> : tensor - %cst = constant dense<42.0> : tensor - %0 = "xla.convert"(%cst) : (tensor) -> tensor - // CHECK-NEXT: return %cst : tensor - return %0 : tensor -} - -// ----- - -// check-label: func @convert.const.2() -> tensor { -func @convert.const.2() -> tensor { - // check-next: %cst = constant dense<42> : tensor - %cst = constant dense<42> : tensor - %0 = 
"xla.convert"(%cst) : (tensor) -> tensor - // check-next: return %cst : tensor - return %0 : tensor -} - -// ----- - -// CHECK-LABEL: func @convert.const.3() -> tensor { -func @convert.const.3() -> tensor { - // CHECK-NEXT: %cst = constant dense<42> : tensor - %cst = constant dense<42.0> : tensor - %0 = "xla.convert"(%cst) : (tensor) -> tensor - // CHECK-NEXT: return %cst : tensor - return %0 : tensor -} - -// ----- - -// CHECK-LABEL: func @convert.const.4() -> tensor { -func @convert.const.4() -> tensor { - // CHECK-NEXT: %cst = constant dense<4.200000e+01> : tensor - %cst = constant dense<42> : tensor - %0 = "xla.convert"(%cst) : (tensor) -> tensor - // CHECK-NEXT: return %cst : tensor - return %0 : tensor -} - -// ----- - -// CHECK-LABEL: func @convert.const.5() -> tensor { -func @convert.const.5() -> tensor { - // CHECK-NEXT: %cst = constant dense<4.200000e+01> : tensor - %cst = constant dense<42> : tensor - %0 = "xla.convert"(%cst) : (tensor) -> tensor - // CHECK-NEXT: return %cst : tensor - return %0 : tensor -} - -// ----- - -// CHECK-LABEL: func @convert.const.6() -> tensor { -func @convert.const.6() -> tensor { - // CHECK-NEXT: %cst = constant dense<42> : tensor - %cst = constant dense<42.0> : tensor - %0 = "xla.convert"(%cst) : (tensor) -> tensor - // CHECK-NEXT: return %cst : tensor +// CHECK-LABEL: func @int_narrowing +// CHECK-SAME: [[ARG:%[a-zA-Z0-9]+]] +func @int_narrowing(%arg: tensor) -> tensor { + // CHECK-NEXT: [[RES:%.+]] = "xla_hlo.convert"([[ARG]]) : (tensor) -> tensor + %0 = "xla_hlo.convert"(%arg) : (tensor) -> tensor + // CHECK-NEXT: return [[RES]] return %0 : tensor } // ----- -// CHECK-LABEL: func @convert.const.7() -> tensor { -func @convert.const.7() -> tensor { - // CHECK-NEXT: %cst = constant dense<42> : tensor - %cst = constant dense<42> : tensor - %0 = "xla.convert"(%cst) : (tensor) -> tensor - // CHECK-NEXT: return %cst : tensor +// CHECK-LABEL: func @float_int +// CHECK-SAME: [[ARG:%[a-zA-Z0-9]+]] +func @float_int(%arg: tensor) -> tensor { + // CHECK-NEXT: [[RES:%.+]] = "xla_hlo.convert"([[ARG]]) : (tensor) -> tensor + %0 = "xla_hlo.convert"(%arg) : (tensor) -> tensor + // CHECK-NEXT: return [[RES]] return %0 : tensor } // ----- -// CHECK-LABEL: func @convert.const.8() -> tensor { -func @convert.const.8() -> tensor { - // CHECK-NEXT: %cst = constant dense<42> : tensor - %cst = constant dense<42> : tensor - %0 = "xla.convert"(%cst) : (tensor) -> tensor - // CHECK-NEXT: return %cst : tensor - return %0 : tensor -} - -// ----- - -// CHECK-LABEL: func @convert.const.9() -> tensor { -func @convert.const.9() -> tensor { - // CHECK-NEXT: %cst = constant dense<4.200000e+01> : tensor - %cst = constant dense<42.0> : tensor - %0 = "xla.convert"(%cst) : (tensor) -> tensor - // CHECK-NEXT: return %cst : tensor +// CHECK-LABEL: func @int_float +// CHECK-SAME: [[ARG:%[a-zA-Z0-9]+]] +func @int_float(%arg: tensor) -> tensor { + // CHECK-NEXT: [[RES:%.+]] = "xla_hlo.convert"([[ARG]]) : (tensor) -> tensor + %0 = "xla_hlo.convert"(%arg) : (tensor) -> tensor + // CHECK-NEXT: return [[RES]] return %0 : tensor } // ----- -// CHECK-LABEL: func @convert.const.9() -> tensor { -func @convert.const.9() -> tensor { - // CHECK-NEXT: %cst = constant dense<4.200000e+01> : tensor - %cst = constant dense<42.0> : tensor - %0 = "xla.convert"(%cst) : (tensor) -> tensor - // CHECK-NEXT: return %cst : tensor +// CHECK-LABEL: func @high_rank_tensor +// CHECK-SAME: [[ARG:%[a-zA-Z0-9]+]] +func @high_rank_tensor(%arg: tensor<2x3xi32>) -> tensor<2x3xf32> { + // CHECK-NEXT: [[RES:%.+]] = 
"xla_hlo.convert"([[ARG]]) : (tensor<2x3xi32>) -> tensor<2x3xf32> + %0 = "xla_hlo.convert"(%arg) : (tensor<2x3xi32>) -> tensor<2x3xf32> + // CHECK-NEXT: return [[RES]] + return %0 : tensor<2x3xf32> +} + +// ----- + + +// CHECK-LABEL: func @const_same_type +func @const_same_type() -> tensor { + // CHECK-NEXT: [[CST:%.+]] = constant dense<42> : tensor + %cst = constant dense<42> : tensor + %0 = "xla_hlo.convert"(%cst) : (tensor) -> tensor + // CHECK-NEXT: return [[CST]] + return %0 : tensor +} + +// ----- + +// CHECK-LABEL: func @const_float_int +func @const_float_int() -> tensor { + // CHECK-NEXT: [[CST:%.+]] = constant dense<42> : tensor + %cst = constant dense<42.0> : tensor + %0 = "xla_hlo.convert"(%cst) : (tensor) -> tensor + // CHECK-NEXT: return [[CST]] + return %0 : tensor +} + +// ----- + +// CHECK-LABEL: func @const_int_float +func @const_int_float() -> tensor { + // CHECK-NEXT: [[CST:%.+]] = constant dense<4.{{0*}}e+00> : tensor + %cst = constant dense<4> : tensor + %0 = "xla_hlo.convert"(%cst) : (tensor) -> tensor + // CHECK-NEXT: return [[CST]] + return %0 : tensor +} + +// ----- + +// CHECK-LABEL: func @const_int_bf16 +func @const_int_bf16() -> tensor { + // CHECK-NEXT: [[CST:%.+]] = constant dense<4.{{0*}}e+00> : tensor + %cst = constant dense<4> : tensor + %0 = "xla_hlo.convert"(%cst) : (tensor) -> tensor + // CHECK-NEXT: return [[CST]] return %0 : tensor } // ----- -// CHECK-LABEL: func @convert.const.10() -> tensor { -func @convert.const.10() -> tensor { - // CHECK-NEXT: %cst = constant dense<4.200000e+01> : tensor - %cst = constant dense<42.0> : tensor - %0 = "xla.convert"(%cst) : (tensor) -> tensor - // CHECK-NEXT: return %cst : tensor - return %0 : tensor +// CHECK-LABEL: func @const_bf16_int +func @const_bf16_int() -> tensor { + // CHECK-NEXT: [[CST:%.+]] = constant dense<42> : tensor + %cst = constant dense<42.0> : tensor + %0 = "xla_hlo.convert"(%cst) : (tensor) -> tensor + // CHECK-NEXT: return [[CST]] + return %0 : tensor } // ----- -// CHECK-LABEL: func @convert.const.11() -> tensor { -func @convert.const.11() -> tensor { - // CHECK-NEXT: %cst = constant dense<4.200000e+01> : tensor - %cst = constant dense<42.0> : tensor - %0 = "xla.convert"(%cst) : (tensor) -> tensor - // CHECK-NEXT: return %cst : tensor - return %0 : tensor +// CHECK-LABEL: func @const_int_narrowing +func @const_int_narrowing() -> tensor { + // CHECK-NEXT: [[CST:%.+]] = constant dense<42> : tensor + %cst = constant dense<42> : tensor + %0 = "xla_hlo.convert"(%cst) : (tensor) -> tensor + // CHECK-NEXT: return [[CST]] + return %0 : tensor } - // ----- -// CHECK-LABEL: func @convert.const.12() -> tensor { -func @convert.const.12() -> tensor { - // CHECK-NEXT: %cst = constant dense<42> : tensor - %cst = constant dense<42.0> : tensor - %0 = "xla.convert"(%cst) : (tensor) -> tensor - // CHECK-NEXT: return %cst : tensor +// CHECK-LABEL: func @const_int_widening +func @const_int_widening() -> tensor { + // CHECK-NEXT: [[CST:%.+]] = constant dense<42> : tensor + %cst = constant dense<42> : tensor + %0 = "xla_hlo.convert"(%cst) : (tensor) -> tensor + // CHECK-NEXT: return [[CST]] return %0 : tensor } // ----- -// CHECK-LABEL: func @convert.const.13() -> tensor { -func @convert.const.13() -> tensor { - // CHECK-NEXT: %cst = constant dense<42> : tensor - %cst = constant dense<42> : tensor - %0 = "xla.convert"(%cst) : (tensor) -> tensor - // CHECK-NEXT: return %cst : tensor - return %0 : tensor +// CHECK-LABEL: func @const_float_narrowing +func @const_float_narrowing() -> tensor { + // CHECK-NEXT: [[CST:%.+]] 
= constant dense<4.2{{0*}}e+00> : tensor + %cst = constant dense<4.2> : tensor + %0 = "xla_hlo.convert"(%cst) : (tensor) -> tensor + // CHECK-NEXT: return [[CST]] + return %0 : tensor } // ----- -// CHECK-LABEL: func @convert.const.14() -> tensor { -func @convert.const.14() -> tensor { - // CHECK-NEXT: %cst = constant dense<4.200000e+01> : tensor - %cst = constant dense<42> : tensor - %0 = "xla.convert"(%cst) : (tensor) -> tensor - // CHECK-NEXT: return %cst : tensor +// CHECK-LABEL: func @const_f32_bf16 +func @const_f32_bf16() -> tensor { + // CHECK-NEXT: [[CST:%.+]] = constant dense<4.2{{0*}}e+01> : tensor + %cst = constant dense<42.0> : tensor + %0 = "xla_hlo.convert"(%cst) : (tensor) -> tensor + // CHECK-NEXT: return [[CST]] + return %0 : tensor +} + +// ----- + +// CHECK-LABEL: func @const_bf16_f64 +func @const_bf16_f64() -> tensor { + // CHECK-NEXT: [[CST:%.+]] = constant dense<4.2{{0*}}e+00> : tensor + %cst = constant dense<4.2> : tensor + %0 = "xla_hlo.convert"(%cst) : (tensor) -> tensor + // CHECK-NEXT: return [[CST]] return %0 : tensor } + +// ----- + +// CHECK-LABEL: func @const_bf16_int +func @const_bf16_int() -> tensor { + // CHECK-NEXT: [[CST:%.+]] = constant dense<42> : tensor + %cst = constant dense<42.0> : tensor + %0 = "xla_hlo.convert"(%cst) : (tensor) -> tensor + // CHECK-NEXT: return [[CST]] + return %0 : tensor +} + + +// ----- + +// CHECK-LABEL: func @const_high_rank_tensor +func @const_high_rank_tensor() -> tensor<2x3xi32> { + // CHECK-NEXT: [[CST:%.+]] = constant dense<[ + // CHECK-SAME: [1, 2, 3], [4, 5, 6] + // CHECK-SAME: ]> : tensor<2x3xi32> + %cst = constant dense<[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]> : tensor<2x3xf32> + %0 = "xla_hlo.convert"(%cst) : (tensor<2x3xf32>) -> tensor<2x3xi32> + // CHECK-NEXT: return [[CST]] + return %0 : tensor<2x3xi32> +} + diff --git a/tensorflow/compiler/mlir/xla/tests/iota.mlir b/tensorflow/compiler/mlir/xla/tests/iota.mlir index 10559a4bfe8..46e0984cd77 100644 --- a/tensorflow/compiler/mlir/xla/tests/iota.mlir +++ b/tensorflow/compiler/mlir/xla/tests/iota.mlir @@ -5,7 +5,7 @@ // CHECK-LABEL: func @iota.const.1() -> tensor<4xi32> { func @iota.const.1() -> tensor<4xi32> { // CHECK-NEXT: %cst = constant dense<[0, 1, 2, 3]> : tensor<4xi32> - %0 = "xla.iota"() {iota_dimension = 0 : i64} : () -> tensor<4xi32> + %0 = "xla_hlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<4xi32> // CHECK-NEXT: return %cst : tensor<4xi32> return %0 : tensor<4xi32> } @@ -15,7 +15,7 @@ func @iota.const.1() -> tensor<4xi32> { // CHECK-LABEL: func @iota.const.2() -> tensor<2x4xi32> { func @iota.const.2() -> tensor<2x4xi32> { // CHECK-NEXT: %cst = constant dense<{{\[\[}}0, 0, 0, 0], [1, 1, 1, 1]]> : tensor<2x4xi32> - %0 = "xla.iota"() {iota_dimension = 0 : i64} : () -> tensor<2x4xi32> + %0 = "xla_hlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<2x4xi32> // CHECK-NEXT: return %cst : tensor<2x4xi32> return %0 : tensor<2x4xi32> } @@ -25,7 +25,7 @@ func @iota.const.2() -> tensor<2x4xi32> { // CHECK-LABEL: func @iota.const.3() -> tensor<2x4xi32> { func @iota.const.3() -> tensor<2x4xi32> { // CHECK-NEXT: %cst = constant dense<{{\[\[}}0, 1, 2, 3], [0, 1, 2, 3]]> : tensor<2x4xi32> - %0 = "xla.iota"() {iota_dimension = 1 : i64} : () -> tensor<2x4xi32> + %0 = "xla_hlo.iota"() {iota_dimension = 1 : i64} : () -> tensor<2x4xi32> // CHECK-NEXT: return %cst : tensor<2x4xi32> return %0 : tensor<2x4xi32> } @@ -35,7 +35,7 @@ func @iota.const.3() -> tensor<2x4xi32> { // CHECK-LABEL: func @iota.const.4() -> tensor<2x3x4xi32> { func @iota.const.4() -> tensor<2x3x4xi32> { 
// CHECK-NEXT: %cst = constant dense<{{\[\[\[}}0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0{{\]\]}}, {{\[\[}}1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]]]> : tensor<2x3x4xi32> - %0 = "xla.iota"() {iota_dimension = 0 : i64} : () -> tensor<2x3x4xi32> + %0 = "xla_hlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<2x3x4xi32> // CHECK-NEXT: return %cst : tensor<2x3x4xi32> return %0 : tensor<2x3x4xi32> } @@ -45,7 +45,7 @@ func @iota.const.4() -> tensor<2x3x4xi32> { // CHECK-LABEL: func @iota.const.5() -> tensor<2x3x4xi32> { func @iota.const.5() -> tensor<2x3x4xi32> { // CHECK-NEXT: %cst = constant dense<{{\[\[\[}}0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2{{\]\]}}, {{\[\[}}0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2]]]> : tensor<2x3x4xi32> - %0 = "xla.iota"() {iota_dimension = 1 : i64} : () -> tensor<2x3x4xi32> + %0 = "xla_hlo.iota"() {iota_dimension = 1 : i64} : () -> tensor<2x3x4xi32> // CHECK-NEXT: return %cst : tensor<2x3x4xi32> return %0 : tensor<2x3x4xi32> } @@ -55,7 +55,7 @@ func @iota.const.5() -> tensor<2x3x4xi32> { // CHECK-LABEL: func @iota.const.6() -> tensor<2x3x4xi32> { func @iota.const.6() -> tensor<2x3x4xi32> { // CHECK-NEXT: %cst = constant dense<{{\[\[\[}}0, 1, 2, 3], [0, 1, 2, 3], [0, 1, 2, 3{{\]\]}}, {{\[\[}}0, 1, 2, 3], [0, 1, 2, 3], [0, 1, 2, 3]]]> : tensor<2x3x4xi32> - %0 = "xla.iota"() {iota_dimension = 2 : i64} : () -> tensor<2x3x4xi32> + %0 = "xla_hlo.iota"() {iota_dimension = 2 : i64} : () -> tensor<2x3x4xi32> // CHECK-NEXT: return %cst : tensor<2x3x4xi32> return %0 : tensor<2x3x4xi32> } diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-control-flow.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-control-flow.mlir index 74dd0034283..92d9c3530fc 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-control-flow.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-control-flow.mlir @@ -2,16 +2,16 @@ // CHECK-LABEL: func @cond(%arg0: tensor) -> tensor { func @cond(%arg0: tensor) -> tensor { - // CHECK-NEXT: %0 = "xla.compare"(%arg0, %arg0) {comparison_direction = "LT", name = "compare.2"} : (tensor, tensor) -> tensor - %0 = "xla.compare"(%arg0, %arg0) {comparison_direction = "LT", name = "compare.2"} : (tensor, tensor) -> tensor + // CHECK-NEXT: %0 = "xla_hlo.compare"(%arg0, %arg0) {comparison_direction = "LT", name = "compare.2"} : (tensor, tensor) -> tensor + %0 = "xla_hlo.compare"(%arg0, %arg0) {comparison_direction = "LT", name = "compare.2"} : (tensor, tensor) -> tensor // CHECK-NEXT: return %0 : tensor return %0 : tensor } // CHECK-LABEL: func @loop(%arg0: tensor) -> tensor { func @loop(%arg0: tensor) -> tensor { - // CHECK-NEXT: %0 = xla.add %arg0, %arg0 {name = "compare.0"} : tensor - %0 = "xla.add"(%arg0, %arg0) {name = "compare.0"} : (tensor, tensor) -> tensor + // CHECK-NEXT: %0 = xla_hlo.add %arg0, %arg0 {name = "compare.0"} : tensor + %0 = "xla_hlo.add"(%arg0, %arg0) {name = "compare.0"} : (tensor, tensor) -> tensor // CHECK-NEXT: return %0 : tensor return %0 : tensor } @@ -27,7 +27,7 @@ func @main(%arg0: tensor) -> tensor { // CHECK-NEXT: %4 = call @loop(%3) : (tensor) -> tensor // CHECK-NEXT: br ^bb1(%4 : tensor) // CHECK-NEXT: b3(%5: tensor): // pred: ^bb1 - %0 = "xla.while"(%arg0) {body = @loop, cond = @cond} : (tensor) -> tensor + %0 = "xla_hlo.while"(%arg0) {body = @loop, cond = @cond} : (tensor) -> tensor // CHECK-NEXT: return %5 : tensor return %0 : tensor } diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir index 69be9789818..5b45862a2b3 100644 --- 
a/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir @@ -1,4 +1,4 @@ -// RUN: tf-opt -xla-legalize-tf %s | FileCheck %s +// RUN: tf-opt -xla-legalize-tf %s | FileCheck %s --dump-input-on-failure //===----------------------------------------------------------------------===// // BatchNorm op legalizations. @@ -6,7 +6,7 @@ // CHECK-LABEL: fusedBatchNorm_notraining func @fusedBatchNorm_notraining(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor<8xf32>, %arg2: tensor<8xf32>, %arg3: tensor<8xf32>, %arg4: tensor<8xf32>) -> (tensor<8x8x8x8xf32>) { - // CHECK-NEXT: "xla.batch_norm_inference"(%arg0, %arg1, %arg2, %arg3, %arg4) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> tensor<8x8x8x8xf32> + // CHECK-NEXT: "xla_hlo.batch_norm_inference"(%arg0, %arg1, %arg2, %arg3, %arg4) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> tensor<8x8x8x8xf32> %0:5 = "tf.FusedBatchNorm"(%arg0, %arg1, %arg2, %arg3, %arg4) {T = "tfdtype$DT_FLOAT", data_format = "NHWC", epsilon = 0.001 : f32, is_training = false} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) return %0#0 : tensor<8x8x8x8xf32> } @@ -25,14 +25,14 @@ func @fusedBatchNorm_training(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor<8xf32>, // CHECK-LABEL: func @biasAdd_NHWC func @biasAdd_NHWC(%arg0: tensor<1x32x10x32xi32>, %arg1: tensor<32xi32>) -> tensor<1x32x10x32xi32> { - // CHECK-NEXT: %0 = "xla.add"(%arg0, %arg1) {broadcast_dimensions = dense<3> : tensor<1xi64>} + // CHECK-NEXT: %0 = "xla_hlo.add"(%arg0, %arg1) {broadcast_dimensions = dense<3> : tensor<1xi64>} %0 = "tf.BiasAdd"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", data_format = "NHWC"} : (tensor<1x32x10x32xi32>, tensor<32xi32>) -> tensor<1x32x10x32xi32> return %0 : tensor<1x32x10x32xi32> } // CHECK-LABEL: func @biasAdd_NCHW func @biasAdd_NCHW(%arg0: tensor<1x32x10x32xi32>, %arg1: tensor<32xi32>) -> tensor<1x32x10x32xi32> { - // CHECK-NEXT: %0 = "xla.add"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-NEXT: %0 = "xla_hlo.add"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} %0 = "tf.BiasAdd"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", data_format = "NCHW"} : (tensor<1x32x10x32xi32>, tensor<32xi32>) -> tensor<1x32x10x32xi32> return %0 : tensor<1x32x10x32xi32> } @@ -42,14 +42,14 @@ func @biasAdd_NCHW(%arg0: tensor<1x32x10x32xi32>, %arg1: tensor<32xi32>) -> tens // CHECK-LABEL: func @biasAdd_NHWC_invalid func @biasAdd_NHWC_invalid(%arg0: tensor<1x32x10x2xi32>, %arg1: tensor<32xi32>) -> tensor<1x32x10x2xi32> { - // CHECK-NOT: xla.add + // CHECK-NOT: xla_hlo.add %0 = "tf.BiasAdd"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", data_format = "NHWC"} : (tensor<1x32x10x2xi32>, tensor<32xi32>) -> tensor<1x32x10x2xi32> return %0 : tensor<1x32x10x2xi32> } // CHECK-LABEL: func @biasAdd_NCHW_invalid func @biasAdd_NCHW_invalid(%arg0: tensor<1x10x10x32xi32>, %arg1: tensor<32xi32>) -> tensor<1x10x10x32xi32> { - // CHECK-NOT: xla.add + // CHECK-NOT: xla_hlo.add %0 = "tf.BiasAdd"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", data_format = "NCHW"} : (tensor<1x10x10x32xi32>, tensor<32xi32>) -> tensor<1x10x10x32xi32> return %0 : tensor<1x10x10x32xi32> } @@ -60,29 +60,31 @@ func @biasAdd_NCHW_invalid(%arg0: tensor<1x10x10x32xi32>, %arg1: tensor<32xi32>) 
// CHECK-LABEL: func @add func @add(%arg0: tensor<2xi32>) -> tensor<2xi32> { - // CHECK-NEXT: %0 = xla.add %arg0, %arg0 : tensor<2xi32> - // CHECK-NEXT: return %0 : tensor<2xi32> + // CHECK-NEXT: %[[SUM0:.*]] = xla_hlo.add %arg0, %arg0 : tensor<2xi32> + // CHECK-NEXT: %[[SUM1:.*]] = xla_hlo.add %[[SUM0]], %arg0 : tensor<2xi32> + // CHECK-NEXT: return %[[SUM1]] : tensor<2xi32> %0 = "tf.Add"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> - return %0: tensor<2xi32> + %1 = "tf.AddV2"(%0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> + return %1: tensor<2xi32> } // CHECK-LABEL: func @broadcast_add func @broadcast_add(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> { - // CHECK-NEXT: "xla.add"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-NEXT: "xla_hlo.add"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} %0 = "tf.Add"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> return %0: tensor<1x2xi32> } // CHECK-LABEL: func @broadcast_multi_dim_add func @broadcast_multi_dim_add(%arg0: tensor<4x1x1xi32>, %arg1: tensor<4x4x4x4xi32>) -> tensor<4x4x4x4xi32> { - // CHECK-NEXT: "xla.add"(%arg0, %arg1) {broadcast_dimensions = dense<[1, 2, 3]> : tensor<3xi64>} + // CHECK-NEXT: "xla_hlo.add"(%arg0, %arg1) {broadcast_dimensions = dense<[1, 2, 3]> : tensor<3xi64>} %0 = "tf.Add"(%arg0, %arg1) : (tensor<4x1x1xi32>, tensor<4x4x4x4xi32>) -> tensor<4x4x4x4xi32> return %0: tensor<4x4x4x4xi32> } // CHECK-LABEL: func @div func @div(%arg0: tensor<2xi32>) -> tensor<2xi32> { - // CHECK-NEXT: %0 = xla.div %arg0, %arg0 : tensor<2xi32> + // CHECK-NEXT: %0 = xla_hlo.div %arg0, %arg0 : tensor<2xi32> // CHECK-NEXT: return %0 : tensor<2xi32> %0 = "tf.Div"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> return %0: tensor<2xi32> @@ -90,14 +92,14 @@ func @div(%arg0: tensor<2xi32>) -> tensor<2xi32> { // CHECK-LABEL: func @broadcast_div func @broadcast_div(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> { - // CHECK-NEXT: "xla.div"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-NEXT: "xla_hlo.div"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} %0 = "tf.Div"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> return %0: tensor<1x2xi32> } // CHECK-LABEL: func @mul func @mul(%arg0: tensor<2xi32>) -> tensor<2xi32> { - // CHECK-NEXT: %0 = xla.mul %arg0, %arg0 : tensor<2xi32> + // CHECK-NEXT: %0 = xla_hlo.mul %arg0, %arg0 : tensor<2xi32> // CHECK-NEXT: return %0 : tensor<2xi32> %0 = "tf.Mul"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> return %0: tensor<2xi32> @@ -105,28 +107,28 @@ func @mul(%arg0: tensor<2xi32>) -> tensor<2xi32> { // CHECK-LABEL: func @broadcast_mul func @broadcast_mul(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> { - // CHECK-NEXT: "xla.mul"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-NEXT: "xla_hlo.mul"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} %0 = "tf.Mul"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> return %0: tensor<1x2xi32> } // CHECK-LABEL: func @real_div func @real_div(%arg0: tensor<2xi32>) -> tensor<2xi32> { - // CHECK-NEXT: %0 = xla.div %arg0, %arg0 : tensor<2xi32> + // CHECK-NEXT: %0 = xla_hlo.div %arg0, %arg0 : tensor<2xi32> %0 = "tf.RealDiv"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> return %0: tensor<2xi32> } // CHECK-LABEL: func @broadcast_real_div func 
@broadcast_real_div(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> { - // CHECK-NEXT: "xla.div"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-NEXT: "xla_hlo.div"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} %0 = "tf.RealDiv"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> return %0: tensor<1x2xi32> } // CHECK-LABEL: func @sub func @sub(%arg0: tensor<2xi32>) -> tensor<2xi32> { - // CHECK-NEXT: %0 = xla.sub %arg0, %arg0 : tensor<2xi32> + // CHECK-NEXT: %0 = xla_hlo.sub %arg0, %arg0 : tensor<2xi32> // CHECK-NEXT: return %0 : tensor<2xi32> %0 = "tf.Sub"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> return %0: tensor<2xi32> @@ -134,7 +136,7 @@ func @sub(%arg0: tensor<2xi32>) -> tensor<2xi32> { // CHECK-LABEL: func @broadcast_sub func @broadcast_sub(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> { - // CHECK-NEXT: "xla.sub"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-NEXT: "xla_hlo.sub"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} %0 = "tf.Sub"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> return %0: tensor<1x2xi32> } @@ -156,7 +158,7 @@ func @identity(%arg0: tensor<1xi32>) -> tensor<1xi32> { // CHECK-LABEL: @const func @const() -> tensor<2xi32> { - // tf.Const is legalized into xla.constant, which is folded into constant. + // tf.Const is legalized into xla_hlo.constant, which is folded into constant. // CHECK-NEXT: constant dense<0> : tensor<2xi32> %0 = "tf.Const"() {device = "", name = "", dtype = "tfdtype$DT_INT32", value = dense<0> : tensor<2xi32>} : () -> (tensor<2xi32>) @@ -170,7 +172,7 @@ func @const() -> tensor<2xi32> { // CHECK-LABEL: func @relu func @relu(%arg0: tensor<1xi32>) -> tensor<1xi32> { // CHECK-NEXT: %cst = constant dense<0> : tensor<1xi32> - // CHECK-NEXT: %0 = xla.max %arg0, %cst : tensor<1xi32> + // CHECK-NEXT: %0 = xla_hlo.max %arg0, %cst : tensor<1xi32> %0 = "tf.Relu"(%arg0) : (tensor<1xi32>) -> tensor<1xi32> return %0: tensor<1xi32> } @@ -179,7 +181,7 @@ func @relu(%arg0: tensor<1xi32>) -> tensor<1xi32> { func @relu6(%arg0: tensor<1xi32>) -> tensor<1xi32> { // CHECK-NEXT: %cst = constant dense<0> : tensor<1xi32> // CHECK-NEXT: %cst_0 = constant dense<6> : tensor<1xi32> - // CHECK-NEXT: %0 = "xla.clamp"(%cst, %arg0, %cst_0) : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi32> + // CHECK-NEXT: %0 = "xla_hlo.clamp"(%cst, %arg0, %cst_0) : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi32> %0 = "tf.Relu6"(%arg0) : (tensor<1xi32>) -> tensor<1xi32> return %0: tensor<1xi32> } @@ -190,7 +192,7 @@ func @relu6(%arg0: tensor<1xi32>) -> tensor<1xi32> { // CHECK-LABEL: reshape func @reshape(%arg0: tensor<2xf32>, %arg1: tensor<2xi32>) -> tensor<1x1xf32> { - // CHECK: %0 = "xla.reshape"(%arg0) : (tensor<2xf32>) -> tensor<1x1xf32> + // CHECK: %0 = "xla_hlo.reshape"(%arg0) : (tensor<2xf32>) -> tensor<1x1xf32> %0 = "tf.Reshape"(%arg0, %arg1) : (tensor<2xf32>, tensor<2xi32>) -> tensor<1x1xf32> return %0 : tensor<1x1xf32> } @@ -204,7 +206,7 @@ func @reshape_dynamic(%arg0: tensor<*xf32>, %arg1: tensor<2xi32>) -> tensor) -> tensor<1x10xf32> { - // CHECK-NEXT: %0 = "xla.reshape"(%arg0) : (tensor<1x1x10xf32>) -> tensor<1x10xf32> + // CHECK-NEXT: %0 = "xla_hlo.reshape"(%arg0) : (tensor<1x1x10xf32>) -> tensor<1x10xf32> %0 = "tf.Squeeze"(%arg0) : (tensor<1x1x10xf32>) -> tensor<1x10xf32> return %0 : tensor<1x10xf32> } diff --git 
a/tensorflow/compiler/mlir/xla/tests/legalize-to-std.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-to-std.mlir index d75b283e633..6dad19179f1 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-to-std.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-to-std.mlir @@ -3,16 +3,16 @@ // CHECK-LABEL: func @binary_ops_float(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { func @binary_ops_float(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { // CHECK-NEXT: %0 = addf %arg0, %arg1 : tensor<4xf32> - %0 = "xla.add"(%arg0, %arg1) {name = "add.3"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + %0 = "xla_hlo.add"(%arg0, %arg1) {name = "add.3"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> // CHECK-NEXT: %1 = mulf %0, %arg1 : tensor<4xf32> - %1 = "xla.mul"(%0, %arg1) {name = "mul.4"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + %1 = "xla_hlo.mul"(%0, %arg1) {name = "mul.4"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> // CHECK-NEXT: %2 = subf %1, %arg1 : tensor<4xf32> - %2 = "xla.sub"(%1, %arg1) {name = "sub.5"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + %2 = "xla_hlo.sub"(%1, %arg1) {name = "sub.5"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> // CHECK-NEXT: %3 = divf %2, %arg1 : tensor<4xf32> - %3 = "xla.div"(%2, %arg1) {name = "div.6"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + %3 = "xla_hlo.div"(%2, %arg1) {name = "div.6"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> // CHECK-NEXT: return %3 : tensor<4xf32> return %3 : tensor<4xf32> @@ -21,16 +21,16 @@ func @binary_ops_float(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf // CHECK-LABEL: func @binary_ops_int(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32> { func @binary_ops_int(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32> { // CHECK-NEXT: %0 = addi %arg0, %arg1 : tensor<4xi32> - %0 = "xla.add"(%arg0, %arg1) {name = "add.3"} : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> + %0 = "xla_hlo.add"(%arg0, %arg1) {name = "add.3"} : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> // CHECK-NEXT: %1 = muli %0, %arg1 : tensor<4xi32> - %1 = "xla.mul"(%0, %arg1) {name = "mul.4"} : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> + %1 = "xla_hlo.mul"(%0, %arg1) {name = "mul.4"} : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> // CHECK-NEXT: %2 = subi %1, %arg1 : tensor<4xi32> - %2 = "xla.sub"(%1, %arg1) {name = "sub.5"} : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> + %2 = "xla_hlo.sub"(%1, %arg1) {name = "sub.5"} : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> // CHECK-NEXT: %3 = divis %2, %arg1 : tensor<4xi32> - %3 = "xla.div"(%2, %arg1) {name = "div.6"} : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> + %3 = "xla_hlo.div"(%2, %arg1) {name = "div.6"} : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> // CHECK-NEXT: return %3 : tensor<4xi32> return %3 : tensor<4xi32> @@ -41,23 +41,23 @@ func @binary_ops_int(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32 // them to separate broadcast and binary op. 
// CHECK-LABEL: func @binary_ops_broadcast(%arg0: tensor<4x4xf32>, %arg1: tensor<4xf32>) -> tensor<4x4xf32> { func @binary_ops_broadcast(%arg0: tensor<4x4xf32>, %arg1: tensor<4xf32>) -> tensor<4x4xf32> { - // CHECK-NEXT: %0 = "xla.add"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, name = "add.3"} : (tensor<4x4xf32>, tensor<4xf32>) -> tensor<4x4xf32> - %0 = "xla.add"(%arg0, %arg1) { + // CHECK-NEXT: %0 = "xla_hlo.add"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, name = "add.3"} : (tensor<4x4xf32>, tensor<4xf32>) -> tensor<4x4xf32> + %0 = "xla_hlo.add"(%arg0, %arg1) { name = "add.3", broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<4x4xf32>, tensor<4xf32>) -> tensor<4x4xf32> - // CHECK-NEXT: %1 = "xla.mul"(%0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, name = "mul.4"} : (tensor<4x4xf32>, tensor<4xf32>) -> tensor<4x4xf32> - %1 = "xla.mul"(%0, %arg1) { + // CHECK-NEXT: %1 = "xla_hlo.mul"(%0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, name = "mul.4"} : (tensor<4x4xf32>, tensor<4xf32>) -> tensor<4x4xf32> + %1 = "xla_hlo.mul"(%0, %arg1) { name = "mul.4", broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<4x4xf32>, tensor<4xf32>) -> tensor<4x4xf32> - // CHECK-NEXT: %2 = "xla.sub"(%1, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, name = "sub.5"} : (tensor<4x4xf32>, tensor<4xf32>) -> tensor<4x4xf32> - %2 = "xla.sub"(%1, %arg1) { + // CHECK-NEXT: %2 = "xla_hlo.sub"(%1, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, name = "sub.5"} : (tensor<4x4xf32>, tensor<4xf32>) -> tensor<4x4xf32> + %2 = "xla_hlo.sub"(%1, %arg1) { name = "sub.5", broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<4x4xf32>, tensor<4xf32>) -> tensor<4x4xf32> - // CHECK-NEXT: %3 = "xla.div"(%2, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, name = "div.6"} : (tensor<4x4xf32>, tensor<4xf32>) -> tensor<4x4xf32> - %3 = "xla.div"(%2, %arg1) { + // CHECK-NEXT: %3 = "xla_hlo.div"(%2, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, name = "div.6"} : (tensor<4x4xf32>, tensor<4xf32>) -> tensor<4x4xf32> + %3 = "xla_hlo.div"(%2, %arg1) { name = "div.6", broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<4x4xf32>, tensor<4xf32>) -> tensor<4x4xf32> @@ -68,17 +68,17 @@ func @binary_ops_broadcast(%arg0: tensor<4x4xf32>, %arg1: tensor<4xf32>) -> tens // CHECK-LABEL: func @compare_int(%arg0: tensor<4xi32>) -> (tensor<4xi1>, tensor<4xi1>, tensor<4xi1>, tensor<4xi1>, tensor<4xi1>, tensor<4xi1>) { func @compare_int(%arg0: tensor<4xi32>) -> (tensor<4xi1>,tensor<4xi1>,tensor<4xi1>,tensor<4xi1>,tensor<4xi1>,tensor<4xi1>) { // CHECK-NEXT: %0 = cmpi "eq", %arg0, %arg0 : tensor<4xi32> - %0 = "xla.compare"(%arg0, %arg0) {comparison_direction = "EQ"} : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi1> + %0 = "xla_hlo.compare"(%arg0, %arg0) {comparison_direction = "EQ"} : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi1> // CHECK-NEXT: %1 = cmpi "ne", %arg0, %arg0 : tensor<4xi32> - %1 = "xla.compare"(%arg0, %arg0) {comparison_direction = "NE"} : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi1> + %1 = "xla_hlo.compare"(%arg0, %arg0) {comparison_direction = "NE"} : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi1> // CHECK-NEXT: %2 = cmpi "slt", %arg0, %arg0 : tensor<4xi32> - %2 = "xla.compare"(%arg0, %arg0) {comparison_direction = "LT"} : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi1> + %2 = "xla_hlo.compare"(%arg0, %arg0) {comparison_direction = "LT"} : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi1> // CHECK-NEXT: %3 = cmpi "sle", 
%arg0, %arg0 : tensor<4xi32> - %3 = "xla.compare"(%arg0, %arg0) {comparison_direction = "LE"} : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi1> + %3 = "xla_hlo.compare"(%arg0, %arg0) {comparison_direction = "LE"} : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi1> // CHECK-NEXT: %4 = cmpi "sgt", %arg0, %arg0 : tensor<4xi32> - %4 = "xla.compare"(%arg0, %arg0) {comparison_direction = "GT"} : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi1> + %4 = "xla_hlo.compare"(%arg0, %arg0) {comparison_direction = "GT"} : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi1> // CHECK-NEXT: %5 = cmpi "sge", %arg0, %arg0 : tensor<4xi32> - %5 = "xla.compare"(%arg0, %arg0) {comparison_direction = "GE"} : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi1> + %5 = "xla_hlo.compare"(%arg0, %arg0) {comparison_direction = "GE"} : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi1> // CHECK-NEXT: return %0, %1, %2, %3, %4, %5 : tensor<4xi1>, tensor<4xi1>, tensor<4xi1>, tensor<4xi1>, tensor<4xi1>, tensor<4xi1> return %0, %1, %2, %3, %4, %5 : tensor<4xi1>, tensor<4xi1>, tensor<4xi1>, tensor<4xi1>, tensor<4xi1>, tensor<4xi1> } @@ -86,17 +86,17 @@ func @compare_int(%arg0: tensor<4xi32>) -> (tensor<4xi1>,tensor<4xi1>,tensor<4xi // CHECK-LABEL: func @compare_float func @compare_float(%arg0: tensor<4xf32>) -> (tensor<4xi1>,tensor<4xi1>,tensor<4xi1>,tensor<4xi1>,tensor<4xi1>,tensor<4xi1>) { // CHECK-NEXT: %0 = cmpf "oeq", %arg0, %arg0 : tensor<4xf32> - %0 = "xla.compare"(%arg0, %arg0) {comparison_direction = "EQ"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xi1> + %0 = "xla_hlo.compare"(%arg0, %arg0) {comparison_direction = "EQ"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xi1> // CHECK-NEXT: %1 = cmpf "une", %arg0, %arg0 : tensor<4xf32> - %1 = "xla.compare"(%arg0, %arg0) {comparison_direction = "NE"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xi1> + %1 = "xla_hlo.compare"(%arg0, %arg0) {comparison_direction = "NE"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xi1> // CHECK-NEXT: %2 = cmpf "olt", %arg0, %arg0 : tensor<4xf32> - %2 = "xla.compare"(%arg0, %arg0) {comparison_direction = "LT"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xi1> + %2 = "xla_hlo.compare"(%arg0, %arg0) {comparison_direction = "LT"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xi1> // CHECK-NEXT: %3 = cmpf "ole", %arg0, %arg0 : tensor<4xf32> - %3 = "xla.compare"(%arg0, %arg0) {comparison_direction = "LE"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xi1> + %3 = "xla_hlo.compare"(%arg0, %arg0) {comparison_direction = "LE"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xi1> // CHECK-NEXT: %4 = cmpf "ogt", %arg0, %arg0 : tensor<4xf32> - %4 = "xla.compare"(%arg0, %arg0) {comparison_direction = "GT"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xi1> + %4 = "xla_hlo.compare"(%arg0, %arg0) {comparison_direction = "GT"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xi1> // CHECK-NEXT: %5 = cmpf "oge", %arg0, %arg0 : tensor<4xf32> - %5 = "xla.compare"(%arg0, %arg0) {comparison_direction = "GE"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xi1> + %5 = "xla_hlo.compare"(%arg0, %arg0) {comparison_direction = "GE"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xi1> return %0, %1, %2, %3, %4, %5: tensor<4xi1>, tensor<4xi1>, tensor<4xi1>, tensor<4xi1>, tensor<4xi1>, tensor<4xi1> } diff --git a/tensorflow/compiler/mlir/xla/tests/lhlo_ops.mlir b/tensorflow/compiler/mlir/xla/tests/lhlo_ops.mlir new file mode 100644 index 00000000000..070386a0393 --- /dev/null +++ b/tensorflow/compiler/mlir/xla/tests/lhlo_ops.mlir @@ -0,0 +1,143 @@ +// RUN: tf-opt %s -verify-diagnostics -split-input-file + +// ----- + 
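The new lhlo_ops.mlir suite checks the verifiers of the buffer-based xla_lhlo dialect. Unlike xla_hlo ops, which return tensor results as SSA values, xla_lhlo ops write into a pre-allocated output memref passed as the trailing operand and produce no results. The RUN line uses the standard mlir-opt-style flags: -split-input-file re-verifies each chunk delimited by // ----- in isolation, and -verify-diagnostics checks the expected-error annotations instead of piping output to FileCheck. A minimal sketch of the two forms for orientation (the function names below are illustrative and not part of this diff):

// Tensor form (xla_hlo): the result is a fresh SSA value.
func @tanh_tensor(%arg0: tensor<10xf32>) -> tensor<10xf32> {
  %0 = "xla_hlo.tanh"(%arg0) : (tensor<10xf32>) -> tensor<10xf32>
  return %0 : tensor<10xf32>
}

// Buffer form (xla_lhlo): the trailing operand is the output buffer; no results.
func @tanh_buffer(%in: memref<10xf32>, %out: memref<10xf32>) -> () {
  "xla_lhlo.tanh"(%in, %out) : (memref<10xf32>, memref<10xf32>) -> ()
  return
}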
+func @enforce_static_shapes(%arg0: memref, %arg1: memref) -> () { + // expected-error@+1{{op operand #0 must be statically shaped memref of floating-point or integer values}} + "xla_lhlo.tanh"(%arg0, %arg1) : (memref, memref) -> () + return +} + +// ----- + +func @enforce_same_shape(%arg0: memref<1xf32>, %arg1: memref<2xf32>) -> () { + // expected-error@+1{{'xla_lhlo.tanh' op requires all operands to have the same type}} + "xla_lhlo.tanh"(%arg0, %arg1) : (memref<1xf32>, memref<2xf32>) -> () + return +} + +// ----- + +// CHECK-LABEL: func @add_memrefs +func @add_memrefs(%arg0: memref<1xi32>, %arg1: memref<1xi32>, %arg_out: memref<1xi32>) -> () { + "xla_lhlo.add"(%arg0, %arg1, %arg_out) : (memref<1xi32>, memref<1xi32>, memref<1xi32>) -> () + return +} + +// ----- + +// CHECK-LABEL: func @abs_memref +func @abs_memref(%in: memref<10xf32>, %out: memref<10xf32>) -> () { + "xla_lhlo.abs"(%in, %out) : (memref<10xf32>, memref<10xf32>) -> () + return +} + +// ----- + +// CHECK-LABEL: func @convert_memref +func @convert_memref(%in: memref<10xf32>, %out: memref<10xf32>) -> () { + "xla_lhlo.convert"(%in, %out) : (memref<10xf32>, memref<10xf32>) -> () + return +} + +// ----- + +// CHECK-LABEL: func @exp_memref +func @exp_memref(%in: memref<10xf32>, %out: memref<10xf32>) -> () { + "xla_lhlo.exp"(%in, %out) : (memref<10xf32>, memref<10xf32>) -> () + return +} + +// ----- + +// CHECK-LABEL: func @neg_memref +func @neg_memref(%in: memref<10xf32>, %out: memref<10xf32>) -> () { + "xla_lhlo.neg"(%in, %out) : (memref<10xf32>, memref<10xf32>) -> () + return +} + +// ----- + +// CHECK-LABEL: func @sign_memref +func @sign_memref(%in: memref<10xf32>, %out: memref<10xf32>) -> () { + "xla_lhlo.sign"(%in, %out) : (memref<10xf32>, memref<10xf32>) -> () + return +} + +// ----- + +// CHECK-LABEL: func @tanh_memref +func @tanh_memref(%in: memref<10xf32>, %out: memref<10xf32>) -> () { + "xla_lhlo.tanh"(%in, %out) : (memref<10xf32>, memref<10xf32>) -> () + return +} + +// ----- + +// CHECK-LABEL: func @add_memref +func @add_memref(%lhs: memref<10xf32>, %rhs: memref<10xf32>, %out: memref<10xf32>) -> () { + "xla_lhlo.add"(%lhs, %rhs, %out) : (memref<10xf32>, memref<10xf32>, memref<10xf32>) -> () + return +} + +// ----- + +// CHECK-LABEL: func @div_memref +func @div_memref(%lhs: memref<10xf32>, %rhs: memref<10xf32>, %out: memref<10xf32>) -> () { + "xla_lhlo.div"(%lhs, %rhs, %out) : (memref<10xf32>, memref<10xf32>, memref<10xf32>) -> () + return +} + +// ----- + +// CHECK-LABEL: func @max_memref +func @max_memref(%lhs: memref<10xf32>, %rhs: memref<10xf32>, %out: memref<10xf32>) -> () { + "xla_lhlo.max"(%lhs, %rhs, %out) : (memref<10xf32>, memref<10xf32>, memref<10xf32>) -> () + return +} + +// ----- + +// CHECK-LABEL: func @min_memref +func @min_memref(%lhs: memref<10xf32>, %rhs: memref<10xf32>, %out: memref<10xf32>) -> () { + "xla_lhlo.min"(%lhs, %rhs, %out) : (memref<10xf32>, memref<10xf32>, memref<10xf32>) -> () + return +} + +// ----- + +// CHECK-LABEL: func @mul_memref +func @mul_memref(%lhs: memref<10xf32>, %rhs: memref<10xf32>, %out: memref<10xf32>) -> () { + "xla_lhlo.mul"(%lhs, %rhs, %out) : (memref<10xf32>, memref<10xf32>, memref<10xf32>) -> () + return +} + +// ----- + +// CHECK-LABEL: func @sub_memref +func @sub_memref(%lhs: memref<10xf32>, %rhs: memref<10xf32>, %out: memref<10xf32>) -> () { + "xla_lhlo.sub"(%lhs, %rhs, %out) : (memref<10xf32>, memref<10xf32>, memref<10xf32>) -> () + return +} + +// ----- + +// CHECK-LABEL: func @and_memref +func @and_memref(%lhs: memref<10xf32>, %rhs: memref<10xf32>, %out: 
memref<10xf32>) -> () { + "xla_lhlo.and"(%lhs, %rhs, %out) : (memref<10xf32>, memref<10xf32>, memref<10xf32>) -> () + return +} + +// ----- + +func @reduce_computation(%sum: memref<1xf32>, %element: memref<1xf32>) -> () { + "xla_lhlo.add"(%element, %sum, %sum) : (memref<1xf32>, memref<1xf32>, memref<1xf32>) -> () + return +} + +// CHECK-LABEL: func @reduce_memref +func @reduce_memref(%input: memref<10xf32>, %out: memref<1xf32>) -> () { + "xla_lhlo.reduce"(%input, %out) {computation = @reduce_computation, + dimensions = dense<[0]> : tensor<1xi64>} : (memref<10xf32>, memref<1xf32>) -> () + return +} \ No newline at end of file diff --git a/tensorflow/compiler/mlir/xla/tests/ops.mlir b/tensorflow/compiler/mlir/xla/tests/ops.mlir index fcd93bb1b97..06c98fb39b0 100644 --- a/tensorflow/compiler/mlir/xla/tests/ops.mlir +++ b/tensorflow/compiler/mlir/xla/tests/ops.mlir @@ -4,7 +4,7 @@ func @enforce_static_shapes(%arg0: tensor<*xf32>) -> tensor<*xf32> { // expected-error@+1 {{op operand #0 must be statically shaped tensor}} - %0 = "xla.tanh"(%arg0) : (tensor<*xf32>) -> tensor<*xf32> + %0 = "xla_hlo.tanh"(%arg0) : (tensor<*xf32>) -> tensor<*xf32> return %0: tensor<*xf32> } @@ -12,7 +12,7 @@ func @enforce_static_shapes(%arg0: tensor<*xf32>) -> tensor<*xf32> { // CHECK-LABEL: func @add_tensors func @add_tensors(%arg0: tensor<1xi32>, %arg1: tensor<1xi32>) -> tensor<1xi32> { - %0 = "xla.add"(%arg0, %arg1) : (tensor<1xi32>, tensor<1xi32>) -> tensor<1xi32> + %0 = "xla_hlo.add"(%arg0, %arg1) : (tensor<1xi32>, tensor<1xi32>) -> tensor<1xi32> return %0: tensor<1xi32> } @@ -20,7 +20,7 @@ func @add_tensors(%arg0: tensor<1xi32>, %arg1: tensor<1xi32>) -> tensor<1xi32> { // CHECK-LABEL: func @add_scalars func @add_scalars(%arg0: tensor, %arg1: tensor) -> tensor { - %0 = "xla.add"(%arg0, %arg1) : (tensor, tensor) -> tensor + %0 = "xla_hlo.add"(%arg0, %arg1) : (tensor, tensor) -> tensor return %0: tensor } @@ -28,7 +28,7 @@ func @add_scalars(%arg0: tensor, %arg1: tensor) -> tensor { // CHECK-LABEL: func @add_scalar_tensor func @add_scalar_tensor(%arg0: tensor<1xi32>, %arg1: tensor) -> tensor<1xi32> { - %0 = "xla.add"(%arg0, %arg1) : (tensor<1xi32>, tensor) -> tensor<1xi32> + %0 = "xla_hlo.add"(%arg0, %arg1) : (tensor<1xi32>, tensor) -> tensor<1xi32> return %0: tensor<1xi32> } @@ -36,7 +36,7 @@ func @add_scalar_tensor(%arg0: tensor<1xi32>, %arg1: tensor) -> tensor<1xi3 // CHECK-LABEL: func @batch_norm_inference func @batch_norm_inference(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor<8xf32>, %arg2: tensor<8xf32>, %arg3: tensor<8xf32>, %arg4: tensor<8xf32>) -> tensor<8x8x8x8xf32> { - %0 = "xla.batch_norm_inference"(%arg0, %arg1, %arg2, %arg3, %arg4) {epsilon = 1.000000e-03 : f32, feature_index = 3} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> tensor<8x8x8x8xf32> + %0 = "xla_hlo.batch_norm_inference"(%arg0, %arg1, %arg2, %arg3, %arg4) {epsilon = 1.000000e-03 : f32, feature_index = 3} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> tensor<8x8x8x8xf32> return %0 : tensor<8x8x8x8xf32> } @@ -44,7 +44,7 @@ func @batch_norm_inference(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor<8xf32>, %ar // CHECK-LABEL: func @broadcast func @broadcast(%arg0: tensor<3xi32>) -> tensor<1x2x3xi32> { - %0 = "xla.broadcast"(%arg0) {broadcast_sizes = dense<[1, 2]> : tensor<2xi64>} : (tensor<3xi32>) -> tensor<1x2x3xi32> + %0 = "xla_hlo.broadcast"(%arg0) {broadcast_sizes = dense<[1, 2]> : tensor<2xi64>} : (tensor<3xi32>) -> tensor<1x2x3xi32> return %0 : tensor<1x2x3xi32> } @@ 
-52,7 +52,7 @@ func @broadcast(%arg0: tensor<3xi32>) -> tensor<1x2x3xi32> { func @broadcast_nonint_sizes(%arg0: tensor<3xi32>) -> tensor<1x2x3xi32> { // expected-error@+1 {{broadcast_sizes must be a DenseIntElementsAttr}} - %0 = "xla.broadcast"(%arg0) {broadcast_sizes = dense<[1.0, 2.0]> : tensor<2xf64>} : (tensor<3xi32>) -> tensor<1x2x3xi32> + %0 = "xla_hlo.broadcast"(%arg0) {broadcast_sizes = dense<[1.0, 2.0]> : tensor<2xf64>} : (tensor<3xi32>) -> tensor<1x2x3xi32> return %0 : tensor<1x2x3xi32> } @@ -60,7 +60,7 @@ func @broadcast_nonint_sizes(%arg0: tensor<3xi32>) -> tensor<1x2x3xi32> { func @broadcast_splat_sizes(%arg0: tensor<3xi32>) -> tensor<1x2x3xi32> { // expected-error@+1 {{broadcast_sizes must be a DenseIntElementsAttr}} - %0 = "xla.broadcast"(%arg0) {broadcast_sizes = dense<2.0> : tensor<2xf64>} : (tensor<3xi32>) -> tensor<1x2x3xi32> + %0 = "xla_hlo.broadcast"(%arg0) {broadcast_sizes = dense<2.0> : tensor<2xf64>} : (tensor<3xi32>) -> tensor<1x2x3xi32> return %0 : tensor<1x2x3xi32> } @@ -68,7 +68,7 @@ func @broadcast_splat_sizes(%arg0: tensor<3xi32>) -> tensor<1x2x3xi32> { func @broadcast_sparse_sizes(%arg0: tensor<3xi32>) -> tensor<1x2x3xi32> { // expected-error@+1 {{broadcast_sizes must be a DenseIntElementsAttr}} - %0 = "xla.broadcast"(%arg0) {broadcast_sizes = sparse<[[0, 0], [1, 2]], [1, 5]> : tensor<3x4xi32>} : (tensor<3xi32>) -> tensor<1x2x3xi32> + %0 = "xla_hlo.broadcast"(%arg0) {broadcast_sizes = sparse<[[0, 0], [1, 2]], [1, 5]> : tensor<3x4xi32>} : (tensor<3xi32>) -> tensor<1x2x3xi32> return %0 : tensor<1x2x3xi32> } @@ -76,7 +76,7 @@ func @broadcast_sparse_sizes(%arg0: tensor<3xi32>) -> tensor<1x2x3xi32> { func @broadcast_bad_sizes_rank(%arg0: tensor<3xi32>) -> tensor<1x2x3xi32> { // expected-error@+1 {{broadcast_sizes has rank 2 instead of rank 1}} - %0 = "xla.broadcast"(%arg0) {broadcast_sizes = dense<[[1, 2]]> : tensor<1x2xi64>} : (tensor<3xi32>) -> tensor<1x2x3xi32> + %0 = "xla_hlo.broadcast"(%arg0) {broadcast_sizes = dense<[[1, 2]]> : tensor<1x2xi64>} : (tensor<3xi32>) -> tensor<1x2x3xi32> return %0 : tensor<1x2x3xi32> } @@ -84,7 +84,7 @@ func @broadcast_bad_sizes_rank(%arg0: tensor<3xi32>) -> tensor<1x2x3xi32> { func @broadcast_bad_result_rank(%arg0: tensor<3xi32>) -> tensor<1x2x3xi32> { // expected-error@+1 {{result rank (3) does not match operand rank (1) plus size of broadcast_sizes (3)}} - %0 = "xla.broadcast"(%arg0) {broadcast_sizes = dense<[2]> : tensor<1xi64>} : (tensor<3xi32>) -> tensor<1x2x3xi32> + %0 = "xla_hlo.broadcast"(%arg0) {broadcast_sizes = dense<[2]> : tensor<1xi64>} : (tensor<3xi32>) -> tensor<1x2x3xi32> return %0 : tensor<1x2x3xi32> } @@ -92,7 +92,7 @@ func @broadcast_bad_result_rank(%arg0: tensor<3xi32>) -> tensor<1x2x3xi32> { func @broadcast_bad_first_part_result_shape(%arg0: tensor<3xi32>) -> tensor<1x2x3xi32> { // expected-error@+1 {{result has shape [1, 3] instead of [2, 3]}} - %0 = "xla.broadcast"(%arg0) {broadcast_sizes = dense<[2]> : tensor<1xi64>} : (tensor<3xi32>) -> tensor<1x3xi32> + %0 = "xla_hlo.broadcast"(%arg0) {broadcast_sizes = dense<[2]> : tensor<1xi64>} : (tensor<3xi32>) -> tensor<1x3xi32> return %0 : tensor<1x3xi32> } @@ -100,7 +100,7 @@ func @broadcast_bad_first_part_result_shape(%arg0: tensor<3xi32>) -> tensor<1x2x func @broadcast_bad_second_part_result_shape(%arg0: tensor<3xi32>) -> tensor<1x2x3xi32> { // expected-error@+1 {{result has shape [2, 1] instead of [2, 3]}} - %0 = "xla.broadcast"(%arg0) {broadcast_sizes = dense<[2]> : tensor<1xi64>} : (tensor<3xi32>) -> tensor<2x1xi32> + %0 = "xla_hlo.broadcast"(%arg0) 
{broadcast_sizes = dense<[2]> : tensor<1xi64>} : (tensor<3xi32>) -> tensor<2x1xi32> return %0 : tensor<2x1xi32> } @@ -108,7 +108,7 @@ func @broadcast_bad_second_part_result_shape(%arg0: tensor<3xi32>) -> tensor<1x2 // CHECK-LABEL: func @broadcast_in_dim func @broadcast_in_dim(%arg0: tensor<1x2xi32>) -> tensor<1x2x2xi32> { - %0 = "xla.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[1, 2]> : tensor<2xi64>} : (tensor<1x2xi32>) -> tensor<1x2x2xi32> + %0 = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[1, 2]> : tensor<2xi64>} : (tensor<1x2xi32>) -> tensor<1x2x2xi32> return %0 : tensor<1x2x2xi32> } @@ -116,7 +116,7 @@ func @broadcast_in_dim(%arg0: tensor<1x2xi32>) -> tensor<1x2x2xi32> { // CHECK-LABEL: func @broadcast_in_dim_zero_rank func @broadcast_in_dim_zero_rank(%arg0: tensor) -> tensor<1x2x3xi32> { - %0 = "xla.broadcast_in_dim"(%arg0) : (tensor) -> tensor<1x2x3xi32> + %0 = "xla_hlo.broadcast_in_dim"(%arg0) : (tensor) -> tensor<1x2x3xi32> return %0 : tensor<1x2x3xi32> } @@ -124,7 +124,7 @@ func @broadcast_in_dim_zero_rank(%arg0: tensor) -> tensor<1x2x3xi32> { func @broadcast_in_dim_bad_nonint_dimensions(%arg0: tensor<1x2xi32>) -> tensor<1x2x3xi32> { // expected-error@+1 {{broadcast_sizes must be a DenseIntElementsAttr}} - %0 = "xla.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[1.0, 2.0]> : tensor<2xf64>} : (tensor<1x2xi32>) -> tensor<1x2x3xi32> + %0 = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[1.0, 2.0]> : tensor<2xf64>} : (tensor<1x2xi32>) -> tensor<1x2x3xi32> return %0 : tensor<1x2x3xi32> } @@ -132,7 +132,7 @@ func @broadcast_in_dim_bad_nonint_dimensions(%arg0: tensor<1x2xi32>) -> tensor<1 func @broadcast_in_dim_bad_splat_dimensions(%arg0: tensor<1x2xi32>) -> tensor<1x2x3xi32> { // expected-error@+1 {{broadcast_sizes must be a DenseIntElementsAttr}} - %0 = "xla.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<2.0> : tensor<2xf64>} : (tensor<1x2xi32>) -> tensor<1x2x3xi32> + %0 = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<2.0> : tensor<2xf64>} : (tensor<1x2xi32>) -> tensor<1x2x3xi32> return %0 : tensor<1x2x3xi32> } @@ -140,7 +140,7 @@ func @broadcast_in_dim_bad_splat_dimensions(%arg0: tensor<1x2xi32>) -> tensor<1x func @broadcast_in_dim_bad_sparse_dimensions(%arg0: tensor<1x2xi32>) -> tensor<1x2x3xi32> { // expected-error@+1 {{broadcast_sizes must be a DenseIntElementsAttr}} - %0 = "xla.broadcast_in_dim"(%arg0) {broadcast_dimensions = sparse<[[0, 0], [1, 2]], [1, 5]> : tensor<3x4xi32>} : (tensor<1x2xi32>) -> tensor<1x2x3xi32> + %0 = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = sparse<[[0, 0], [1, 2]], [1, 5]> : tensor<3x4xi32>} : (tensor<1x2xi32>) -> tensor<1x2x3xi32> return %0 : tensor<1x2x3xi32> } @@ -148,7 +148,7 @@ func @broadcast_in_dim_bad_sparse_dimensions(%arg0: tensor<1x2xi32>) -> tensor<1 func @broadcast_in_dim_bad_dimension_rank(%arg0: tensor<1x2xi32>) -> tensor<1x2x3xi32> { // expected-error@+1 {{broadcast_dimensions has rank 2 instead of rank 1}} - %0 = "xla.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[[1,1],[1,1]]> : tensor<2x2xi64>} : (tensor<1x2xi32>) -> tensor<1x2x3xi32> + %0 = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[[1,1],[1,1]]> : tensor<2x2xi64>} : (tensor<1x2xi32>) -> tensor<1x2x3xi32> return %0 : tensor<1x2x3xi32> } @@ -156,7 +156,7 @@ func @broadcast_in_dim_bad_dimension_rank(%arg0: tensor<1x2xi32>) -> tensor<1x2x func @broadcast_in_dim_bad_dimension_size(%arg0: tensor<1x2xi32>) -> tensor<1x2x3xi32> { // expected-error@+1 
{{broadcast_dimensions size (1) does not match operand rank (2)}} - %0 = "xla.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[1]> : tensor<1xi64>} : (tensor<1x2xi32>) -> tensor<1x2x3xi32> + %0 = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[1]> : tensor<1xi64>} : (tensor<1x2xi32>) -> tensor<1x2x3xi32> return %0 : tensor<1x2x3xi32> } @@ -164,7 +164,7 @@ func @broadcast_in_dim_bad_dimension_size(%arg0: tensor<1x2xi32>) -> tensor<1x2x func @broadcast_in_dim_bad_rank_decrease(%arg0: tensor<1x2x3xi32>) -> tensor<3xi32> { // expected-error@+1 {{result rank (1) is less than operand rank (3)}} - %0 = "xla.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0,1,2]> : tensor<3xi64>} : (tensor<1x2x3xi32>) -> tensor<3xi32> + %0 = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0,1,2]> : tensor<3xi64>} : (tensor<1x2x3xi32>) -> tensor<3xi32> return %0 : tensor<3xi32> } @@ -172,7 +172,7 @@ func @broadcast_in_dim_bad_rank_decrease(%arg0: tensor<1x2x3xi32>) -> tensor<3xi func @broadcast_in_dim_dimension_values_too_large(%arg0: tensor<1x2xi32>) -> tensor<1x2x3xi32> { // expected-error@+1 {{broadcast_dimensions contains invalid value 9 for result result with rank 3}} - %0 = "xla.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[9, 2]> : tensor<2xi64>} : (tensor<1x2xi32>) -> tensor<1x2x3xi32> + %0 = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[9, 2]> : tensor<2xi64>} : (tensor<1x2xi32>) -> tensor<1x2x3xi32> return %0 : tensor<1x2x3xi32> } @@ -180,7 +180,7 @@ func @broadcast_in_dim_dimension_values_too_large(%arg0: tensor<1x2xi32>) -> ten func @broadcast_in_dim_bad_shape_mismatch(%arg0: tensor<3xi32>) -> tensor<1x2x3xi32> { // expected-error@+1 {{size of operand dimension 0 (3) is not equal to 1 or size of result dimension 1 (2)}} - %0 = "xla.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[1]> : tensor<1xi64>} : (tensor<3xi32>) -> tensor<1x2x3xi32> + %0 = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[1]> : tensor<1xi64>} : (tensor<3xi32>) -> tensor<1x2x3xi32> return %0 : tensor<1x2x3xi32> } @@ -188,7 +188,7 @@ func @broadcast_in_dim_bad_shape_mismatch(%arg0: tensor<3xi32>) -> tensor<1x2x3x // CHECK-LABEL: func @comp_eq func @comp_eq(%arg0: tensor<3xi32>, %arg1: tensor<3xi32>) -> tensor<3xi1> { - %0 = "xla.compare"(%arg0, %arg1) {comparison_direction = "EQ"} : (tensor<3xi32>, tensor<3xi32>) -> tensor<3xi1> + %0 = "xla_hlo.compare"(%arg0, %arg1) {comparison_direction = "EQ"} : (tensor<3xi32>, tensor<3xi32>) -> tensor<3xi1> return %0 : tensor<3xi1> } @@ -196,7 +196,7 @@ func @comp_eq(%arg0: tensor<3xi32>, %arg1: tensor<3xi32>) -> tensor<3xi1> { func @comp_bad_direction(%arg0: tensor<3xi32>, %arg1: tensor<3xi32>) -> tensor<3xi1> { // expected-error@+1 {{'comparison_direction' failed to satisfy constraint}} - %0 = "xla.compare"(%arg0, %arg1) {comparison_direction = "FOOBAR"} : (tensor<3xi32>, tensor<3xi32>) -> tensor<3xi1> + %0 = "xla_hlo.compare"(%arg0, %arg1) {comparison_direction = "FOOBAR"} : (tensor<3xi32>, tensor<3xi32>) -> tensor<3xi1> return %0 : tensor<3xi1> } @@ -204,7 +204,7 @@ func @comp_bad_direction(%arg0: tensor<3xi32>, %arg1: tensor<3xi32>) -> tensor<3 func @comp_no_direction(%arg0: tensor<3xi32>, %arg1: tensor<3xi32>) -> tensor<3xi1> { // expected-error@+1 {{op requires attribute 'comparison_direction'}} - %0 = "xla.compare"(%arg0, %arg1) : (tensor<3xi32>, tensor<3xi32>) -> tensor<3xi1> + %0 = "xla_hlo.compare"(%arg0, %arg1) : (tensor<3xi32>, tensor<3xi32>) -> tensor<3xi1> return %0 : tensor<3xi1> } 
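The negative cases in ops.mlir follow the standard MLIR diagnostic-verification idiom: each chunk between // ----- markers is parsed and verified in isolation, and expected-error@+1 {{...}} asserts that the op one line below the comment emits a verifier error containing the quoted substring (the offset is relative, so @+2 would point two lines further down). A minimal self-contained sketch of the pattern; the RUN line and function name here are illustrative, since the actual RUN line of ops.mlir lies outside this excerpt:

// RUN: tf-opt %s -verify-diagnostics -split-input-file

// -----

func @compare_missing_direction(%arg0: tensor<3xi32>, %arg1: tensor<3xi32>) -> tensor<3xi1> {
  // expected-error@+1 {{op requires attribute 'comparison_direction'}}
  %0 = "xla_hlo.compare"(%arg0, %arg1) : (tensor<3xi32>, tensor<3xi32>) -> tensor<3xi1>
  return %0 : tensor<3xi1>
}

// -----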
@@ -212,7 +212,7 @@ func @comp_no_direction(%arg0: tensor<3xi32>, %arg1: tensor<3xi32>) -> tensor<3x // CHECK-LABEL: func @conv func @conv(%arg0: tensor<3xi32>, %arg1: tensor<3xi32>) -> tensor<3xi32> { - %0 = "xla.conv"(%arg0, %arg1) : (tensor<3xi32>, tensor<3xi32>) -> tensor<3xi32> + %0 = "xla_hlo.conv"(%arg0, %arg1) : (tensor<3xi32>, tensor<3xi32>) -> tensor<3xi32> return %0: tensor<3xi32> } @@ -220,7 +220,7 @@ func @conv(%arg0: tensor<3xi32>, %arg1: tensor<3xi32>) -> tensor<3xi32> { // CHECK-LABEL: func @copy func @copy(%arg0: tensor<1xi32>) -> tensor<1xi32> { - %0 = "xla.copy"(%arg0) : (tensor<1xi32>) -> tensor<1xi32> + %0 = "xla_hlo.copy"(%arg0) : (tensor<1xi32>) -> tensor<1xi32> return %0: tensor<1xi32> } @@ -228,7 +228,7 @@ func @copy(%arg0: tensor<1xi32>) -> tensor<1xi32> { // CHECK-LABEL: func @clamp func @clamp(%arg0: tensor<1xi32>) -> tensor<1xi32> { - %0 = "xla.clamp"(%arg0, %arg0, %arg0) : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi32> + %0 = "xla_hlo.clamp"(%arg0, %arg0, %arg0) : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi32> return %0: tensor<1xi32> } @@ -236,39 +236,39 @@ func @clamp(%arg0: tensor<1xi32>) -> tensor<1xi32> { // CHECK-LABEL: func @clamp_scalar func @clamp_scalar(%arg0: tensor<1xi32>, %arg1: tensor) -> tensor<1xi32> { - %0 = "xla.clamp"(%arg1, %arg0, %arg1) : (tensor, tensor<1xi32>, tensor) -> tensor<1xi32> + %0 = "xla_hlo.clamp"(%arg1, %arg0, %arg1) : (tensor, tensor<1xi32>, tensor) -> tensor<1xi32> + return %0: tensor<1xi32> +} + +// ----- + +func @clamp_invalid_clamp_element_type(%arg0: tensor<1xi32>, %arg1: tensor<1xf32>) -> tensor<1xi32> { + // expected-error@+1 {{'xla_hlo.clamp' op requires the same element type for all operands and results}} + %0 = "xla_hlo.clamp"(%arg1, %arg0, %arg0) : (tensor<1xf32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi32> + return %0: tensor<1xi32> +} + +// ----- + +func @clamp_invalid_clamp_shape(%arg0: tensor<1xi32>, %arg1: tensor<2xi32>) -> tensor<1xi32> { + // expected-error@+1 {{min shape [2] is not scalar and does not match operand shape [1]}} + %0 = "xla_hlo.clamp"(%arg1, %arg0, %arg0) : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi32> return %0: tensor<1xi32> } // ----- func @clamp_invalid_min_element_type(%arg0: tensor<1xi32>, %arg1: tensor<1xf32>) -> tensor<1xi32> { - // expected-error@+1 {{'xla.clamp' op requires the same element type for all operands and results}} - %0 = "xla.clamp"(%arg1, %arg0, %arg0) : (tensor<1xf32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi32> - return %0: tensor<1xi32> -} - -// ----- - -func @clamp_invalid_min_shape(%arg0: tensor<1xi32>, %arg1: tensor<2xi32>) -> tensor<1xi32> { - // expected-error@+1 {{min shape [2] is not scalar and does not match operand shape [1]}} - %0 = "xla.clamp"(%arg1, %arg0, %arg0) : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi32> + // expected-error@+1 {{'xla_hlo.min' op requires the same element type for all operands and results}} + %0 = "xla_hlo.min"(%arg0, %arg1) : (tensor<1xi32>, tensor<1xf32>) -> tensor<1xi32> return %0: tensor<1xi32> } // ----- func @clamp_invalid_max_element_type(%arg0: tensor<1xi32>, %arg1: tensor<1xf32>) -> tensor<1xi32> { - // expected-error@+1 {{'xla.clamp' op requires the same element type for all operands and results}} - %0 = "xla.clamp"(%arg0, %arg0, %arg1) : (tensor<1xi32>, tensor<1xi32>, tensor<1xf32>) -> tensor<1xi32> - return %0: tensor<1xi32> -} - -// ----- - -func @clamp_invalid_max_shape(%arg0: tensor<1xi32>, %arg1: tensor<2xi32>) -> tensor<1xi32> { - // 
expected-error@+1 {{max shape [2] is not scalar and does not match operand shape [1]}} - %0 = "xla.clamp"(%arg0, %arg0, %arg1) : (tensor<1xi32>, tensor<1xi32>, tensor<2xi32>) -> tensor<1xi32> + // expected-error@+1 {{'xla_hlo.max' op requires the same element type for all operands and results}} + %0 = "xla_hlo.max"(%arg0, %arg1) : (tensor<1xi32>, tensor<1xf32>) -> tensor<1xi32> return %0: tensor<1xi32> } @@ -276,7 +276,7 @@ func @clamp_invalid_max_shape(%arg0: tensor<1xi32>, %arg1: tensor<2xi32>) -> ten // CHECK-LABEL: func @dot_vector func @dot_vector(%arg0: tensor<1x2xi32>, %arg1: tensor<2x1xi32>) -> tensor { - %0 = "xla.dot"(%arg0, %arg1) : (tensor<1x2xi32>, tensor<2x1xi32>) -> tensor + %0 = "xla_hlo.dot"(%arg0, %arg1) : (tensor<1x2xi32>, tensor<2x1xi32>) -> tensor return %0: tensor } @@ -284,7 +284,7 @@ func @dot_vector(%arg0: tensor<1x2xi32>, %arg1: tensor<2x1xi32>) -> tensor // CHECK-LABEL: func @dot_matrix func @dot_matrix(%arg0: tensor<2x2xi32>, %arg1: tensor<2x2xi32>) -> tensor<2x2xi32> { - %0 = "xla.dot"(%arg0, %arg1) : (tensor<2x2xi32>, tensor<2x2xi32>) -> tensor<2x2xi32> + %0 = "xla_hlo.dot"(%arg0, %arg1) : (tensor<2x2xi32>, tensor<2x2xi32>) -> tensor<2x2xi32> return %0: tensor<2x2xi32> } @@ -292,7 +292,7 @@ func @dot_matrix(%arg0: tensor<2x2xi32>, %arg1: tensor<2x2xi32>) -> tensor<2x2xi // CHECK-LABEL: func @dot_precision_config func @dot_precision_config(%arg0: tensor<2x2xi32>, %arg1: tensor<2x2xi32>) -> tensor<2x2xi32> { - %0 = "xla.dot"(%arg0, %arg1) {precision_config = ["HIGH", "HIGHEST"]} : (tensor<2x2xi32>, tensor<2x2xi32>) -> tensor<2x2xi32> + %0 = "xla_hlo.dot"(%arg0, %arg1) {precision_config = ["HIGH", "HIGHEST"]} : (tensor<2x2xi32>, tensor<2x2xi32>) -> tensor<2x2xi32> return %0: tensor<2x2xi32> } @@ -300,7 +300,7 @@ func @dot_precision_config(%arg0: tensor<2x2xi32>, %arg1: tensor<2x2xi32>) -> te func @dot_bad_precision_config(%arg0: tensor<2x2xi32>, %arg1: tensor<2x2xi32>) -> tensor<2x2xi32> { // expected-error@+1 {{'precision_config' failed to satisfy constraint}} - %0 = "xla.dot"(%arg0, %arg1) {precision_config = ["FOO", "HIGHEST"]} : (tensor<2x2xi32>, tensor<2x2xi32>) -> tensor<2x2xi32> + %0 = "xla_hlo.dot"(%arg0, %arg1) {precision_config = ["FOO", "HIGHEST"]} : (tensor<2x2xi32>, tensor<2x2xi32>) -> tensor<2x2xi32> return %0: tensor<2x2xi32> } @@ -308,15 +308,47 @@ func @dot_bad_precision_config(%arg0: tensor<2x2xi32>, %arg1: tensor<2x2xi32>) - // CHECK-LABEL: func @tanh func @tanh(%arg0: tensor<1xf32>) -> tensor<1xf32> { - %0 = "xla.tanh"(%arg0) : (tensor<1xf32>) -> tensor<1xf32> + %0 = "xla_hlo.tanh"(%arg0) : (tensor<1xf32>) -> tensor<1xf32> return %0: tensor<1xf32> } // ----- +func @exp_invalid_result_type(%arg0: tensor<1xf32>) -> tensor<1xf32> { + // expected-error@+1 {{'xla_hlo.exp' op requires the same type for all operands and results}} + %0 = "xla_hlo.exp"(%arg0) : (tensor<1xf32>) -> tensor<1xi32> + return %0: tensor<1xi32> +} + +// ----- + +func @floor_invalid_result_type(%arg0: tensor<1xf32>) -> tensor<1xf32> { + // expected-error@+1 {{'xla_hlo.floor' op requires the same type for all operands and results}} + %0 = "xla_hlo.floor"(%arg0) : (tensor<1xf32>) -> tensor<1xi32> + return %0: tensor<1xi32> +} + +// ----- + +func @log_invalid_result_type(%arg0: tensor<1xf32>) -> tensor<1xf32> { + // expected-error@+1 {{'xla_hlo.log' op requires the same type for all operands and results}} + %0 = "xla_hlo.log"(%arg0) : (tensor<1xf32>) -> tensor<1xi32> + return %0: tensor<1xi32> +} + +// ----- + +func @rsqrt_invalid_result_type(%arg0: tensor<1xf32>) -> tensor<1xf32> 
{ + // expected-error@+1 {{'xla_hlo.rsqrt' op requires the same type for all operands and results}} + %0 = "xla_hlo.rsqrt"(%arg0) : (tensor<1xf32>) -> tensor<1xi32> + return %0: tensor<1xi32> +} + +// ----- + // CHECK-LABEL: func @reshape_same_shape func @reshape_same_shape(%arg0: tensor<1xi32>) -> tensor<1xi32> { - %0 = "xla.reshape"(%arg0) : (tensor<1xi32>) -> tensor<1xi32> + %0 = "xla_hlo.reshape"(%arg0) : (tensor<1xi32>) -> tensor<1xi32> return %0: tensor<1xi32> } @@ -324,7 +356,7 @@ func @reshape_same_shape(%arg0: tensor<1xi32>) -> tensor<1xi32> { // CHECK-LABEL: func @reshape_different_shape func @reshape_different_shape(%arg0: tensor<1x16xi32>) -> tensor<4x4xi32> { - %0 = "xla.reshape"(%arg0) : (tensor<1x16xi32>) -> tensor<4x4xi32> + %0 = "xla_hlo.reshape"(%arg0) : (tensor<1x16xi32>) -> tensor<4x4xi32> return %0: tensor<4x4xi32> } @@ -332,7 +364,7 @@ func @reshape_different_shape(%arg0: tensor<1x16xi32>) -> tensor<4x4xi32> { // CHECK-LABEL: func @reshape_from_scalar func @reshape_from_scalar(%arg0: tensor) -> tensor<1xi32> { - %0 = "xla.reshape"(%arg0) : (tensor) -> tensor<1xi32> + %0 = "xla_hlo.reshape"(%arg0) : (tensor) -> tensor<1xi32> return %0: tensor<1xi32> } @@ -340,7 +372,7 @@ func @reshape_from_scalar(%arg0: tensor) -> tensor<1xi32> { // CHECK-LABEL: func @reshape_to_scalar func @reshape_to_scalar(%arg0: tensor<1xi32>) -> tensor { - %0 = "xla.reshape"(%arg0) : (tensor<1xi32>) -> tensor + %0 = "xla_hlo.reshape"(%arg0) : (tensor<1xi32>) -> tensor return %0: tensor } @@ -348,7 +380,7 @@ func @reshape_to_scalar(%arg0: tensor<1xi32>) -> tensor { // CHECK-LABEL: func @select func @select(%arg0: tensor<2x3xi1>, %arg1: tensor<2x3xi32>, %arg2: tensor<2x3xi32>) -> tensor<2x3xi32> { - %0 = "xla.select"(%arg0, %arg1, %arg2) : (tensor<2x3xi1>, tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> + %0 = "xla_hlo.select"(%arg0, %arg1, %arg2) : (tensor<2x3xi1>, tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> return %0 : tensor<2x3xi32> } @@ -356,7 +388,7 @@ func @select(%arg0: tensor<2x3xi1>, %arg1: tensor<2x3xi32>, %arg2: tensor<2x3xi3 // CHECK-LABEL: func @select_scalar_pred func @select_scalar_pred(%arg0: tensor, %arg1: tensor<2x3xi32>, %arg2: tensor<2x3xi32>) -> tensor<2x3xi32> { - %0 = "xla.select"(%arg0, %arg1, %arg2) : (tensor, tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> + %0 = "xla_hlo.select"(%arg0, %arg1, %arg2) : (tensor, tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> return %0 : tensor<2x3xi32> } @@ -364,7 +396,7 @@ func @select_scalar_pred(%arg0: tensor, %arg1: tensor<2x3xi32>, %arg2: tenso func @select_bad_pred_type(%arg0: tensor<3xi32>, %arg1: tensor<2x3xi32>, %arg2: tensor<2x3xi32>) -> tensor<2x3xi32> { // expected-error@+1 {{must be statically shaped tensor of pred (AKA boolean or 1-bit integer)}} - %0 = "xla.select"(%arg0, %arg1, %arg2) : (tensor<3xi32>, tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> + %0 = "xla_hlo.select"(%arg0, %arg1, %arg2) : (tensor<3xi32>, tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> return %0 : tensor<2x3xi32> } @@ -372,7 +404,7 @@ func @select_bad_pred_type(%arg0: tensor<3xi32>, %arg1: tensor<2x3xi32>, %arg2: func @select_bad_shape_mismatch(%arg0: tensor<3xi1>, %arg1: tensor<2x4xi32>, %arg2: tensor<2x3xi32>) -> tensor<2x3xi32> { // expected-error@+1 {{on_true type (tensor<2x4xi32>) does not match on_false type (tensor<2x3xi32>)}} - %0 = "xla.select"(%arg0, %arg1, %arg2) : (tensor<3xi1>, tensor<2x4xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> + %0 = "xla_hlo.select"(%arg0, %arg1, %arg2) : (tensor<3xi1>, 
tensor<2x4xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> return %0 : tensor<2x3xi32> } @@ -380,7 +412,7 @@ func @select_bad_shape_mismatch(%arg0: tensor<3xi1>, %arg1: tensor<2x4xi32>, %ar func @select_bad_element_type_mismatch(%arg0: tensor<3xi1>, %arg1: tensor<2x3xf32>, %arg2: tensor<2x3xi32>) -> tensor<2x3xi32> { // expected-error@+1 {{on_true type (tensor<2x3xf32>) does not match on_false type (tensor<2x3xi32>)}} - %0 = "xla.select"(%arg0, %arg1, %arg2) : (tensor<3xi1>, tensor<2x3xf32>, tensor<2x3xi32>) -> tensor<2x3xi32> + %0 = "xla_hlo.select"(%arg0, %arg1, %arg2) : (tensor<3xi1>, tensor<2x3xf32>, tensor<2x3xi32>) -> tensor<2x3xi32> return %0 : tensor<2x3xi32> } @@ -388,15 +420,39 @@ func @select_bad_element_type_mismatch(%arg0: tensor<3xi1>, %arg1: tensor<2x3xf3 func @select_bad_pred_shape(%arg0: tensor<3xi1>, %arg1: tensor<2x3xi32>, %arg2: tensor<2x3xi32>) -> tensor<2x3xi32> { // expected-error@+1 {{red shape ([3]) is not scalar and does not match operand shapes ([2, 3])}} - %0 = "xla.select"(%arg0, %arg1, %arg2) : (tensor<3xi1>, tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> + %0 = "xla_hlo.select"(%arg0, %arg1, %arg2) : (tensor<3xi1>, tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> return %0 : tensor<2x3xi32> } // ----- +// CHECK-LABEL: func @slice +func @slice(%arg0: tensor<3x4xi32>) -> tensor<1x4xi32> { + %0 = "xla_hlo.slice"(%arg0) {start_indices = dense<[1, 0]> : tensor<2xi64>, limit_indices = dense<[2, 4]> : tensor<2xi64>} : (tensor<3x4xi32>) -> tensor<1x4xi32> + return %0 : tensor<1x4xi32> +} + +// ----- + +func @slice_indices_mismatch(%arg0: tensor<3x4xi32>) -> tensor<1x4xi32> { + // expected-error@+1 {{failed to verify that all of {start_indices, limit_indices} have same type}} + %0 = "xla_hlo.slice"(%arg0) {start_indices = dense<[1, 2, 3]> : tensor<3xi64>, limit_indices = dense<[2, 4]> : tensor<2xi64>} : (tensor<3x4xi32>) -> tensor<1x4xi32> + return %0 : tensor<1x4xi32> +} + +// ----- + +func @slice_operand_result_mismatch(%arg0: tensor<3x4xi32>) -> tensor<1x4xf32> { + // expected-error@+1 {{requires the same element type for all operands and results}} + %0 = "xla_hlo.slice"(%arg0) {start_indices = dense<[1, 0]> : tensor<2xi64>, limit_indices = dense<[2, 4]> : tensor<2xi64>} : (tensor<3x4xi32>) -> tensor<1x4xf32> + return %0 : tensor<1x4xf32> +} + +// ----- + // CHECK-LABEL: func @transpose -func @transpose(%arg0: tensor<1x2x3x4xi32>) -> tensor<2x1x4x3xi32> { - %0 = "xla.transpose"(%arg0) {permutation = dense<[1, 0, 3, 2]> : tensor<4xi64>} : (tensor<1x2x3x4xi32>) -> tensor<2x1x4x3xi32> +func @transpose(%arg0: tensor<1x2x3x4xi32>) -> tensor<2x1x4x3xi32> { + %0 = "xla_hlo.transpose"(%arg0) {permutation = dense<[1, 0, 3, 2]> : tensor<4xi64>} : (tensor<1x2x3x4xi32>) -> tensor<2x1x4x3xi32> return %0: tensor<2x1x4x3xi32> } @@ -404,7 +460,7 @@ func @transpose(%arg0: tensor<1x2x3x4xi32>) -> tensor<2x1x4x3xi32> { func @transpose_bad_permutations_float(%arg0: tensor<1x2x3x4xi32>) -> tensor<2x1x4x3xi32> { // expected-error@+1 {{permutation must be a DenseIntElementsAttr}} - %0 = "xla.transpose"(%arg0) {permutation = dense<[1.0, 0.0, 3.0, 2.0]> : tensor<4xf64>} : (tensor<1x2x3x4xi32>) -> tensor<2x1x4x3xi32> + %0 = "xla_hlo.transpose"(%arg0) {permutation = dense<[1.0, 0.0, 3.0, 2.0]> : tensor<4xf64>} : (tensor<1x2x3x4xi32>) -> tensor<2x1x4x3xi32> return %0: tensor<2x1x4x3xi32> } @@ -412,7 +468,7 @@ func @transpose_bad_permutations_float(%arg0: tensor<1x2x3x4xi32>) -> tensor<2x func @transpose_bad_permutations_splat(%arg0: tensor<1x2x3x4xi32>) -> tensor<2x1x4x3xi32> { // 
expected-error@+1 {{permutation must be a DenseIntElementsAttr}} - %0 = "xla.transpose"(%arg0) {permutation = dense<2.0> : tensor<2xf64>} : (tensor<1x2x3x4xi32>) -> tensor<2x1x4x3xi32> + %0 = "xla_hlo.transpose"(%arg0) {permutation = dense<2.0> : tensor<2xf64>} : (tensor<1x2x3x4xi32>) -> tensor<2x1x4x3xi32> return %0: tensor<2x1x4x3xi32> } @@ -420,7 +476,7 @@ func @transpose_bad_permutations_splat(%arg0: tensor<1x2x3x4xi32>) -> tensor<2x func @transpose_bad_permutations_sparse(%arg0: tensor<1x2x3x4xi32>) -> tensor<2x1x4x3xi32> { // expected-error@+1 {{permutation must be a DenseIntElementsAttr}} - %0 = "xla.transpose"(%arg0) {permutation = sparse<[[0, 0], [1, 2]], [1, 5]> : tensor<3x4xi32>} : (tensor<1x2x3x4xi32>) -> tensor<2x1x4x3xi32> + %0 = "xla_hlo.transpose"(%arg0) {permutation = sparse<[[0, 0], [1, 2]], [1, 5]> : tensor<3x4xi32>} : (tensor<1x2x3x4xi32>) -> tensor<2x1x4x3xi32> return %0: tensor<2x1x4x3xi32> } @@ -428,7 +484,7 @@ func @transpose_bad_permutations_sparse(%arg0: tensor<1x2x3x4xi32>) -> tensor<2 func @transpose_bad_permutations_rank(%arg0: tensor<1x2x3x4xi32>) -> tensor<2x1x4x3xi32> { // expected-error@+1 {{permutation has rank 2 instead of rank 1}} - %0 = "xla.transpose"(%arg0) {permutation = dense<[[1]]> : tensor<1x1xi64>} : (tensor<1x2x3x4xi32>) -> tensor<2x1x4x3xi32> + %0 = "xla_hlo.transpose"(%arg0) {permutation = dense<[[1]]> : tensor<1x1xi64>} : (tensor<1x2x3x4xi32>) -> tensor<2x1x4x3xi32> return %0: tensor<2x1x4x3xi32> } @@ -436,7 +492,7 @@ func @transpose_bad_permutations_rank(%arg0: tensor<1x2x3x4xi32>) -> tensor<2x1 func @transpose_bad_permutations_size(%arg0: tensor<1x2x3x4xi32>) -> tensor<2x1x4x3xi32> { // expected-error@+1 {{permutation size (1) does not match operand rank (4)}} - %0 = "xla.transpose"(%arg0) {permutation = dense<[1]> : tensor<1xi64>} : (tensor<1x2x3x4xi32>) -> tensor<2x1x4x3xi32> + %0 = "xla_hlo.transpose"(%arg0) {permutation = dense<[1]> : tensor<1xi64>} : (tensor<1x2x3x4xi32>) -> tensor<2x1x4x3xi32> return %0: tensor<2x1x4x3xi32> } @@ -444,7 +500,7 @@ func @transpose_bad_permutations_size(%arg0: tensor<1x2x3x4xi32>) -> tensor<2x1 func @transpose_operand_result_rank_mismatch(%arg0: tensor<1x2x3x4xi32>) -> tensor<2xi32> { // expected-error@+1 {{result rank (1) does not match operand rank (4)}} - %0 = "xla.transpose"(%arg0) {permutation = dense<[1, 0, 3, 2]> : tensor<4xi64>} : (tensor<1x2x3x4xi32>) -> tensor<2xi32> + %0 = "xla_hlo.transpose"(%arg0) {permutation = dense<[1, 0, 3, 2]> : tensor<4xi64>} : (tensor<1x2x3x4xi32>) -> tensor<2xi32> return %0: tensor<2xi32> } @@ -452,7 +508,7 @@ func @transpose_operand_result_rank_mismatch(%arg0: tensor<1x2x3x4xi32>) -> ten func @transpose_operand_result_permutation_mismatch(%arg0: tensor<1x2x3x4xi32>) -> tensor<1x2x3x4xi32> { // expected-error@+1 {{result shape is [1, 2, 3, 4] instead of [2, 1, 4, 3]}} - %0 = "xla.transpose"(%arg0) {permutation = dense<[1, 0, 3, 2]> : tensor<4xi64>} : (tensor<1x2x3x4xi32>) -> tensor<1x2x3x4xi32> + %0 = "xla_hlo.transpose"(%arg0) {permutation = dense<[1, 0, 3, 2]> : tensor<4xi64>} : (tensor<1x2x3x4xi32>) -> tensor<1x2x3x4xi32> return %0: tensor<1x2x3x4xi32> } @@ -460,6 +516,6 @@ func @transpose_operand_result_permutation_mismatch(%arg0: tensor<1x2x3x4xi32>) // CHECK-LABEL: func @tuple func @tuple(%arg0: tensor<1xi32>, %arg1: tensor<1x2xf32>) -> tuple, tensor<1x2xf32>> { - %0 = "xla.tuple"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xf32>) -> tuple, tensor<1x2xf32>> + %0 = "xla_hlo.tuple"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xf32>) -> tuple, tensor<1x2xf32>> return 
%0: tuple<tensor<1xi32>, tensor<1x2xf32>>
-}
\ No newline at end of file
+}
diff --git a/tensorflow/compiler/mlir/xla/tests/reshape.mlir b/tensorflow/compiler/mlir/xla/tests/reshape.mlir
index ee29a718abf..34cb3cb2729 100644
--- a/tensorflow/compiler/mlir/xla/tests/reshape.mlir
+++ b/tensorflow/compiler/mlir/xla/tests/reshape.mlir
@@ -1,80 +1,149 @@
-// RUN: tf-opt %s -split-input-file -xla-legalize-to-std | FileCheck %s
+// RUN: tf-opt %s -split-input-file -canonicalize | FileCheck %s
-// -----
-
-// CHECK-LABEL: func @reshape.const.1() -> tensor<f32> {
-func @reshape.const.1() -> tensor<f32> {
-  // CHECK-NEXT: %cst = constant dense<4.200000e+01> : tensor<f32>
-  %cst = constant {name = "constant.1"} dense<42.0> : tensor<1x1xf32>
-  %0 = "xla.reshape"(%cst) : (tensor<1x1xf32>) -> tensor<f32>
-  // CHECK-NEXT: return %cst : tensor<f32>
-  return %0 : tensor<f32>
+// CHECK-LABEL: func @const_fold_collapse_to_scalar
+func @const_fold_collapse_to_scalar() -> tensor<i32> {
+  // CHECK-NEXT: [[CST:%.+]] = constant dense<42> : tensor<i32>
+  %cst = constant dense<42> : tensor<1x1xi32>
+  %0 = "xla_hlo.reshape"(%cst) : (tensor<1x1xi32>) -> tensor<i32>
+  // CHECK-NEXT: return [[CST]]
+  return %0 : tensor<i32>
 }
 // -----
-// CHECK-LABEL: func @reshape.const.2() -> tensor<2xf32> {
-func @reshape.const.2() -> tensor<2xf32> {
-  // CHECK-NEXT: %cst = constant dense<4.200000e+01> : tensor<2xf32>
-  %cst = constant {name = "constant.1"} dense<42.0> : tensor<1x2xf32>
-  %0 = "xla.reshape"(%cst) : (tensor<1x2xf32>) -> tensor<2xf32>
-  // CHECK-NEXT: return %cst : tensor<2xf32>
-  return %0 : tensor<2xf32>
+// CHECK-LABEL: func @const_fold_collapse_to_tensor
+func @const_fold_collapse_to_tensor() -> tensor<2xi32> {
+  // CHECK-NEXT: [[CST:%.+]] = constant dense<42> : tensor<2xi32>
+  %cst = constant dense<42> : tensor<1x2xi32>
+  %0 = "xla_hlo.reshape"(%cst) : (tensor<1x2xi32>) -> tensor<2xi32>
+  // CHECK-NEXT: return [[CST]]
+  return %0 : tensor<2xi32>
 }
 // -----
-// CHECK-LABEL: func @reshape.const.3() -> tensor<1xf32> {
-func @reshape.const.3() -> tensor<1xf32> {
-  // CHECK-NEXT: %cst = constant dense<4.200000e+01> : tensor<1xf32>
-  %cst = constant {name = "constant.1"} dense<42.0> : tensor<f32>
-  %0 = "xla.reshape"(%cst) : (tensor<f32>) -> tensor<1xf32>
-  // CHECK-NEXT: return %cst : tensor<1xf32>
-  return %0 : tensor<1xf32>
+// CHECK-LABEL: func @const_fold_expand
+func @const_fold_expand() -> tensor<1xi32> {
+  // CHECK-NEXT: [[CST:%.+]] = constant dense<42> : tensor<1xi32>
+  %cst = constant dense<42> : tensor<i32>
+  %0 = "xla_hlo.reshape"(%cst) : (tensor<i32>) -> tensor<1xi32>
+  // CHECK-NEXT: return [[CST]]
+  return %0 : tensor<1xi32>
 }
 // -----
-// CHECK-LABEL: func @reshape.const.4() -> tensor<16xi64> {
-func @reshape.const.4() -> tensor<16xi64> {
-  // CHECK-NEXT: %cst = constant dense<42> : tensor<16xi64>
-  %cst = constant dense<42> : tensor<4x4xi64>
-  %0 = "xla.reshape"(%cst) : (tensor<4x4xi64>) -> tensor<16xi64>
-  // CHECK-NEXT: return %cst : tensor<16xi64>
+// CHECK-LABEL: func @const_fold_nontrivial
+func @const_fold_nontrivial() -> tensor<16xi64> {
+  // CHECK-NEXT: [[CST:%.+]] = constant dense<42> : tensor<16xi64>
+  %cst = constant dense<42> : tensor<4x4xi64>
+  %0 = "xla_hlo.reshape"(%cst) : (tensor<4x4xi64>) -> tensor<16xi64>
+  // CHECK-NEXT: return [[CST]]
   return %0 : tensor<16xi64>
 }
 // -----
-// CHECK-LABEL: func @reshape.const.5() -> tensor<16xf64> {
-func @reshape.const.5() -> tensor<16xf64> {
-  // CHECK-NEXT: %cst = constant dense<4.200000e+01> : tensor<16xf64>
-  %cst = constant dense<4.200000e+01> : tensor<4x4xf64>
-  %0 = "xla.reshape"(%cst) : (tensor<4x4xf64>) -> tensor<16xf64>
- 
// CHECK-NEXT: return %cst : tensor<16xf64> - return %0 : tensor<16xf64> +// CHECK-LABEL: func @const_fold_flatten +func @const_fold_flatten() -> tensor<16xi64> { + // CHECK-NEXT: [[CST:%.+]] = constant dense<42> : tensor<16xi64> + %cst = constant dense<42> : tensor<4x4xi64> + %0 = "xla_hlo.reshape"(%cst) : (tensor<4x4xi64>) -> tensor<16xi64> + // CHECK-NEXT: return [[CST]] + return %0 : tensor<16xi64> } - // ----- -// CHECK-LABEL: func @reshape.const.6() -> tensor<6xi32> { -func @reshape.const.6() -> tensor<6xi32> { - // CHECK-NEXT: %cst = constant dense<[1, 2, 3, 4, 5, 6]> : tensor<6xi32> - %cst = constant {name = "constant.1"} dense<[[1, 2], [3, 4], [5, 6]]> : tensor<3x2xi32> - %0 = "xla.reshape"(%cst) : (tensor<3x2xi32>) -> tensor<6xi32> - // CHECK-NEXT: return %cst : tensor<6xi32> +// CHECK-LABEL: func @const_fold_6 +func @const_fold_6() -> tensor<6xi32> { + // CHECK-NEXT: [[CST:%.+]] = constant dense<[1, 2, 3, 4, 5, 6]> : tensor<6xi32> + %cst = constant dense<[[1, 2], [3, 4], [5, 6]]> : tensor<3x2xi32> + %0 = "xla_hlo.reshape"(%cst) : (tensor<3x2xi32>) -> tensor<6xi32> + // CHECK-NEXT: return [[CST]] return %0 : tensor<6xi32> } +// ----- + +// CHECK-LABEL: func @const_fold_same_shape +func @const_fold_same_shape() -> tensor<2x3xi32> { + // CHECK-NEXT: [[CST:%.+]] = constant dense<[ + // CHECK-SAME: [1, 2, 3], [4, 5, 6] + // CHECK-SAME: ]> : tensor<2x3xi32> + %cst = constant dense<[1, 2, 3, 4, 5, 6]> : tensor<6xi32> + %0 = "xla_hlo.reshape"(%cst) : (tensor<6xi32>) -> tensor<2x3xi32> + // CHECK-NEXT: return [[CST]] + return %0 : tensor<2x3xi32> +} // ----- -// CHECK-LABEL: func @reshape.const.7() -> tensor<2x3xi32> { -func @reshape.const.7() -> tensor<2x3xi32> { - // CHECK-NEXT: %cst = constant dense<{{\[\[}}1, 2, 3], [4, 5, 6]]> : tensor<2x3xi32> - %cst = constant dense<[1, 2, 3, 4, 5, 6]> : tensor<6xi32> - %0 = "xla.reshape"(%cst) : (tensor<6xi32>) -> tensor<2x3xi32> - // CHECK-NEXT: return %cst : tensor<2x3xi32> +// CHECK-LABEL: func @const_fold_float +func @const_fold_float() -> tensor<16xf64> { + // CHECK-NEXT: [[CST:%.+]] = constant dense<4.2{{0*}}e+00> : tensor<16xf64> + %cst = constant dense<4.2> : tensor<4x4xf64> + %0 = "xla_hlo.reshape"(%cst) : (tensor<4x4xf64>) -> tensor<16xf64> + // CHECK-NEXT: return [[CST]] + return %0 : tensor<16xf64> +} + +// ----- + +// CHECK-LABEL: func @non_const_same_shape +// CHECK-SAME: [[ARG:%[a-zA-Z0-9]+]] +func @non_const_same_shape(%arg : tensor<2x3xi32>) -> tensor<2x3xi32> { + // CHECK-NEXT: return [[ARG]] + %0 = "xla_hlo.reshape"(%arg) : (tensor<2x3xi32>) -> tensor<2x3xi32> return %0 : tensor<2x3xi32> -} \ No newline at end of file +} + +// ----- + +// CHECK-LABEL: func @non_const_chained_reshape +// CHECK-SAME: [[ARG:%[a-zA-Z0-9]+]] +func @non_const_chained_reshape(%arg : tensor<2x3xi32>) -> (tensor<3x2xi32>, tensor<6xi32>) { + // CHECK-NEXT: "xla_hlo.reshape"([[ARG]]) : (tensor<2x3xi32>) -> tensor<3x2xi32> + // CHECK-NEXT: "xla_hlo.reshape"([[ARG]]) : (tensor<2x3xi32>) -> tensor<6xi32> + %0 = "xla_hlo.reshape"(%arg) : (tensor<2x3xi32>) -> tensor<3x2xi32> + %1 = "xla_hlo.reshape"(%0) : (tensor<3x2xi32>) -> tensor<6xi32> + return %0, %1 : tensor<3x2xi32>, tensor<6xi32> // return both so nothing is removed +} + +// ----- + +// CHECK-LABEL: func @non_const_chained_reshape_unused_parent +// CHECK-SAME: [[ARG:%[a-zA-Z0-9]+]] +func @non_const_chained_reshape_unused_parent(%arg : tensor<2x3xi32>) -> tensor<6xi32> { + // CHECK-NEXT: [[RES:%.+]] = "xla_hlo.reshape"([[ARG]]) : (tensor<2x3xi32>) -> tensor<6xi32> + %0 = "xla_hlo.reshape"(%arg) : 
(tensor<2x3xi32>) -> tensor<3x2xi32> + %1 = "xla_hlo.reshape"(%0) : (tensor<3x2xi32>) -> tensor<6xi32> + // CHECK-NEXT: return [[RES]] + return %1 : tensor<6xi32> +} + +// ----- + +// CHECK-LABEL: func @non_const_chained_reshape_becomes_noop +// CHECK-SAME: [[ARG:%[a-zA-Z0-9]+]] +func @non_const_chained_reshape_becomes_noop(%arg : tensor<2x3xi32>) -> tensor<2x3xi32> { + %0 = "xla_hlo.reshape"(%arg) : (tensor<2x3xi32>) -> tensor<3x2xi32> + %1 = "xla_hlo.reshape"(%0) : (tensor<3x2xi32>) -> tensor<2x3xi32> + // CHECK-NEXT: return [[ARG]] + return %1 : tensor<2x3xi32> +} + +// ----- + +// CHECK-LABEL: func @non_const_many_chained_reshapes +// CHECK-SAME: [[ARG:%[a-zA-Z0-9]+]] +func @non_const_many_chained_reshapes(%arg : tensor<2x3x4xi32>) -> tensor<1x2x4x3xi32> { + // CHECK-NEXT: [[RES:%.+]] = "xla_hlo.reshape"([[ARG]]) : (tensor<2x3x4xi32>) -> tensor<1x2x4x3xi32> + %0 = "xla_hlo.reshape"(%arg) : (tensor<2x3x4xi32>) -> tensor<4x3x2xi32> + %1 = "xla_hlo.reshape"(%0) : (tensor<4x3x2xi32>) -> tensor<12x2xi32> + %2 = "xla_hlo.reshape"(%1) : (tensor<12x2xi32>) -> tensor<2x12xi32> + %3 = "xla_hlo.reshape"(%2) : (tensor<2x12xi32>) -> tensor<24xi32> + %4 = "xla_hlo.reshape"(%3) : (tensor<24xi32>) -> tensor<1x2x4x3xi32> + // CHECK-NEXT: return [[RES]] + return %4 : tensor<1x2x4x3xi32> +} diff --git a/tensorflow/compiler/mlir/xla/tests/translate/add.hlotxt b/tensorflow/compiler/mlir/xla/tests/translate/add.hlotxt index d285df18bc9..96423e0d12b 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/add.hlotxt +++ b/tensorflow/compiler/mlir/xla/tests/translate/add.hlotxt @@ -13,15 +13,15 @@ ENTRY %foo.5 (Arg_0.1: f32[4], Arg_1.2: f32[4], Arg_2.3: f32[], Arg_3.4: f32[]) %Arg_3.4 = f32[] parameter(3) // Add two tensors - // CHECK-NEXT: %0 = "xla.add"(%arg0, %arg1) {name = "add.3"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + // CHECK-NEXT: %0 = "xla_hlo.add"(%arg0, %arg1) {name = "add.3"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> %add.3 = f32[4] add(f32[4] %Arg_0.1, f32[4] %Arg_1.2) // Add two scalars - // CHECK-NEXT: %1 = "xla.add"(%arg2, %arg3) {name = "add.4"} : (tensor, tensor) -> tensor + // CHECK-NEXT: %1 = "xla_hlo.add"(%arg2, %arg3) {name = "add.4"} : (tensor, tensor) -> tensor %add.4 = f32[] add(f32[] %Arg_2.3, f32[] %Arg_3.4) // Add a tensor and scalar - // CHECK-NEXT: %2 = "xla.add"(%0, %1) {name = "add.5"} : (tensor<4xf32>, tensor) -> tensor<4xf32> + // CHECK-NEXT: %2 = "xla_hlo.add"(%0, %1) {name = "add.5"} : (tensor<4xf32>, tensor) -> tensor<4xf32> // CHECK-NEXT: return %2 : tensor<4xf32> ROOT %add.5 = f32[4] add(f32[4] %add.3, f32[] %add.4) } diff --git a/tensorflow/compiler/mlir/xla/tests/translate/add.mlir b/tensorflow/compiler/mlir/xla/tests/translate/add.mlir index 4009759f3b8..a77b90ca083 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/add.mlir +++ b/tensorflow/compiler/mlir/xla/tests/translate/add.mlir @@ -6,9 +6,9 @@ func @main(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { // CHECK-NEXT: %Arg_1.2 = f32[4] parameter(1) // CHECK-NEXT: %add.3 = f32[4] add(f32[4] %Arg_0.1, f32[4] %Arg_1.2) - %0 = "xla.add"(%arg0, %arg1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + %0 = "xla_hlo.add"(%arg0, %arg1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> // CHECK-NEXT: ROOT %add.4 = f32[4] add(f32[4] %add.3, f32[4] %Arg_1.2) - %1 = "xla.add"(%0, %arg1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + %1 = "xla_hlo.add"(%0, %arg1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> return %1 : tensor<4xf32> } diff --git 
a/tensorflow/compiler/mlir/xla/tests/translate/and.hlotxt b/tensorflow/compiler/mlir/xla/tests/translate/and.hlotxt index 1826809db63..25cf3ecd16a 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/and.hlotxt +++ b/tensorflow/compiler/mlir/xla/tests/translate/and.hlotxt @@ -7,7 +7,7 @@ ENTRY %foo.5 (Arg_0.1: f32[4], Arg_1.2: f32[4]) -> f32[4] { %Arg_0.1 = f32[4] parameter(0) %Arg_1.2 = f32[4] parameter(1) - // CHECK-NEXT: %0 = "xla.and"(%arg0, %arg1) {name = "and.3"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + // CHECK-NEXT: %0 = "xla_hlo.and"(%arg0, %arg1) {name = "and.3"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> // CHECK-NEXT: return %0 : tensor<4xf32> ROOT %and.3 = f32[4] and(f32[4] %Arg_0.1, f32[4] %Arg_1.2) } diff --git a/tensorflow/compiler/mlir/xla/tests/translate/binary_op_broadcast.mlir b/tensorflow/compiler/mlir/xla/tests/translate/binary_op_broadcast.mlir index 9aff6393e86..38aa4f04bad 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/binary_op_broadcast.mlir +++ b/tensorflow/compiler/mlir/xla/tests/translate/binary_op_broadcast.mlir @@ -8,19 +8,19 @@ func @main(%arg0: tensor<1x4xi32>, %arg1: tensor<2x4xi32>, %arg2: tensor<2x3x4xi // CHECK-NEXT: %broadcast.5 = s32[2,4] broadcast(s32[4] %reshape.4) // CHECK-NEXT: %Arg_1.2 = s32[2,4] parameter(1) // CHECK-NEXT: %add.6 = s32[2,4] add(s32[2,4] %broadcast.5, s32[2,4] %Arg_1.2) - %0 = "xla.add"(%arg0, %arg1) : (tensor<1x4xi32>, tensor<2x4xi32>) -> tensor<2x4xi32> + %0 = "xla_hlo.add"(%arg0, %arg1) : (tensor<1x4xi32>, tensor<2x4xi32>) -> tensor<2x4xi32> // Broadcast up rank // CHECK-NEXT: %broadcast.7 = s32[2,3,4] broadcast(s32[2,4] %Arg_1.2), dimensions={0,2} // CHECK-NEXT: %Arg_2.3 = s32[2,3,4] parameter(2) // CHECK-NEXT: %add.8 = s32[2,3,4] add(s32[2,3,4] %broadcast.7, s32[2,3,4] %Arg_2.3) - %1 = "xla.add"(%arg1, %arg2) {broadcast_dimensions = dense<[0,2]> : tensor<2xi64>} : (tensor<2x4xi32>, tensor<2x3x4xi32>) -> tensor<2x3x4xi32> + %1 = "xla_hlo.add"(%arg1, %arg2) {broadcast_dimensions = dense<[0,2]> : tensor<2xi64>} : (tensor<2x4xi32>, tensor<2x3x4xi32>) -> tensor<2x3x4xi32> // Broadcast up rank + degenerate broadcast // CHECK-NEXT: %broadcast.9 = s32[2,1,4] broadcast(s32[1,4] %Arg_0.1), dimensions={1,2} // CHECK-NEXT: %reshape.10 = s32[2,4] reshape(s32[2,1,4] %broadcast.9) // CHECK-NEXT: %broadcast.11 = s32[2,3,4] broadcast(s32[2,4] %reshape.10), dimensions={0,2} // CHECK-NEXT: ROOT %add.12 = s32[2,3,4] add(s32[2,3,4] %broadcast.11, s32[2,3,4] %Arg_2.3) - %2 = "xla.add"(%arg0, %arg2) {broadcast_dimensions = dense<[1,2]> : tensor<2xi64>} : (tensor<1x4xi32>, tensor<2x3x4xi32>) -> tensor<2x3x4xi32> + %2 = "xla_hlo.add"(%arg0, %arg2) {broadcast_dimensions = dense<[1,2]> : tensor<2xi64>} : (tensor<1x4xi32>, tensor<2x3x4xi32>) -> tensor<2x3x4xi32> return %2 : tensor<2x3x4xi32> } diff --git a/tensorflow/compiler/mlir/xla/tests/translate/broadcast.mlir b/tensorflow/compiler/mlir/xla/tests/translate/broadcast.mlir index 1d231535703..0b64ab23d54 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/broadcast.mlir +++ b/tensorflow/compiler/mlir/xla/tests/translate/broadcast.mlir @@ -4,6 +4,6 @@ func @main(%arg0: tensor<4xi32>) -> tensor<1x2x3x4xi32> { // CHECK-NEXT: %Arg_0.1 = s32[4] parameter(0) // CHECK-NEXT: ROOT %broadcast.2 = s32[1,2,3,4] broadcast(s32[4] %Arg_0.1), dimensions={3} - %0 = "xla.broadcast"(%arg0) {broadcast_sizes = dense<[1,2,3]> : tensor<3xi64>} : (tensor<4xi32>) -> tensor<1x2x3x4xi32> + %0 = "xla_hlo.broadcast"(%arg0) {broadcast_sizes = dense<[1,2,3]> : tensor<3xi64>} : 
(tensor<4xi32>) -> tensor<1x2x3x4xi32>
   return %0 : tensor<1x2x3x4xi32>
 }
diff --git a/tensorflow/compiler/mlir/xla/tests/translate/broadcast_in_dim.hlotxt b/tensorflow/compiler/mlir/xla/tests/translate/broadcast_in_dim.hlotxt
index d9c2e9fe094..3d520fc1bc2 100644
--- a/tensorflow/compiler/mlir/xla/tests/translate/broadcast_in_dim.hlotxt
+++ b/tensorflow/compiler/mlir/xla/tests/translate/broadcast_in_dim.hlotxt
@@ -6,14 +6,14 @@ HloModule main
 ENTRY %main {
   %Arg_0.1 = f32[1, 2] parameter(0)
-  // CHECK-NEXT: %0 = "xla.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>, name = "broadcast.2"} : (tensor<1x2xf32>) -> tensor<1x2x3xf32>
+  // CHECK-NEXT: %0 = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>, name = "broadcast.2"} : (tensor<1x2xf32>) -> tensor<1x2x3xf32>
   %broadcast.2 = f32[1,2,3] broadcast(%Arg_0.1), dimensions={0,1}
   // Degenerate broadcast
-  // CHECK-NEXT: %1 = "xla.broadcast_in_dim"(%arg0) {name = "broadcast.3"} : (tensor<1x2xf32>) -> tensor<3x2xf32>
+  // CHECK-NEXT: %1 = "xla_hlo.broadcast_in_dim"(%arg0) {name = "broadcast.3"} : (tensor<1x2xf32>) -> tensor<3x2xf32>
   broadcast.3 = f32[3,2] broadcast(%Arg_0.1), dimensions={}
-  // CHECK-NEXT: %2 = "xla.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[1, 2]> : tensor<2xi64>, name = "broadcast.4"} : (tensor<1x2xf32>) -> tensor<3x1x2xf32>
+  // CHECK-NEXT: %2 = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[1, 2]> : tensor<2xi64>, name = "broadcast.4"} : (tensor<1x2xf32>) -> tensor<3x1x2xf32>
   // CHECK-NEXT: return %2 : tensor<3x1x2xf32>
   ROOT broadcast.4 = f32[3,1,2] broadcast(%Arg_0.1), dimensions={1, 2}
 }
diff --git a/tensorflow/compiler/mlir/xla/tests/translate/call.hlotxt b/tensorflow/compiler/mlir/xla/tests/translate/call.hlotxt
index c7ea0f9637e..350c372796d 100644
--- a/tensorflow/compiler/mlir/xla/tests/translate/call.hlotxt
+++ b/tensorflow/compiler/mlir/xla/tests/translate/call.hlotxt
@@ -4,16 +4,16 @@ HloModule foo
 // CHECK-LABEL: func @call(%arg0: tensor<i64>) -> tensor<i64> {
 %call (arg_1: s64[]) -> s64[] {
-  %arg_1 = s64[] parameter(0), metadata={op_name="XLA_Args"}
-  // CHECK-NEXT: %0 = "xla.add"(%arg0, %arg0) {name = "compare.2"} : (tensor<i64>, tensor<i64>) -> tensor<i64>
+  %arg_1 = s64[] parameter(0), metadata={op_name="HLO_Args"}
+  // CHECK-NEXT: %0 = "xla_hlo.add"(%arg0, %arg0) {name = "compare.2"} : (tensor<i64>, tensor<i64>) -> tensor<i64>
   // CHECK-NEXT: return %0 : tensor<i64>
   ROOT %compare.2 = s64[] add(%arg_1, %arg_1), metadata={op_type="Less" op_name="Less"}
 }
 // CHECK-LABEL: func @main(%arg0: tensor<i64>) -> tensor<i64> {
 ENTRY %foo (arg0.1: s64[]) -> s64[] {
-  %arg0.1 = s64[] parameter(0), metadata={op_name="XLA_Args"}
+  %arg0.1 = s64[] parameter(0), metadata={op_name="HLO_Args"}
   // CHECK-NEXT: %0 = call @call(%arg0) : (tensor<i64>) -> tensor<i64>
   // CHECK-NEXT: return %0 : tensor<i64>
   ROOT %call.2 = s64[] call(%arg0.1), to_apply=%call
-}
+}
\ No newline at end of file
diff --git a/tensorflow/compiler/mlir/xla/tests/translate/clamp.hlotxt b/tensorflow/compiler/mlir/xla/tests/translate/clamp.hlotxt
new file mode 100644
index 00000000000..ea0ca3c1031
--- /dev/null
+++ b/tensorflow/compiler/mlir/xla/tests/translate/clamp.hlotxt
@@ -0,0 +1,16 @@
+// RUN: tf-mlir-translate -hlo-text-to-mlir-hlo %s -o - | FileCheck %s
+
+HloModule main.5
+
+// CHECK-LABEL: func @main(
+// CHECK-SAME: [[A0:%.+]]: tensor<f32>, [[A1:%.+]]: tensor<4xf32>, [[A2:%.+]]: tensor<f32>) -> tensor<4xf32> {
+ENTRY %foo.5 (Arg_0.1: f32[], Arg_1.2: f32[4], Arg_1.3: f32[]) -> f32[4] {
+  %Arg_0.1 = f32[] parameter(0)
+  %Arg_1.2 = f32[4] 
parameter(1) + %Arg_2.3 = f32[] parameter(2) + + // CHECK-NEXT: [[R0:%.+]] = "xla_hlo.clamp"([[A0]], [[A1]], [[A2]]) {name = "clamp.3"} : (tensor, tensor<4xf32>, tensor) -> tensor<4xf32> + // CHECK-NEXT: return [[R0]] : tensor<4xf32> + ROOT %clamp.3 = f32[4] clamp(f32[] %Arg_0.1, f32[4] %Arg_1.2, f32[] %Arg_2.3) +} + diff --git a/tensorflow/compiler/mlir/xla/tests/translate/comp.hlotxt b/tensorflow/compiler/mlir/xla/tests/translate/comp.hlotxt index ed3019b81cb..637629d9744 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/comp.hlotxt +++ b/tensorflow/compiler/mlir/xla/tests/translate/comp.hlotxt @@ -8,14 +8,14 @@ ENTRY %main (Arg_0.1: f32[3], Arg_1.2: f32[3], Arg_2.3: f32[1]) -> pred[3] { %Arg_1.2 = f32[3] parameter(1) %Arg_2.3 = f32[1] parameter(2) - // CHECK-NEXT: %0 = "xla.compare"(%arg0, %arg1) {comparison_direction = "EQ", name = "compare.4"} : (tensor<3xf32>, tensor<3xf32>) -> tensor<3xi1> + // CHECK-NEXT: %0 = "xla_hlo.compare"(%arg0, %arg1) {comparison_direction = "EQ", name = "compare.4"} : (tensor<3xf32>, tensor<3xf32>) -> tensor<3xi1> %compare.4 = pred[3] compare(Arg_0.1, Arg_1.2), direction=EQ - // CHECK-NEXT: %1 = "xla.compare"(%arg0, %arg1) {comparison_direction = "LE", name = "compare.5"} : (tensor<3xf32>, tensor<3xf32>) -> tensor<3xi1> + // CHECK-NEXT: %1 = "xla_hlo.compare"(%arg0, %arg1) {comparison_direction = "LE", name = "compare.5"} : (tensor<3xf32>, tensor<3xf32>) -> tensor<3xi1> %compare.5 = pred[3] compare(Arg_0.1, Arg_1.2), direction=LE // Requires broadcast of compatible tensors. - // CHECK-NEXT: %2 = "xla.compare"(%arg0, %arg2) {comparison_direction = "GT", name = "compare.6"} : (tensor<3xf32>, tensor<1xf32>) -> tensor<3xi1> + // CHECK-NEXT: %2 = "xla_hlo.compare"(%arg0, %arg2) {comparison_direction = "GT", name = "compare.6"} : (tensor<3xf32>, tensor<1xf32>) -> tensor<3xi1> // CHECK-NEXT: return %2 : tensor<3xi1> ROOT %compare.6 = pred[3] compare(Arg_0.1, Arg_2.3), direction=GT } diff --git a/tensorflow/compiler/mlir/xla/tests/translate/concat.hlotxt b/tensorflow/compiler/mlir/xla/tests/translate/concat.hlotxt index e73447d768d..b23c22b73c0 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/concat.hlotxt +++ b/tensorflow/compiler/mlir/xla/tests/translate/concat.hlotxt @@ -7,7 +7,7 @@ ENTRY %foo.5 (Arg_0.1: f32[4, 1], Arg_1.2: f32[4, 2]) -> f32[4, 3] { %Arg_0.1 = f32[4, 1] parameter(0) %Arg_1.2 = f32[4, 2] parameter(1) - // CHECK-NEXT: %0 = "xla.concatenate"(%arg0, %arg1) {dimension = 1 : i64} : (tensor<4x1xf32>, tensor<4x2xf32>) -> tensor<4x3xf32> + // CHECK-NEXT: %0 = "xla_hlo.concatenate"(%arg0, %arg1) {dimension = 1 : i64} : (tensor<4x1xf32>, tensor<4x2xf32>) -> tensor<4x3xf32> // CHECK-NEXT: return %0 : tensor<4x3xf32> ROOT %concatenate.3 = f32[4, 3] concatenate(f32[4, 1] %Arg_0.1, f32[4, 2] %Arg_1.2), dimensions={1} } diff --git a/tensorflow/compiler/mlir/xla/tests/translate/conv.hlotxt b/tensorflow/compiler/mlir/xla/tests/translate/conv.hlotxt index 0de3ac6bffe..35fe1363b2e 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/conv.hlotxt +++ b/tensorflow/compiler/mlir/xla/tests/translate/conv.hlotxt @@ -6,12 +6,12 @@ HloModule tfcompile.7 // implementations with attributes, etc. 
// CHECK-LABEL: func @main(%arg0: tensor<1x16x16x1xf32>) -> tuple> { ENTRY %tfcompile.7 { - %arg0.1 = f32[1,16,16,1]{3,2,1,0} parameter(0), metadata={op_name="XLA_Args"} + %arg0.1 = f32[1,16,16,1]{3,2,1,0} parameter(0), metadata={op_name="HLO_Args"} - // CHECK-NEXT: %0 = "xla.copy"(%arg0) {name = "copy.1"} : (tensor<1x16x16x1xf32>) -> tensor<1x16x16x1xf32> - %copy.1 = f32[1,16,16,1]{2,1,3,0} copy(%arg0.1), metadata={op_name="XLA_Args"} + // CHECK-NEXT: %0 = "xla_hlo.copy"(%arg0) {name = "copy.1"} : (tensor<1x16x16x1xf32>) -> tensor<1x16x16x1xf32> + %copy.1 = f32[1,16,16,1]{2,1,3,0} copy(%arg0.1), metadata={op_name="HLO_Args"} - // CHECK-NEXT: %1 = "xla.reshape"(%0) {name = "reshape.2"} : (tensor<1x16x16x1xf32>) -> tensor<1x16x16x1xf32> + // CHECK-NEXT: %1 = "xla_hlo.reshape"(%0) {name = "reshape.2"} : (tensor<1x16x16x1xf32>) -> tensor<1x16x16x1xf32> %reshape.2 = f32[1,16,16,1]{2,1,3,0} reshape(%copy.1) // Note that double brackets "[[" have to be escaped as they denote variables @@ -19,13 +19,13 @@ ENTRY %tfcompile.7 { // CHECK-NEXT: %cst = constant {name = "constant.3"} dense<{{\[\[\[\[}}5.000000e-01]], {{\[\[}}-6.000000e-01]]], {{\[\[\[}}3.000000e-01]], {{\[\[}}-1.000000e-01]]]]> : tensor<2x2x1x1xf32> %constant.3 = f32[2,2,1,1]{3,2,1,0} constant({{{{0.5}}, {{-0.6}}}, {{{0.3}}, {{-0.1}}}}), metadata={op_type="Conv2D" op_name="embedded_inference/conv_model/conv_0/Conv2D"} - // CHECK-NEXT: %2 = "xla.conv"(%1, %cst) {name = "convolution.4"} : (tensor<1x16x16x1xf32>, tensor<2x2x1x1xf32>) -> tensor<1x16x16x1xf32> + // CHECK-NEXT: %2 = "xla_hlo.conv"(%1, %cst) {name = "convolution.4"} : (tensor<1x16x16x1xf32>, tensor<2x2x1x1xf32>) -> tensor<1x16x16x1xf32> %convolution.4 = f32[1,16,16,1]{2,1,3,0} convolution(%reshape.2, %constant.3), window={size=2x2 pad=0_1x0_1}, dim_labels=b01f_01io->b01f, metadata={op_type="Conv2D" op_name="embedded_inference/conv_model/conv_0/Conv2D"} - // CHECK-NEXT: %3 = "xla.reshape"(%2) {name = "reshape.5"} : (tensor<1x16x16x1xf32>) -> tensor<1x16x16x1xf32> - %reshape.5 = f32[1,16,16,1]{3,2,1,0} reshape(%convolution.4), metadata={op_name="XLA_Retvals"} + // CHECK-NEXT: %3 = "xla_hlo.reshape"(%2) {name = "reshape.5"} : (tensor<1x16x16x1xf32>) -> tensor<1x16x16x1xf32> + %reshape.5 = f32[1,16,16,1]{3,2,1,0} reshape(%convolution.4), metadata={op_name="HLO_Retvals"} - // CHECK-NEXT: %4 = "xla.tuple"(%3) {name = "tuple.6"} : (tensor<1x16x16x1xf32>) -> tuple> + // CHECK-NEXT: %4 = "xla_hlo.tuple"(%3) {name = "tuple.6"} : (tensor<1x16x16x1xf32>) -> tuple> // CHECK-NEXT: return %4 : tuple> - ROOT %tuple.6 = (f32[1,16,16,1]{3,2,1,0}) tuple(%reshape.5), metadata={op_name="XLA_Retvals"} -} + ROOT %tuple.6 = (f32[1,16,16,1]{3,2,1,0}) tuple(%reshape.5), metadata={op_name="HLO_Retvals"} +} \ No newline at end of file diff --git a/tensorflow/compiler/mlir/xla/tests/translate/convert.hlotxt b/tensorflow/compiler/mlir/xla/tests/translate/convert.hlotxt index 3c0c7a9c1d1..f22646fc23e 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/convert.hlotxt +++ b/tensorflow/compiler/mlir/xla/tests/translate/convert.hlotxt @@ -7,13 +7,13 @@ ENTRY %foo.5 (Arg_0.1: f32[4], Arg_1.2: f32[]) -> f64[4] { %Arg_0.1 = f32[4] parameter(0) %Arg_1.2 = f32[] parameter(1) - // CHECK-NEXT: %0 = "xla.convert"(%arg0) {name = "convert.3"} : (tensor<4xf32>) -> tensor<4xf64> + // CHECK-NEXT: %0 = "xla_hlo.convert"(%arg0) {name = "convert.3"} : (tensor<4xf32>) -> tensor<4xf64> %convert.3 = f64[4] convert(f32[4] %Arg_0.1) - // CHECK-NEXT: %1 = "xla.convert"(%arg1) {name = "convert.4"} : (tensor) -> tensor + // 
CHECK-NEXT: %1 = "xla_hlo.convert"(%arg1) {name = "convert.4"} : (tensor) -> tensor %convert.4 = f64[] convert(f32[] %Arg_1.2) - // CHECK-NEXT: %2 = "xla.add"(%0, %1) {name = "add.5"} : (tensor<4xf64>, tensor) -> tensor<4xf64> + // CHECK-NEXT: %2 = "xla_hlo.add"(%0, %1) {name = "add.5"} : (tensor<4xf64>, tensor) -> tensor<4xf64> // CHECK-NEXT: return %2 : tensor<4xf64> ROOT %add.5 = f64[4] add(f64[4] %convert.3, f64[] %convert.4) } diff --git a/tensorflow/compiler/mlir/xla/tests/translate/div.hlotxt b/tensorflow/compiler/mlir/xla/tests/translate/div.hlotxt index 602ad96b852..772e47a0a35 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/div.hlotxt +++ b/tensorflow/compiler/mlir/xla/tests/translate/div.hlotxt @@ -7,7 +7,7 @@ ENTRY %foo.5 (Arg_0.1: f32[4], Arg_1.2: f32[4]) -> f32[4] { %Arg_0.1 = f32[4] parameter(0) %Arg_1.2 = f32[4] parameter(1) - // CHECK-NEXT: %0 = "xla.div"(%arg0, %arg1) {name = "divide.3"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + // CHECK-NEXT: %0 = "xla_hlo.div"(%arg0, %arg1) {name = "divide.3"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> // CHECK-NEXT: return %0 : tensor<4xf32> ROOT %divide.3 = f32[4] divide(f32[4] %Arg_0.1, f32[4] %Arg_1.2) } diff --git a/tensorflow/compiler/mlir/xla/tests/translate/dot.hlotxt b/tensorflow/compiler/mlir/xla/tests/translate/dot.hlotxt index 5b7d0c6c2ef..88beb2f4803 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/dot.hlotxt +++ b/tensorflow/compiler/mlir/xla/tests/translate/dot.hlotxt @@ -7,17 +7,17 @@ ENTRY %main (Arg_0.1: f32[1, 4], Arg_1.2: f32[4, 1]) -> f32[] { %Arg_0.1 = f32[1, 4] parameter(0) %Arg_1.2 = f32[4, 1] parameter(1) - // CHECK-NEXT: %0 = "xla.dot"(%arg0, %arg1) {name = "dot.3", precision_config = ["HIGH", "HIGHEST"]} : (tensor<1x4xf32>, tensor<4x1xf32>) -> tensor + // CHECK-NEXT: %0 = "xla_hlo.dot"(%arg0, %arg1) {name = "dot.3", precision_config = ["HIGH", "HIGHEST"]} : (tensor<1x4xf32>, tensor<4x1xf32>) -> tensor dot.3 = f32[] dot(Arg_0.1, Arg_1.2), lhs_contracting_dims={1}, rhs_contracting_dims={0}, operand_precision={high,highest} - // CHECK-NEXT: %1 = "xla.dot"(%arg0, %arg1) {name = "dot.4", precision_config = ["HIGHEST", "DEFAULT"]} : (tensor<1x4xf32>, tensor<4x1xf32>) -> tensor + // CHECK-NEXT: %1 = "xla_hlo.dot"(%arg0, %arg1) {name = "dot.4", precision_config = ["HIGHEST", "DEFAULT"]} : (tensor<1x4xf32>, tensor<4x1xf32>) -> tensor dot.4 = f32[] dot(Arg_0.1, Arg_1.2), lhs_contracting_dims={1}, rhs_contracting_dims={0}, operand_precision={highest,default} - // CHECK-NEXT: %2 = "xla.dot"(%arg0, %arg1) {name = "dot.5", precision_config = ["DEFAULT", "DEFAULT"]} : (tensor<1x4xf32>, tensor<4x1xf32>) -> tensor + // CHECK-NEXT: %2 = "xla_hlo.dot"(%arg0, %arg1) {name = "dot.5", precision_config = ["DEFAULT", "DEFAULT"]} : (tensor<1x4xf32>, tensor<4x1xf32>) -> tensor %dot.5 = f32[] dot(Arg_0.1, Arg_1.2), lhs_contracting_dims={1}, rhs_contracting_dims={0}, operand_precision={default,default} // TODO(b/129709049) consider making this default precision config inferred. 
- // CHECK-NEXT: %3 = "xla.dot"(%arg0, %arg1) {name = "dot.6", precision_config = ["DEFAULT", "DEFAULT"]} : (tensor<1x4xf32>, tensor<4x1xf32>) -> tensor + // CHECK-NEXT: %3 = "xla_hlo.dot"(%arg0, %arg1) {name = "dot.6", precision_config = ["DEFAULT", "DEFAULT"]} : (tensor<1x4xf32>, tensor<4x1xf32>) -> tensor // CHECK-NEXT: return %3 : tensor ROOT %dot.6 = f32[] dot(Arg_0.1, Arg_1.2), lhs_contracting_dims={1}, rhs_contracting_dims={0} } diff --git a/tensorflow/compiler/mlir/xla/tests/translate/dynamic-update-slice.hlotxt b/tensorflow/compiler/mlir/xla/tests/translate/dynamic-update-slice.hlotxt index d31160cfb21..85369451e2f 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/dynamic-update-slice.hlotxt +++ b/tensorflow/compiler/mlir/xla/tests/translate/dynamic-update-slice.hlotxt @@ -9,7 +9,7 @@ HloModule main %Arg_2.3 = f32[] parameter(2) %Arg_3.4 = f32[] parameter(3) - // CHECK-NEXT: %0 = "xla.dynamic-update-slice"(%arg0, %arg1, %arg2, %arg3) : (tensor<4x4xf32>, tensor<1x4xf32>, tensor, tensor) -> tensor<4x4xf32> + // CHECK-NEXT: %0 = "xla_hlo.dynamic-update-slice"(%arg0, %arg1, %arg2, %arg3) : (tensor<4x4xf32>, tensor<1x4xf32>, tensor, tensor) -> tensor<4x4xf32> // CHECK-NEXT: return %0 : tensor<4x4xf32> ROOT %dynamic-update-slice.5 = f32[4, 4] dynamic-update-slice(%Arg_0.1, %Arg_1.2, %Arg_2.3, %Arg_3.4) } @@ -20,7 +20,7 @@ HloModule main %Arg_1.2 = f32[2] parameter(1) %Arg_2.3 = f32[] parameter(2) - // CHECK-NEXT: %0 = "xla.dynamic-update-slice"(%arg0, %arg1, %arg2) : (tensor<4xf32>, tensor<2xf32>, tensor) -> tensor<4xf32> + // CHECK-NEXT: %0 = "xla_hlo.dynamic-update-slice"(%arg0, %arg1, %arg2) : (tensor<4xf32>, tensor<2xf32>, tensor) -> tensor<4xf32> // CHECK-NEXT: return %0 : tensor<4xf32> ROOT %dynamic-update-slice.5 = f32[4] dynamic-update-slice(%Arg_0.1, %Arg_1.2, %Arg_2.3) } \ No newline at end of file diff --git a/tensorflow/compiler/mlir/xla/tests/translate/exp.hlotxt b/tensorflow/compiler/mlir/xla/tests/translate/exp.hlotxt new file mode 100644 index 00000000000..fb523f9cd16 --- /dev/null +++ b/tensorflow/compiler/mlir/xla/tests/translate/exp.hlotxt @@ -0,0 +1,12 @@ +// RUN: tf-mlir-translate -hlo-text-to-mlir-hlo %s -o - | FileCheck %s + +HloModule foo + +// CHECK-LABEL: func @main(%arg0: tensor<16xf32>) -> tensor<16xf32> { +ENTRY %foo (arg0.1: f32[16]) -> f32[16] { + %arg0.1 = f32[16] parameter(0) + + // CHECK-NEXT: %0 = "xla_hlo.exp"(%arg0) {name = "exp.2"} : (tensor<16xf32>) -> tensor<16xf32> + // CHECK-NEXT: return %0 : tensor<16xf32> + ROOT %exp.2 = f32[16] exponential(f32[16] %arg0.1) +} diff --git a/tensorflow/compiler/mlir/xla/tests/translate/floor.hlotxt b/tensorflow/compiler/mlir/xla/tests/translate/floor.hlotxt new file mode 100644 index 00000000000..80e66da5642 --- /dev/null +++ b/tensorflow/compiler/mlir/xla/tests/translate/floor.hlotxt @@ -0,0 +1,13 @@ +// RUN: tf-mlir-translate -hlo-text-to-mlir-hlo %s -o - | FileCheck %s + +HloModule foo + +// CHECK-LABEL: func @main( +// CHECK-SAME: [[A0:%.+]]: tensor<16xf32>) -> tensor<16xf32> { +ENTRY %foo (arg0.1: f32[16]) -> f32[16] { + %arg0.1 = f32[16] parameter(0) + + // CHECK-NEXT: [[R0:%.+]] = "xla_hlo.floor"([[A0]]) {name = "floor.2"} : (tensor<16xf32>) -> tensor<16xf32> + // CHECK-NEXT: return [[R0]] : tensor<16xf32> + ROOT %floor.2 = f32[16] floor(f32[16] %arg0.1) +} diff --git a/tensorflow/compiler/mlir/xla/tests/translate/fully_connected_reference_model.hlotxt b/tensorflow/compiler/mlir/xla/tests/translate/fully_connected_reference_model.hlotxt index a4e5b19e1e1..fca13d7f0b7 100644 --- 
a/tensorflow/compiler/mlir/xla/tests/translate/fully_connected_reference_model.hlotxt +++ b/tensorflow/compiler/mlir/xla/tests/translate/fully_connected_reference_model.hlotxt @@ -9,95 +9,95 @@ ENTRY %tfcompile.48 { %arg0.1 = f32[1,300] parameter(0) %arg1.2 = f32[1,300,3,1] parameter(1) - // CHECK-NEXT: %0 = "xla.reshape"(%arg0) {name = "reshape.3"} : (tensor<1x300xf32>) -> tensor<1x300xf32> + // CHECK-NEXT: %0 = "xla_hlo.reshape"(%arg0) {name = "reshape.3"} : (tensor<1x300xf32>) -> tensor<1x300xf32> %reshape.3 = f32[1,300] reshape(%arg0.1) - // CHECK-NEXT: %1 = "xla.transpose"(%0) {name = "transpose.27", permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<1x300xf32>) -> tensor<300x1xf32> + // CHECK-NEXT: %1 = "xla_hlo.transpose"(%0) {name = "transpose.27", permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<1x300xf32>) -> tensor<300x1xf32> %transpose.27 = f32[300,1] transpose(%reshape.3), dimensions={1,0} - // CHECK-NEXT: %2 = "xla.reshape"(%1) {name = "reshape.28"} : (tensor<300x1xf32>) -> tensor<300x1x1xf32> + // CHECK-NEXT: %2 = "xla_hlo.reshape"(%1) {name = "reshape.28"} : (tensor<300x1xf32>) -> tensor<300x1x1xf32> %reshape.28 = f32[300,1,1] reshape(%transpose.27) - // CHECK-NEXT: %3 = "xla.reshape"(%2) {name = "reshape.29"} : (tensor<300x1x1xf32>) -> tensor<300x1xf32> + // CHECK-NEXT: %3 = "xla_hlo.reshape"(%2) {name = "reshape.29"} : (tensor<300x1x1xf32>) -> tensor<300x1xf32> %reshape.29 = f32[300,1] reshape(%reshape.28) - // CHECK-NEXT: %4 = "xla.broadcast_in_dim"(%3) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>, name = "broadcast.30"} : (tensor<300x1xf32>) -> tensor<300x1x5xf32> + // CHECK-NEXT: %4 = "xla_hlo.broadcast_in_dim"(%3) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>, name = "broadcast.30"} : (tensor<300x1xf32>) -> tensor<300x1x5xf32> %broadcast.30 = f32[300,1,5] broadcast(%reshape.29), dimensions={0,1} // CHECK-NEXT: %cst = constant {name = "constant.8"} dense<1.000000e+00> : tensor %constant.8 = f32[] constant(1) - // CHECK-NEXT: %5 = "xla.broadcast_in_dim"(%cst) {name = "broadcast.9"} : (tensor) -> tensor<300x1x5xf32> + // CHECK-NEXT: %5 = "xla_hlo.broadcast_in_dim"(%cst) {name = "broadcast.9"} : (tensor) -> tensor<300x1x5xf32> %broadcast.9 = f32[300,1,5] broadcast(%constant.8), dimensions={} - // CHECK-NEXT: %6 = "xla.mul"(%4, %5) {name = "multiply.31"} : (tensor<300x1x5xf32>, tensor<300x1x5xf32>) -> tensor<300x1x5xf32> + // CHECK-NEXT: %6 = "xla_hlo.mul"(%4, %5) {name = "multiply.31"} : (tensor<300x1x5xf32>, tensor<300x1x5xf32>) -> tensor<300x1x5xf32> %multiply.31 = f32[300,1,5] multiply(%broadcast.30, %broadcast.9) // CHECK-NEXT: %cst_0 = constant {name = "constant.32"} dense<0.000000e+00> : tensor %constant.32 = f32[] constant(0) - // CHECK-NEXT: %7 = "xla.broadcast_in_dim"(%cst_0) {name = "broadcast.33"} : (tensor) -> tensor<300x1x5xf32> + // CHECK-NEXT: %7 = "xla_hlo.broadcast_in_dim"(%cst_0) {name = "broadcast.33"} : (tensor) -> tensor<300x1x5xf32> %broadcast.33 = f32[300,1,5] broadcast(%constant.32), dimensions={} - // CHECK-NEXT: %8 = "xla.compare"(%6, %7) {comparison_direction = "GT", name = "compare.34"} : (tensor<300x1x5xf32>, tensor<300x1x5xf32>) -> tensor<300x1x5xi1> + // CHECK-NEXT: %8 = "xla_hlo.compare"(%6, %7) {comparison_direction = "GT", name = "compare.34"} : (tensor<300x1x5xf32>, tensor<300x1x5xf32>) -> tensor<300x1x5xi1> %compare.34 = pred[300,1,5] compare(%multiply.31, %broadcast.33), direction=GT // CHECK-NEXT: %cst_1 = constant {name = "constant.10"} dense<0.000000e+00> : tensor %constant.10 = f32[] constant(0) - // 
CHECK-NEXT: %9 = "xla.broadcast_in_dim"(%cst_1) {name = "broadcast.11"} : (tensor) -> tensor<300x1x5xf32> + // CHECK-NEXT: %9 = "xla_hlo.broadcast_in_dim"(%cst_1) {name = "broadcast.11"} : (tensor) -> tensor<300x1x5xf32> %broadcast.11 = f32[300,1,5] broadcast(%constant.10), dimensions={} // CHECK-NEXT: %cst_2 = constant {name = "constant.40"} dense<0.000000e+00> : tensor %constant.40 = f32[] constant(0) - // CHECK-NEXT: %10 = "xla.broadcast_in_dim"(%cst_2) {name = "broadcast.41"} : (tensor) -> tensor<300x5xf32> + // CHECK-NEXT: %10 = "xla_hlo.broadcast_in_dim"(%cst_2) {name = "broadcast.41"} : (tensor) -> tensor<300x5xf32> %broadcast.41 = f32[300,5] broadcast(%constant.40), dimensions={} - // CHECK-NEXT: %11 = "xla.copy"(%arg1) {name = "copy.1"} : (tensor<1x300x3x1xf32>) -> tensor<1x300x3x1xf32> + // CHECK-NEXT: %11 = "xla_hlo.copy"(%arg1) {name = "copy.1"} : (tensor<1x300x3x1xf32>) -> tensor<1x300x3x1xf32> %copy.1 = f32[1,300,3,1] copy(%arg1.2) - // CHECK-NEXT: %12 = "xla.reshape"(%11) {name = "reshape.4"} : (tensor<1x300x3x1xf32>) -> tensor<1x300x3x1xf32> + // CHECK-NEXT: %12 = "xla_hlo.reshape"(%11) {name = "reshape.4"} : (tensor<1x300x3x1xf32>) -> tensor<1x300x3x1xf32> %reshape.4 = f32[1,300,3,1] reshape(%copy.1) - // CHECK-NEXT: %13 = "xla.reshape"(%12) {name = "reshape.24"} : (tensor<1x300x3x1xf32>) -> tensor<1x300x3xf32> + // CHECK-NEXT: %13 = "xla_hlo.reshape"(%12) {name = "reshape.24"} : (tensor<1x300x3x1xf32>) -> tensor<1x300x3xf32> %reshape.24 = f32[1,300,3] reshape(%reshape.4) - // CHECK-NEXT: %14 = "xla.transpose"(%13) {name = "transpose.25", permutation = dense<[1, 0, 2]> : tensor<3xi64>} : (tensor<1x300x3xf32>) -> tensor<300x1x3xf32> + // CHECK-NEXT: %14 = "xla_hlo.transpose"(%13) {name = "transpose.25", permutation = dense<[1, 0, 2]> : tensor<3xi64>} : (tensor<1x300x3xf32>) -> tensor<300x1x3xf32> %transpose.25 = f32[300,1,3] transpose(%reshape.24), dimensions={1,0,2} - // CHECK-NEXT: %15 = "xla.reshape"(%14) {name = "reshape.26"} : (tensor<300x1x3xf32>) -> tensor<300x3xf32> + // CHECK-NEXT: %15 = "xla_hlo.reshape"(%14) {name = "reshape.26"} : (tensor<300x1x3xf32>) -> tensor<300x3xf32> %reshape.26 = f32[300,3] reshape(%transpose.25) // CHECK-NEXT: %cst_3 = constant {name = "constant.35"} dense<{{\[\[}}-1.060230e-01, 1.215050e-01, 8.002390e-01, -7.688850e-01, 0.0966112986], [6.890140e-01, -4.070560e-01, -0.797852993, 3.789250e-03, -2.088810e-01], [-6.085290e-01, 2.766170e-02, 2.685570e-01, 5.774010e-01, -4.284370e-01]]> : tensor<3x5xf32> %constant.35 = f32[3,5] constant({ { -0.106023, 0.121505, 0.800239, -0.768885, 0.0966113 }, { 0.689014, -0.407056, -0.797853, 0.00378925, -0.208881 }, { -0.608529, 0.0276617, 0.268557, 0.577401, -0.428437 } }) // TODO(b/129709049) consider making this default precision config implied. 
- // CHECK-NEXT: %16 = "xla.dot"(%15, %cst_3) {name = "dot.36", precision_config = ["DEFAULT", "DEFAULT"]} : (tensor<300x3xf32>, tensor<3x5xf32>) -> tensor<300x5xf32> + // CHECK-NEXT: %16 = "xla_hlo.dot"(%15, %cst_3) {name = "dot.36", precision_config = ["DEFAULT", "DEFAULT"]} : (tensor<300x3xf32>, tensor<3x5xf32>) -> tensor<300x5xf32> %dot.36 = f32[300,5] dot(%reshape.26, %constant.35), lhs_contracting_dims={1}, rhs_contracting_dims={0} // CHECK-NEXT: %cst_4 = constant {name = "constant.37"} dense<0.000000e+00> : tensor<5xf32> %constant.37 = f32[5]{0} constant({0, 0, 0, 0, 0}) - // CHECK-NEXT: %17 = "xla.broadcast_in_dim"(%cst_4) {broadcast_dimensions = dense<1> : tensor<1xi64>, name = "broadcast.38"} : (tensor<5xf32>) -> tensor<300x5xf32> + // CHECK-NEXT: %17 = "xla_hlo.broadcast_in_dim"(%cst_4) {broadcast_dimensions = dense<1> : tensor<1xi64>, name = "broadcast.38"} : (tensor<5xf32>) -> tensor<300x5xf32> %broadcast.38 = f32[300,5] broadcast(%constant.37), dimensions={1} - // CHECK-NEXT: %18 = "xla.add"(%16, %17) {name = "add.39"} : (tensor<300x5xf32>, tensor<300x5xf32>) -> tensor<300x5xf32> + // CHECK-NEXT: %18 = "xla_hlo.add"(%16, %17) {name = "add.39"} : (tensor<300x5xf32>, tensor<300x5xf32>) -> tensor<300x5xf32> %add.39 = f32[300,5] add(%dot.36, %broadcast.38) - // CHECK-NEXT: %19 = "xla.max"(%10, %18) {name = "maximum.42"} : (tensor<300x5xf32>, tensor<300x5xf32>) -> tensor<300x5xf32> + // CHECK-NEXT: %19 = "xla_hlo.max"(%10, %18) {name = "maximum.42"} : (tensor<300x5xf32>, tensor<300x5xf32>) -> tensor<300x5xf32> %maximum.42 = f32[300,5] maximum(%broadcast.41, %add.39) - // CHECK-NEXT: %20 = "xla.reshape"(%19) {name = "reshape.44"} : (tensor<300x5xf32>) -> tensor<300x1x5xf32> + // CHECK-NEXT: %20 = "xla_hlo.reshape"(%19) {name = "reshape.44"} : (tensor<300x5xf32>) -> tensor<300x1x5xf32> %reshape.44 = f32[300,1,5] reshape(%maximum.42) - // CHECK-NEXT: %21 = "xla.select"(%8, %9, %20) {name = "select.45"} : (tensor<300x1x5xi1>, tensor<300x1x5xf32>, tensor<300x1x5xf32>) -> tensor<300x1x5xf32> + // CHECK-NEXT: %21 = "xla_hlo.select"(%8, %9, %20) {name = "select.45"} : (tensor<300x1x5xi1>, tensor<300x1x5xf32>, tensor<300x1x5xf32>) -> tensor<300x1x5xf32> %select.45 = f32[300,1,5] select(%compare.34, %broadcast.11, %reshape.44) - // CHECK-NEXT: %22 = "xla.reshape"(%21) {name = "reshape.46"} : (tensor<300x1x5xf32>) -> tensor<300x1x5xf32> + // CHECK-NEXT: %22 = "xla_hlo.reshape"(%21) {name = "reshape.46"} : (tensor<300x1x5xf32>) -> tensor<300x1x5xf32> %reshape.46 = f32[300,1,5] reshape(%select.45) - // CHECK-NEXT: %23 = "xla.tuple"(%22) {name = "tuple.47"} : (tensor<300x1x5xf32>) -> tuple> + // CHECK-NEXT: %23 = "xla_hlo.tuple"(%22) {name = "tuple.47"} : (tensor<300x1x5xf32>) -> tuple> // CHECK-NEXT: return %23 : tuple> ROOT %tuple.47 = (f32[300,1,5]) tuple(%reshape.46) } diff --git a/tensorflow/compiler/mlir/xla/tests/translate/iota.hlotxt b/tensorflow/compiler/mlir/xla/tests/translate/iota.hlotxt index 9a4944d414e..35c762c067c 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/iota.hlotxt +++ b/tensorflow/compiler/mlir/xla/tests/translate/iota.hlotxt @@ -4,14 +4,14 @@ HloModule main.5 // CHECK-LABEL: func @main() -> tensor<4xf32> { ENTRY %iota.1 () -> f32[4] { - // CHECK-NEXT: %0 = "xla.iota"() {iota_dimension = 0 : i64} : () -> tensor<4xf32> + // CHECK-NEXT: %0 = "xla_hlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<4xf32> // CHECK-NEXT: return %0 : tensor<4xf32> ROOT %iota.0 = f32[4] iota(), iota_dimension=0 } // CHECK-LABEL: func @iota.2() -> tensor<4x5xf32> { %iota.2 () -> 
f32[4, 5] { - // CHECK-NEXT: %0 = "xla.iota"() {iota_dimension = 1 : i64} : () -> tensor<4x5xf32> + // CHECK-NEXT: %0 = "xla_hlo.iota"() {iota_dimension = 1 : i64} : () -> tensor<4x5xf32> // CHECK-NEXT: return %0 : tensor<4x5xf32> ROOT %iota.0 = f32[4, 5] iota(), iota_dimension=1 } \ No newline at end of file diff --git a/tensorflow/compiler/mlir/xla/tests/translate/log.hlotxt b/tensorflow/compiler/mlir/xla/tests/translate/log.hlotxt new file mode 100644 index 00000000000..616ad0c0eb4 --- /dev/null +++ b/tensorflow/compiler/mlir/xla/tests/translate/log.hlotxt @@ -0,0 +1,12 @@ +// RUN: tf-mlir-translate -hlo-text-to-mlir-hlo %s -o - | FileCheck %s + +HloModule foo + +// CHECK-LABEL: func @main(%arg0: tensor<16xf32>) -> tensor<16xf32> { +ENTRY %foo (arg0.1: f32[16]) -> f32[16] { + %arg0.1 = f32[16] parameter(0) + + // CHECK-NEXT: %0 = "xla_hlo.log"(%arg0) {name = "log.2"} : (tensor<16xf32>) -> tensor<16xf32> + // CHECK-NEXT: return %0 : tensor<16xf32> + ROOT %log.2 = f32[16] log(f32[16] %arg0.1) +} diff --git a/tensorflow/compiler/mlir/xla/tests/translate/max.hlotxt b/tensorflow/compiler/mlir/xla/tests/translate/max.hlotxt index dd6c0f504f5..f4ba76b4675 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/max.hlotxt +++ b/tensorflow/compiler/mlir/xla/tests/translate/max.hlotxt @@ -7,7 +7,7 @@ ENTRY %foo.5 (Arg_0.1: f32[4], Arg_1.2: f32[4]) -> f32[4] { %Arg_0.1 = f32[4] parameter(0) %Arg_1.2 = f32[4] parameter(1) - // CHECK-NEXT: %0 = "xla.max"(%arg0, %arg1) {name = "maximum.3"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + // CHECK-NEXT: %0 = "xla_hlo.max"(%arg0, %arg1) {name = "maximum.3"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> // CHECK-NEXT: return %0 : tensor<4xf32> ROOT %maximum.3 = f32[4] maximum(f32[4] %Arg_0.1, f32[4] %Arg_1.2) } diff --git a/tensorflow/compiler/mlir/xla/tests/translate/min.hlotxt b/tensorflow/compiler/mlir/xla/tests/translate/min.hlotxt index 5efe44aa53a..880fc0f76ca 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/min.hlotxt +++ b/tensorflow/compiler/mlir/xla/tests/translate/min.hlotxt @@ -7,7 +7,7 @@ ENTRY %foo.5 (Arg_0.1: f32[4], Arg_1.2: f32[4]) -> f32[4] { %Arg_0.1 = f32[4] parameter(0) %Arg_1.2 = f32[4] parameter(1) - // CHECK-NEXT: %0 = "xla.min"(%arg0, %arg1) {name = "minimum.3"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + // CHECK-NEXT: %0 = "xla_hlo.min"(%arg0, %arg1) {name = "minimum.3"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> // CHECK-NEXT: return %0 : tensor<4xf32> ROOT %minimum.3 = f32[4] minimum(f32[4] %Arg_0.1, f32[4] %Arg_1.2) } diff --git a/tensorflow/compiler/mlir/xla/tests/translate/mul.hlotxt b/tensorflow/compiler/mlir/xla/tests/translate/mul.hlotxt index 1bfb6662124..ad7feef19bc 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/mul.hlotxt +++ b/tensorflow/compiler/mlir/xla/tests/translate/mul.hlotxt @@ -7,7 +7,7 @@ ENTRY %foo.5 (Arg_0.1: f32[4], Arg_1.2: f32[4]) -> f32[4] { %Arg_0.1 = f32[4] parameter(0) %Arg_1.2 = f32[4] parameter(1) - // CHECK-NEXT: %0 = "xla.mul"(%arg0, %arg1) {name = "multiply.3"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + // CHECK-NEXT: %0 = "xla_hlo.mul"(%arg0, %arg1) {name = "multiply.3"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> // CHECK-NEXT: return %0 : tensor<4xf32> ROOT %multiply.3 = f32[4] multiply(f32[4] %Arg_0.1, f32[4] %Arg_1.2) } diff --git a/tensorflow/compiler/mlir/xla/tests/translate/pad.hlotxt b/tensorflow/compiler/mlir/xla/tests/translate/pad.hlotxt index 412f267ce42..84e1fbc9cf6 100644 --- 
a/tensorflow/compiler/mlir/xla/tests/translate/pad.hlotxt +++ b/tensorflow/compiler/mlir/xla/tests/translate/pad.hlotxt @@ -7,7 +7,7 @@ ENTRY %padding.1 (Arg_0.1: f32[4], Arg_1.2: f32[]) -> f32[4] { %Arg_0.1 = f32[4] parameter(0) %Arg_1.2 = f32[] parameter(1) - // CHECK-NEXT: %0 = "xla.pad"(%arg0, %arg1) {edge_padding_high = dense<0> : tensor<1xi64>, edge_padding_low = dense<0> : tensor<1xi64>, interior_padding = dense<0> : tensor<1xi64>} : (tensor<4xf32>, tensor) -> tensor<4xf32> + // CHECK-NEXT: %0 = "xla_hlo.pad"(%arg0, %arg1) {edge_padding_high = dense<0> : tensor<1xi64>, edge_padding_low = dense<0> : tensor<1xi64>, interior_padding = dense<0> : tensor<1xi64>} : (tensor<4xf32>, tensor) -> tensor<4xf32> // CHECK-NEXT: return %0 : tensor<4xf32> ROOT %pad.3 = f32[4] pad(%Arg_0.1, %Arg_1.2), padding=0_0_0 } @@ -17,7 +17,7 @@ ENTRY %padding.1 (Arg_0.1: f32[4], Arg_1.2: f32[]) -> f32[4] { %Arg_0.1 = f32[4, 4, 4] parameter(0) %Arg_1.2 = f32[] parameter(1) - // CHECK-NEXT: %0 = "xla.pad"(%arg0, %arg1) {edge_padding_high = dense<[2, 4, 6]> : tensor<3xi64>, edge_padding_low = dense<[1, 3, 5]> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>} : (tensor<4x4x4xf32>, tensor) -> tensor<7x11x15xf32> + // CHECK-NEXT: %0 = "xla_hlo.pad"(%arg0, %arg1) {edge_padding_high = dense<[2, 4, 6]> : tensor<3xi64>, edge_padding_low = dense<[1, 3, 5]> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>} : (tensor<4x4x4xf32>, tensor) -> tensor<7x11x15xf32> // CHECK-NEXT: return %0 : tensor<7x11x15xf32> ROOT %pad.3 = f32[7, 11, 15] pad(%Arg_0.1, %Arg_1.2), padding=1_2x3_4x5_6 } diff --git a/tensorflow/compiler/mlir/xla/tests/translate/reduce.hlotxt b/tensorflow/compiler/mlir/xla/tests/translate/reduce.hlotxt index 37e638eb1f7..e4dc4d5e211 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/reduce.hlotxt +++ b/tensorflow/compiler/mlir/xla/tests/translate/reduce.hlotxt @@ -33,19 +33,19 @@ ENTRY %foo.5 (Arg_0.1: f32[4, 4], Arg_1.2: f32[4], Arg_2.3: f32[]) -> ((f32[], f %Arg_1.2 = f32[4] parameter(1) %Arg_2.3 = f32[] parameter(2) - // CHECK-NEXT: %0 = "xla.reduce"(%arg0, %arg0, %arg2, %arg2) {computation = @reduce_helper.3, dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<4x4xf32>, tensor<4x4xf32>, tensor, tensor) -> tuple, tensor> + // CHECK-NEXT: %0 = "xla_hlo.reduce"(%arg0, %arg0, %arg2, %arg2) {computation = @reduce_helper.3, dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<4x4xf32>, tensor<4x4xf32>, tensor, tensor) -> tuple, tensor> %reduce.1 = f32[4] reduce(%Arg_0.1, %Arg_1.2), dimensions={0}, to_apply=%reduce_helper.1 - // CHECK-NEXT: %1 = "xla.reduce"(%arg0, %arg1) {computation = @reduce_helper.1, dimensions = dense<0> : tensor<1xi64>} : (tensor<4x4xf32>, tensor<4xf32>) -> tensor<4xf32> + // CHECK-NEXT: %1 = "xla_hlo.reduce"(%arg0, %arg1) {computation = @reduce_helper.1, dimensions = dense<0> : tensor<1xi64>} : (tensor<4x4xf32>, tensor<4xf32>) -> tensor<4xf32> %reduce.2 = f32[] reduce(%reduce.1, %Arg_2.3), dimensions={0}, to_apply=%reduce_helper.2 - // CHECK-NEXT: %2 = "xla.reduce"(%1, %arg2) {computation = @reduce_helper.2, dimensions = dense<0> : tensor<1xi64>} : (tensor<4xf32>, tensor) -> tensor + // CHECK-NEXT: %2 = "xla_hlo.reduce"(%1, %arg2) {computation = @reduce_helper.2, dimensions = dense<0> : tensor<1xi64>} : (tensor<4xf32>, tensor) -> tensor %reduce.3 = f32[] reduce(%Arg_0.1, %Arg_2.3), dimensions={0, 1}, to_apply=%reduce_helper.2 - // CHECK-NEXT: %3 = "xla.reduce"(%arg0, %arg2) {computation = @reduce_helper.2, dimensions = dense<[0, 1]> : tensor<2xi64>} : 
(tensor<4x4xf32>, tensor) -> tensor + // CHECK-NEXT: %3 = "xla_hlo.reduce"(%arg0, %arg2) {computation = @reduce_helper.2, dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<4x4xf32>, tensor) -> tensor %reduce.4 = (f32[], f32[]) reduce(%Arg_0.1, %Arg_0.1, %Arg_2.3, %Arg_2.3), dimensions={0, 1}, to_apply=%reduce_helper.3 - // CHECK-NEXT: %4 = "xla.sub"(%2, %3) {name = "sub.5"} : (tensor, tensor) -> tensor + // CHECK-NEXT: %4 = "xla_hlo.sub"(%2, %3) {name = "sub.5"} : (tensor, tensor) -> tensor %sub.5 = f32[] subtract(%reduce.2, %reduce.3) ROOT %tuple.6 = ((f32[], f32[]), f32[]) tuple(%reduce.4, %sub.5) diff --git a/tensorflow/compiler/mlir/xla/tests/translate/reverse.hlotxt b/tensorflow/compiler/mlir/xla/tests/translate/reverse.hlotxt index 7c8303d5966..f89f3eb89bf 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/reverse.hlotxt +++ b/tensorflow/compiler/mlir/xla/tests/translate/reverse.hlotxt @@ -6,7 +6,7 @@ HloModule main.5 ENTRY %reverse.1 (Arg_0.1: f32[4]) -> f32[4] { %Arg_0.1 = f32[4] parameter(0) - // CHECK-NEXT: %0 = "xla.reverse"(%arg0) {dimensions = dense<0> : tensor<1xi64>} : (tensor<4xf32>) -> tensor<4xf32> + // CHECK-NEXT: %0 = "xla_hlo.reverse"(%arg0) {dimensions = dense<0> : tensor<1xi64>} : (tensor<4xf32>) -> tensor<4xf32> // CHECK-NEXT: return %0 : tensor<4xf32> ROOT reverse.2 = f32[4] reverse(%Arg_0.1), dimensions={0} } @@ -15,7 +15,7 @@ ENTRY %reverse.1 (Arg_0.1: f32[4]) -> f32[4] { %reverse.2 (Arg_0.1: f32[4, 4]) -> f32[4, 4] { %Arg_0.1 = f32[4, 4] parameter(0) - // CHECK-NEXT: %0 = "xla.reverse"(%arg0) {dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<4x4xf32>) -> tensor<4x4xf32> + // CHECK-NEXT: %0 = "xla_hlo.reverse"(%arg0) {dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<4x4xf32>) -> tensor<4x4xf32> // CHECK-NEXT: return %0 : tensor<4x4xf32> ROOT reverse.2 = f32[4, 4] reverse(%Arg_0.1), dimensions={0, 1} } diff --git a/tensorflow/compiler/mlir/xla/tests/translate/rsqrt.hlotxt b/tensorflow/compiler/mlir/xla/tests/translate/rsqrt.hlotxt new file mode 100644 index 00000000000..a7b9b73f239 --- /dev/null +++ b/tensorflow/compiler/mlir/xla/tests/translate/rsqrt.hlotxt @@ -0,0 +1,13 @@ +// RUN: tf-mlir-translate -hlo-text-to-mlir-hlo %s -o - | FileCheck %s + +HloModule foo + +// CHECK-LABEL: func @main( +// CHECK-SAME: [[ARG0:%.+]]: tensor<16xf32>) -> tensor<16xf32> { +ENTRY %foo (arg0.1: f32[16]) -> f32[16] { + %arg0.1 = f32[16] parameter(0) + + // CHECK-NEXT: [[P0:%.+]] = "xla_hlo.rsqrt"([[ARG0]]) {name = "rsqrt.2"} : (tensor<16xf32>) -> tensor<16xf32> + // CHECK-NEXT: return [[P0]] : tensor<16xf32> + ROOT %rsqrt.2 = f32[16] rsqrt(f32[16] %arg0.1) +} diff --git a/tensorflow/compiler/mlir/xla/tests/translate/select.hlotxt b/tensorflow/compiler/mlir/xla/tests/translate/select.hlotxt index b9ae08d8c8c..d3fe6a51e56 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/select.hlotxt +++ b/tensorflow/compiler/mlir/xla/tests/translate/select.hlotxt @@ -8,7 +8,7 @@ ENTRY %main { %Arg_1.2 = s32[2,3] parameter(1) %Arg_2.3 = s32[2,3] parameter(2) - // CHECK-NEXT: %0 = "xla.select"(%arg0, %arg1, %arg2) {name = "select.4"} : (tensor<2x3xi1>, tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> + // CHECK-NEXT: %0 = "xla_hlo.select"(%arg0, %arg1, %arg2) {name = "select.4"} : (tensor<2x3xi1>, tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> // CHECK-NEXT: return %0 : tensor<2x3xi32> ROOT %select.4 = s32[2,3] select(pred[2,3] %Arg_0.1, s32[2,3] %Arg_1.2, s32[2,3] %Arg_2.3) } diff --git a/tensorflow/compiler/mlir/xla/tests/translate/select.mlir 
b/tensorflow/compiler/mlir/xla/tests/translate/select.mlir index 4990ae712f8..f00aa0ade15 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/select.mlir +++ b/tensorflow/compiler/mlir/xla/tests/translate/select.mlir @@ -7,7 +7,7 @@ func @main(%arg0: tensor<2x3xi1>, %arg1: tensor<2x3xi32>, %arg2: tensor<2x3xi32> // CHECK-NEXT: %Arg_2.3 = s32[2,3] parameter(2) // CHECK-NEXT: ROOT %select.4 = s32[2,3] select(pred[2,3] %Arg_0.1, s32[2,3] %Arg_1.2, s32[2,3] %Arg_2.3) - %0 = "xla.select"(%arg0, %arg1, %arg2) {name = "select.4"} : (tensor<2x3xi1>, tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> + %0 = "xla_hlo.select"(%arg0, %arg1, %arg2) {name = "select.4"} : (tensor<2x3xi1>, tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> return %0 : tensor<2x3xi32> } diff --git a/tensorflow/compiler/mlir/xla/tests/translate/simple.hlo b/tensorflow/compiler/mlir/xla/tests/translate/simple.hlo index 83d85f7d45e..5d358596d54 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/simple.hlo +++ b/tensorflow/compiler/mlir/xla/tests/translate/simple.hlo @@ -139,8 +139,8 @@ dynamic_parameter_binding { } # CHECK-LABEL: func @main(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor { -# CHECK-NEXT: %0 = "xla.add"(%arg0, %arg1) {name = "add.3"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> +# CHECK-NEXT: %0 = "xla_hlo.add"(%arg0, %arg1) {name = "add.3"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> # TODO(b/129709049) consider making this default precision config inferred. -# CHECK-NEXT: %1 = "xla.dot"(%0, %arg1) {name = "dot.4", precision_config = ["DEFAULT", "DEFAULT"]} : (tensor<4xf32>, tensor<4xf32>) -> tensor +# CHECK-NEXT: %1 = "xla_hlo.dot"(%0, %arg1) {name = "dot.4", precision_config = ["DEFAULT", "DEFAULT"]} : (tensor<4xf32>, tensor<4xf32>) -> tensor # CHECK-NEXT: return %1 : tensor # CHECK-NEXT: } diff --git a/tensorflow/compiler/mlir/xla/tests/translate/simple.hlotxt b/tensorflow/compiler/mlir/xla/tests/translate/simple.hlotxt index 09462625bbb..b3f8e977bfe 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/simple.hlotxt +++ b/tensorflow/compiler/mlir/xla/tests/translate/simple.hlotxt @@ -7,11 +7,11 @@ ENTRY %main.5 (Arg_0.1: f32[4], Arg_1.2: f32[4]) -> f32[] { %Arg_0.1 = f32[4]{0} parameter(0) %Arg_1.2 = f32[4]{0} parameter(1) - // CHECK-NEXT: %0 = "xla.add"(%arg0, %arg1) {name = "add.3"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + // CHECK-NEXT: %0 = "xla_hlo.add"(%arg0, %arg1) {name = "add.3"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> %add.3 = f32[4]{0} add(f32[4]{0} %Arg_0.1, f32[4]{0} %Arg_1.2) // TODO(b/129709049) consider making this default precision config inferred. 
- // CHECK-NEXT: %1 = "xla.dot"(%0, %arg1) {name = "dot.4", precision_config = ["DEFAULT", "DEFAULT"]} : (tensor<4xf32>, tensor<4xf32>) -> tensor + // CHECK-NEXT: %1 = "xla_hlo.dot"(%0, %arg1) {name = "dot.4", precision_config = ["DEFAULT", "DEFAULT"]} : (tensor<4xf32>, tensor<4xf32>) -> tensor // CHECK-NEXT: return %1 : tensor ROOT %dot.4 = f32[] dot(f32[4]{0} %add.3, f32[4]{0} %Arg_1.2), lhs_contracting_dims={0}, rhs_contracting_dims={0} } diff --git a/tensorflow/compiler/mlir/xla/tests/translate/simple.mlir b/tensorflow/compiler/mlir/xla/tests/translate/simple.mlir index f6e277c97de..e68262ba9ff 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/simple.mlir +++ b/tensorflow/compiler/mlir/xla/tests/translate/simple.mlir @@ -2,8 +2,8 @@ func @main(tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> { ^bb0(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>): - %0 = "xla.add"(%arg0, %arg1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> - %1 = "xla.dot"(%0, %arg1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + %0 = "xla_hlo.add"(%arg0, %arg1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + %1 = "xla_hlo.dot"(%0, %arg1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> return %1 : tensor<4xf32> } diff --git a/tensorflow/compiler/mlir/xla/tests/translate/sub.hlotxt b/tensorflow/compiler/mlir/xla/tests/translate/sub.hlotxt index 6fc493aa764..24d4dff6270 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/sub.hlotxt +++ b/tensorflow/compiler/mlir/xla/tests/translate/sub.hlotxt @@ -7,7 +7,7 @@ ENTRY %foo.5 (Arg_0.1: f32[4], Arg_1.2: f32[4]) -> f32[4] { %Arg_0.1 = f32[4] parameter(0) %Arg_1.2 = f32[4] parameter(1) - // CHECK-NEXT: %0 = "xla.sub"(%arg0, %arg1) {name = "subtract.3"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + // CHECK-NEXT: %0 = "xla_hlo.sub"(%arg0, %arg1) {name = "subtract.3"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> // CHECK-NEXT: return %0 : tensor<4xf32> ROOT %subtract.3 = f32[4] subtract(f32[4] %Arg_0.1, f32[4] %Arg_1.2) } diff --git a/tensorflow/compiler/mlir/xla/tests/translate/tanh.hlotxt b/tensorflow/compiler/mlir/xla/tests/translate/tanh.hlotxt index 54dc0faef09..054e6af355e 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/tanh.hlotxt +++ b/tensorflow/compiler/mlir/xla/tests/translate/tanh.hlotxt @@ -4,9 +4,9 @@ HloModule foo // CHECK-LABEL: func @main(%arg0: tensor<1x16x16x3xf32>) -> tensor<1x16x16x3xf32> { ENTRY %foo (arg0.1: f32[1,16,16,3]) -> f32[1,16,16,3] { - %arg0.1 = f32[1,16,16,3]{3,2,1,0} parameter(0), metadata={op_name="XLA_Args"} + %arg0.1 = f32[1,16,16,3]{3,2,1,0} parameter(0), metadata={op_name="HLO_Args"} - // CHECK-NEXT: %0 = "xla.tanh"(%arg0) {name = "tanh.3"} : (tensor<1x16x16x3xf32>) -> tensor<1x16x16x3xf32> + // CHECK-NEXT: %0 = "xla_hlo.tanh"(%arg0) {name = "tanh.3"} : (tensor<1x16x16x3xf32>) -> tensor<1x16x16x3xf32> // CHECK-NEXT: return %0 : tensor<1x16x16x3xf32> ROOT %tanh.3 = f32[1,16,16,3]{3,2,1,0} tanh(f32[1,16,16,3]{3,2,1,0} %arg0.1), metadata={op_type="Tanh" op_name="embedded_inference/tanh_model/Tanh"} -} +} \ No newline at end of file diff --git a/tensorflow/compiler/mlir/xla/tests/translate/transpose.hlotxt b/tensorflow/compiler/mlir/xla/tests/translate/transpose.hlotxt index 335e54669eb..203152d1ca4 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/transpose.hlotxt +++ b/tensorflow/compiler/mlir/xla/tests/translate/transpose.hlotxt @@ -6,7 +6,7 @@ HloModule main ENTRY %main { %Arg_0.1 = s32[1,2,3,4] parameter(0) - // CHECK-NEXT: %0 = "xla.transpose"(%arg0) {name = "transpose.2", permutation = 
dense<[1, 0, 3, 2]> : tensor<4xi64>} : (tensor<1x2x3x4xi32>) -> tensor<2x1x4x3xi32> + // CHECK-NEXT: %0 = "xla_hlo.transpose"(%arg0) {name = "transpose.2", permutation = dense<[1, 0, 3, 2]> : tensor<4xi64>} : (tensor<1x2x3x4xi32>) -> tensor<2x1x4x3xi32> // CHECK-NEXT: return %0 : tensor<2x1x4x3xi32> ROOT %transpose.2 = s32[2,1,4,3] transpose(s32[1,2,3,4] %Arg_0.1), dimensions={1,0,3,2} } diff --git a/tensorflow/compiler/mlir/xla/tests/translate/transpose.mlir b/tensorflow/compiler/mlir/xla/tests/translate/transpose.mlir index e28d0a37d84..77048e6c902 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/transpose.mlir +++ b/tensorflow/compiler/mlir/xla/tests/translate/transpose.mlir @@ -5,7 +5,7 @@ func @main(%arg0: tensor<1x2x3x4xi32>) -> tensor<2x1x4x3xi32> { // CHECK-NEXT: %Arg_0.1 = s32[1,2,3,4] parameter(0) // CHECK-NEXT: ROOT %transpose.2 = s32[2,1,4,3] transpose(s32[1,2,3,4] %Arg_0.1), dimensions={1,0,3,2} - %0 = "xla.transpose"(%arg0) {permutation = dense<[1, 0, 3, 2]> : tensor<4xi64>} : (tensor<1x2x3x4xi32>) -> tensor<2x1x4x3xi32> + %0 = "xla_hlo.transpose"(%arg0) {permutation = dense<[1, 0, 3, 2]> : tensor<4xi64>} : (tensor<1x2x3x4xi32>) -> tensor<2x1x4x3xi32> return %0 : tensor<2x1x4x3xi32> } diff --git a/tensorflow/compiler/mlir/xla/tests/translate/tuple.hlotxt b/tensorflow/compiler/mlir/xla/tests/translate/tuple.hlotxt index c98fa93fcd9..bcaf1c81982 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/tuple.hlotxt +++ b/tensorflow/compiler/mlir/xla/tests/translate/tuple.hlotxt @@ -7,10 +7,10 @@ ENTRY %main(Arg_0.1: s32[1], Arg_1.2: f32[1, 2]) -> (s32[1], f32[1,2]) { %Arg_0.1 = s32[1] parameter(0) %Arg_1.2 = f32[1, 2] parameter(1) - // CHECK-NEXT: %0 = "xla.tuple"(%arg0) {name = "tuple.3"} : (tensor<1xi32>) -> tuple> + // CHECK-NEXT: %0 = "xla_hlo.tuple"(%arg0) {name = "tuple.3"} : (tensor<1xi32>) -> tuple> %tuple.3 = (s32[1]) tuple(%Arg_0.1) - // CHECK-NEXT: %1 = "xla.tuple"(%arg0, %arg1) {name = "tuple.4"} : (tensor<1xi32>, tensor<1x2xf32>) -> tuple, tensor<1x2xf32>> + // CHECK-NEXT: %1 = "xla_hlo.tuple"(%arg0, %arg1) {name = "tuple.4"} : (tensor<1xi32>, tensor<1x2xf32>) -> tuple, tensor<1x2xf32>> // CHECK-NEXT: return %1 : tuple, tensor<1x2xf32>> ROOT %tuple.4 = (s32[1], f32[1,2]) tuple(%Arg_0.1, %Arg_1.2) } diff --git a/tensorflow/compiler/mlir/xla/tests/translate/types.hlotxt b/tensorflow/compiler/mlir/xla/tests/translate/types.hlotxt new file mode 100644 index 00000000000..2db52dd9023 --- /dev/null +++ b/tensorflow/compiler/mlir/xla/tests/translate/types.hlotxt @@ -0,0 +1,44 @@ +// RUN: tf-mlir-translate -hlo-text-to-mlir-hlo %s -o - | FileCheck %s + +HloModule tfcompile.1 + +// CHECK-LABEL: func @main() -> tensor { +ENTRY %tfcompile.1 { + // CHECK-NEXT: %cst = constant {name = "constant.0"} dense<1.000000e+00> : tensor + %constant.0 = f32[] constant(1) + + // CHECK-NEXT: %cst_0 = constant {name = "constant.1"} dense<1.000000e+00> : tensor + %constant.1 = f64[] constant(1) + + // CHECK-NEXT: %cst_1 = constant {name = "constant.2"} dense<1> : tensor + %constant.2 = s8[] constant(1) + + // CHECK-NEXT: %cst_2 = constant {name = "constant.3"} dense<1> : tensor + %constant.3 = s16[] constant(1) + + // CHECK-NEXT: %cst_3 = constant {name = "constant.4"} dense<1> : tensor + %constant.4 = s32[] constant(1) + + // CHECK-NEXT: %cst_4 = constant {name = "constant.5"} dense<1> : tensor + %constant.5 = s64[] constant(1) + + // TODO(b/130356985): Update once MLIR supports unsigned integers. 
+ // CHECK-NEXT: %cst_5 = constant {name = "constant.6"} dense<1> : tensor + %constant.6 = u8[] constant(1) + + // TODO(b/130356985): Update once MLIR supports unsigned integers. + // CHECK-NEXT: %cst_6 = constant {name = "constant.7"} dense<1> : tensor + %constant.7 = u16[] constant(1) + + // TODO(b/130356985): Update once MLIR supports unsigned integers. + // CHECK-NEXT: %cst_7 = constant {name = "constant.8"} dense<1> : tensor + %constant.8 = u32[] constant(1) + + // TODO(b/130356985): Update once MLIR supports unsigned integers. + // CHECK-NEXT: %cst_8 = constant {name = "constant.9"} dense<1> : tensor + %constant.9 = u64[] constant(1) + + // CHECK-NEXT: %cst_9 = constant {name = "constant.10"} dense : tensor + // CHECK-NEXT: return %cst_9 : tensor + ROOT %constant.10 = pred[] constant(1) +} diff --git a/tensorflow/compiler/mlir/xla/tests/translate/unknown.hlotxt b/tensorflow/compiler/mlir/xla/tests/translate/unknown.hlotxt index 42d52fd78c8..daf7dd8d01d 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/unknown.hlotxt +++ b/tensorflow/compiler/mlir/xla/tests/translate/unknown.hlotxt @@ -6,6 +6,6 @@ HloModule main ENTRY %main (Arg_0.1: f32[1, 4], Arg_1.2: f32[4, 1]) -> f32[1] { %Arg_0.1 = f32[1] parameter(0) - // CHECK-NEXT: %0 = "xla.unknown"(%arg0, %arg0) {name = "add-dependency.2"} : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> + // CHECK-NEXT: %0 = "xla_hlo.unknown"(%arg0, %arg0) {name = "add-dependency.2"} : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> ROOT add-dependency.2 = f32[1] add-dependency(Arg_0.1, Arg_0.1) } diff --git a/tensorflow/compiler/mlir/xla/tests/translate/while.hlotxt b/tensorflow/compiler/mlir/xla/tests/translate/while.hlotxt index a6d2a48797e..784ad891111 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/while.hlotxt +++ b/tensorflow/compiler/mlir/xla/tests/translate/while.hlotxt @@ -4,24 +4,24 @@ HloModule foo // CHECK-LABEL: func @cond(%arg0: tensor) -> tensor { %cond (arg_1: s64[]) -> pred[] { - %arg_1 = s64[] parameter(0), metadata={op_name="XLA_Args"} - // CHECK-NEXT: %0 = "xla.compare"(%arg0, %arg0) {comparison_direction = "LT", name = "compare.2"} : (tensor, tensor) -> tensor + %arg_1 = s64[] parameter(0), metadata={op_name="HLO_Args"} + // CHECK-NEXT: %0 = "xla_hlo.compare"(%arg0, %arg0) {comparison_direction = "LT", name = "compare.2"} : (tensor, tensor) -> tensor // CHECK-NEXT: return %0 : tensor ROOT %compare.2 = pred[] compare(%arg_1, %arg_1), direction=LT, metadata={op_type="Less" op_name="Less"} } // CHECK-LABEL: func @loop(%arg0: tensor) -> tensor { %loop (arg_1: s64[]) -> s64[] { - %arg_1 = s64[] parameter(0), metadata={op_name="XLA_Args"} - // CHECK-NEXT: %0 = "xla.add"(%arg0, %arg0) {name = "compare.0"} : (tensor, tensor) -> tensor + %arg_1 = s64[] parameter(0), metadata={op_name="HLO_Args"} + // CHECK-NEXT: %0 = "xla_hlo.add"(%arg0, %arg0) {name = "compare.0"} : (tensor, tensor) -> tensor // CHECK-NEXT: return %0 : tensor ROOT %compare.2 = s64[] add(%arg_1, %arg_1), metadata={op_type="Less" op_name="Less"} } // CHECK-LABEL: func @main(%arg0: tensor) -> tensor { ENTRY %foo (arg0.1: s64[]) -> s64[] { - %arg0.1 = s64[] parameter(0), metadata={op_name="XLA_Args"} - // CHECK-NEXT: %0 = "xla.while"(%arg0) {body = @loop, cond = @cond} : (tensor) -> tensor + %arg0.1 = s64[] parameter(0), metadata={op_name="HLO_Args"} + // CHECK-NEXT: %0 = "xla_hlo.while"(%arg0) {body = @loop, cond = @cond} : (tensor) -> tensor // CHECK-NEXT: return %0 : tensor ROOT %while.2 = s64[] while(%arg0.1), body=%loop, condition=%cond -} +} \ No newline 
at end of file diff --git a/tensorflow/compiler/mlir/xla/tests/transpose.mlir b/tensorflow/compiler/mlir/xla/tests/transpose.mlir new file mode 100644 index 00000000000..0ed7e709ed4 --- /dev/null +++ b/tensorflow/compiler/mlir/xla/tests/transpose.mlir @@ -0,0 +1,29 @@ +// RUN: tf-opt %s -split-input-file -canonicalize | FileCheck %s --dump-input=fail + +// CHECK-LABEL: func @remove_noop +// CHECK-SAME: [[ARG:%[a-zA-Z0-9]+]] +func @remove_noop(%arg : tensor<2x3x9x5xi32>) -> tensor<2x3x9x5xi32> { + %0 = "xla_hlo.transpose"(%arg) {permutation = dense<[0, 1, 2, 3]> : tensor<4xi64>}: (tensor<2x3x9x5xi32>) -> tensor<2x3x9x5xi32> + // CHECK-NEXT: return [[ARG]] + return %0 : tensor<2x3x9x5xi32> +} + +// ----- + +// CHECK-LABEL: func @keep_real_transpose +// CHECK-SAME: [[ARG:%[a-zA-Z0-9]+]] +func @keep_real_transpose(%arg : tensor<2x3x9x5xi32>) -> tensor<3x2x5x9xi32> { + // CHECK-NEXT: "xla_hlo.transpose"([[ARG]]) + %0 = "xla_hlo.transpose"(%arg) {permutation = dense<[1, 0, 3, 2]> : tensor<4xi64>}: (tensor<2x3x9x5xi32>) -> tensor<3x2x5x9xi32> + return %0 : tensor<3x2x5x9xi32> +} + +// ----- + +// CHECK-LABEL: func @keep_same_shape_real_transpose +// CHECK-SAME: [[ARG:%[a-zA-Z0-9]+]] +func @keep_same_shape_real_transpose(%arg : tensor<4x4xi32>) -> tensor<4x4xi32> { + // CHECK-NEXT: "xla_hlo.transpose"([[ARG]]) + %0 = "xla_hlo.transpose"(%arg) {permutation = dense<[1, 0]> : tensor<2xi64>}: (tensor<4x4xi32>) -> tensor<4x4xi32> + return %0 : tensor<4x4xi32> +} diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_control_flow.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_control_flow.cc index cf271f42814..b40c89c1f8c 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_control_flow.cc +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_control_flow.cc @@ -16,6 +16,7 @@ limitations under the License. // This file implements logic for lowering XLA dialect to Standard dialect. #include "llvm/ADT/StringSwitch.h" +#include "mlir/Dialect/StandardOps/Ops.h" // TF:local_config_mlir #include "mlir/IR/Block.h" // TF:local_config_mlir #include "mlir/IR/BlockAndValueMapping.h" // TF:local_config_mlir #include "mlir/IR/Builders.h" // TF:local_config_mlir @@ -23,28 +24,27 @@ limitations under the License. #include "mlir/IR/PatternMatch.h" // TF:local_config_mlir #include "mlir/Pass/Pass.h" // TF:local_config_mlir #include "mlir/Pass/PassRegistry.h" // TF:local_config_mlir -#include "mlir/StandardOps/Ops.h" // TF:local_config_mlir -#include "tensorflow/compiler/mlir/xla/ir/xla_ops.h" +#include "tensorflow/compiler/mlir/xla/ir/hlo_ops.h" #include "tensorflow/compiler/mlir/xla/transforms/passes.h" using mlir::PassRegistration; namespace mlir { -namespace XLA { +namespace xla_hlo { namespace { struct LegalizeControlFlow : public mlir::FunctionPass { // Perform the lowering to MLIR control flow. void runOnFunction() override; }; -bool LowerWhileOp(mlir::XLA::WhileOp while_op) { +bool LowerWhileOp(mlir::xla_hlo::WhileOp while_op) { // Converts an xla while loop into control flow. This mostly generates the // right MLIR boilerplate for calling the body / condition functions, then // branching on their results appropriately. 
The operation should look similar // to below: // // - // %0 = "xla.while"(%arg0) {body: @loop, cond: @cond} + // %0 = "xla_hlo.while"(%arg0) {body: @loop, cond: @cond} // auto* opInst = while_op.getOperation(); mlir::OpBuilder builder(while_op); @@ -147,9 +147,14 @@ void LegalizeControlFlow::runOnFunction() { } } } // namespace -} // namespace XLA +} // namespace xla_hlo } // namespace mlir -static PassRegistration legalize_cf_pass( +std::unique_ptr +mlir::xla_hlo::createLegalizeControlFlowPass() { + return std::make_unique(); +} + +static PassRegistration legalize_cf_pass( "xla-legalize-control-flow", "Legalize from XLA control flow to MLIR control flow"); diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc index a10329cea06..00c9c238f1e 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc @@ -17,10 +17,11 @@ limitations under the License. #include +#include "mlir/Dialect/StandardOps/Ops.h" // TF:local_config_mlir #include "mlir/IR/PatternMatch.h" // TF:local_config_mlir #include "mlir/Pass/Pass.h" // TF:local_config_mlir -#include "mlir/StandardOps/Ops.h" // TF:local_config_mlir -#include "tensorflow/compiler/mlir/xla/ir/xla_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/xla/ir/hlo_ops.h" #include "tensorflow/compiler/mlir/xla/transforms/passes.h" using namespace mlir; @@ -32,7 +33,9 @@ struct LegalizeTF : public FunctionPass { }; } // end anonymous namespace -FunctionPassBase *mlir::XLA::createLegalizeTFPass() { return new LegalizeTF(); } +std::unique_ptr mlir::xla_hlo::createLegalizeTFPass() { + return std::make_unique(); +} /// Returns if the given TF data format string is the default format. static bool isDefaultDataFormat(StringRef format) { return format == "NHWC"; } @@ -127,11 +130,11 @@ static ElementsAttr getBroadcastDimensionsAttr(Builder &b, Value *x, Value *y) { } namespace mlir { -namespace XLA { +namespace xla { namespace { #include "tensorflow/compiler/mlir/xla/transforms/generated_legalize_tf.inc" } // end anonymous namespace -} // end namespace XLA +} // end namespace xla } // end namespace mlir /// Perform the lowering to XLA dialect. @@ -140,8 +143,8 @@ void LegalizeTF::runOnFunction() { auto func = getFunction(); // Add the generated patterns to the list. - XLA::populateWithGenerated(func.getContext(), &patterns); - applyPatternsGreedily(func, std::move(patterns)); + xla::populateWithGenerated(func.getContext(), &patterns); + applyPatternsGreedily(func, patterns); } static PassRegistration pass( diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td index 7835fcf9213..1730e5374a4 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td @@ -16,9 +16,9 @@ limitations under the License. // This is the legalization pattern definition file for TF to XLA. 
include "mlir/IR/OpBase.td" -include "mlir/StandardOps/Ops.td" +include "mlir/Dialect/StandardOps/Ops.td" include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td" -include "tensorflow/compiler/mlir/xla/ir/xla_ops.td" +include "tensorflow/compiler/mlir/xla/ir/hlo_ops.td" def NullElementsAttr : NativeCodeCall<"ElementsAttr()">; @@ -30,17 +30,19 @@ def FeatureDimension : NativeCodeCall< "getFeatureDimensionAttr($_builder, $0, $1)">; def FalseBoolAttr : AttrConstraint>; -def : Pattern<(TF_FusedBatchNormOp F32Tensor:$x, F32Tensor:$scale, - F32Tensor:$offset, F32Tensor:$mean, - F32Tensor:$variance, F32Attr:$epsilon, +def : Pattern< + (TF_FusedBatchNormOp:$root $x, $scale, $offset, $mean, $variance, $epsilon, $data_format, FalseBoolAttr:$is_training), - [(XLA_BatchNormInferenceOp $x, $scale, $offset, $mean, $variance, - $epsilon, (FeatureDimension $data_format, $x)), - /*batch_mean=*/(verifyUnusedValue), - /*batch_variance=*/(verifyUnusedValue), - /*reserve_space_1=*/(verifyUnusedValue), - /*reserve_space_2=*/(verifyUnusedValue) - ]>; + [(HLO_BatchNormInferenceOp $x, $scale, $offset, $mean, $variance, + $epsilon, (FeatureDimension $data_format, $x)), + // We already guaranteed that the last four results have no use, so it + // does not matter what value we provide here for replacement. + /*batch_mean=*/(replaceWithValue $x), + /*batch_variance=*/(replaceWithValue $x), + /*reserve_space_1=*/(replaceWithValue $x), + /*reserve_space_2=*/(replaceWithValue $x)], + [(HasNoUseOf:$root__1), (HasNoUseOf:$root__2), + (HasNoUseOf:$root__3), (HasNoUseOf:$root__4)]>; //===----------------------------------------------------------------------===// // Bias op patterns. @@ -60,7 +62,7 @@ def ValidBiasAddFeatureDimension : Constraint< def : Pat<(TF_BiasAddOp IsAtleast3DShapeTensor:$input, Is1DShapeTensor:$bias, TF_ConvnetDataFormatAttr:$data_format), - (XLA_AddOp $input, $bias, + (HLO_AddOp $input, $bias, (BiasAddFeatureDimension $data_format, $input)), [(ValidBiasAddFeatureDimension $data_format, $input, $bias)]>; @@ -76,11 +78,12 @@ class DirectBinaryPat : Pat<(FromOp AnyTensor:$l, AnyTensor:$r), (ToOp $l, $r, (BinBroadcastDimensions $l, $r))>; -foreach fromToBinPair = [[TF_AddOp, XLA_AddOp], - [TF_DivOp, XLA_DivOp], - [TF_MulOp, XLA_MulOp], - [TF_RealDivOp, XLA_DivOp], - [TF_SubOp, XLA_SubOp]] in +foreach fromToBinPair = [[TF_AddOp, HLO_AddOp], + [TF_AddV2Op, HLO_AddOp], + [TF_DivOp, HLO_DivOp], + [TF_MulOp, HLO_MulOp], + [TF_RealDivOp, HLO_DivOp], + [TF_SubOp, HLO_SubOp]] in def : DirectBinaryPat; //===----------------------------------------------------------------------===// @@ -94,7 +97,7 @@ def : Pat<(TF_IdentityOp $op), (replaceWithValue $op)>; //===----------------------------------------------------------------------===// // TODO(riverriddle) Formalize a policy on converting opaque attributes. 
-def : Pat<(TF_ConstOp:$res ElementsAttr:$value), (XLA_ConstOp $value), +def : Pat<(TF_ConstOp:$res ElementsAttr:$value), (HLO_ConstOp $value), [(AnyStaticShapeTensor $res)]>; //===----------------------------------------------------------------------===// @@ -105,11 +108,11 @@ class ConstantSplat : NativeCodeCall< "getSplat($_builder, $0, " # value # ")">; def : Pat<(TF_ReluOp AnyTensor:$input), - (XLA_MaxOp (ConstantOp (ConstantSplat<"0"> $input)), $input, + (HLO_MaxOp (ConstantOp (ConstantSplat<"0"> $input)), $input, (NullElementsAttr))>; def : Pat<(TF_Relu6Op AnyTensor:$input), - (XLA_ClampOp (ConstantOp (ConstantSplat<"0"> $input)), $input, + (HLO_ClampOp (ConstantOp (ConstantSplat<"0"> $input)), $input, (ConstantOp (ConstantSplat<"6"> $input)))>; //===----------------------------------------------------------------------===// @@ -117,7 +120,7 @@ def : Pat<(TF_Relu6Op AnyTensor:$input), //===----------------------------------------------------------------------===// def : Pat<(TF_ReshapeOp:$res AnyStaticShapeTensor:$arg, $ignored), - (XLA_ReshapeOp $arg), [(AnyStaticShapeTensor $res)]>; + (HLO_ReshapeOp $arg), [(AnyStaticShapeTensor $res)]>; def : Pat<(TF_SqueezeOp AnyStaticShapeTensor:$arg, $ignored_dims), - (XLA_ReshapeOp $arg)>; + (HLO_ReshapeOp $arg)>; diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_to_standard.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_to_standard.cc index 4ac42d39f06..934e9f91820 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_to_standard.cc +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_to_standard.cc @@ -16,11 +16,11 @@ limitations under the License. // This file implements logic for lowering XLA dialect to Standard dialect. #include "llvm/ADT/StringSwitch.h" +#include "mlir/Dialect/StandardOps/Ops.h" // TF:local_config_mlir #include "mlir/IR/Function.h" // TF:local_config_mlir #include "mlir/IR/PatternMatch.h" // TF:local_config_mlir #include "mlir/Pass/Pass.h" // TF:local_config_mlir -#include "mlir/StandardOps/Ops.h" // TF:local_config_mlir -#include "tensorflow/compiler/mlir/xla/ir/xla_ops.h" +#include "tensorflow/compiler/mlir/xla/ir/hlo_ops.h" #include "tensorflow/compiler/mlir/xla/transforms/passes.h" using mlir::Builder; @@ -30,13 +30,13 @@ using mlir::OwningRewritePatternList; using mlir::PassRegistration; namespace mlir { -namespace XLA { +namespace xla_hlo { namespace { #include "tensorflow/compiler/mlir/xla/transforms/generated_legalize_to_standard.inc" struct CompareIConvert : public RewritePattern { explicit CompareIConvert(MLIRContext *context) - : RewritePattern("xla.compare", 1, context) {} + : RewritePattern("xla_hlo.compare", 1, context) {} PatternMatchResult matchAndRewrite(Operation *op, PatternRewriter &rewriter) const override { @@ -75,7 +75,7 @@ struct CompareIConvert : public RewritePattern { struct CompareFConvert : public RewritePattern { explicit CompareFConvert(MLIRContext *context) - : RewritePattern("xla.compare", 1, context) {} + : RewritePattern("xla_hlo.compare", 1, context) {} PatternMatchResult matchAndRewrite(Operation *op, PatternRewriter &rewriter) const override { @@ -113,7 +113,7 @@ struct CompareFConvert : public RewritePattern { }; } // end anonymous namespace -} // end namespace XLA +} // end namespace xla_hlo } // end namespace mlir namespace { @@ -123,8 +123,9 @@ struct LegalizeToStandard : public FunctionPass { }; } // end anonymous namespace -FunctionPassBase *mlir::XLA::createLegalizeToStdPass() { - return new LegalizeToStandard(); +std::unique_ptr 
+mlir::xla_hlo::createLegalizeToStdPass() { + return std::make_unique(); } /// Perform the lowering to standard dialect. @@ -132,12 +133,11 @@ void LegalizeToStandard::runOnFunction() { OwningRewritePatternList patterns; auto func = getFunction(); - mlir::XLA::populateWithGenerated(func.getContext(), &patterns); - patterns.push_back( - llvm::make_unique(&getContext())); - patterns.push_back( - llvm::make_unique(&getContext())); - applyPatternsGreedily(func, std::move(patterns)); + mlir::xla_hlo::populateWithGenerated(func.getContext(), &patterns); + patterns + .insert( + &getContext()); + applyPatternsGreedily(func, patterns); } static PassRegistration legalize_pass( diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_to_standard_patterns.td b/tensorflow/compiler/mlir/xla/transforms/legalize_to_standard_patterns.td index 5f03ee6e70d..d0925cc9fb7 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_to_standard_patterns.td +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_to_standard_patterns.td @@ -16,8 +16,8 @@ limitations under the License. // This is the legalization pattern definition file for XLA to StandardOps. include "mlir/IR/OpBase.td" -include "mlir/StandardOps/Ops.td" -include "tensorflow/compiler/mlir/xla/ir/xla_ops.td" +include "mlir/Dialect/StandardOps/Ops.td" +include "tensorflow/compiler/mlir/xla/ir/hlo_ops.td" //===----------------------------------------------------------------------===// // Binary op patterns. @@ -28,37 +28,36 @@ def IsSameSizePred : CPred< "== $1->getType().cast().getShape()">; def IsSameSizeConstraint : Constraint; -def : Pat<(XLA_AddOp XLA_FpTensor:$l, XLA_FpTensor:$r, +def : Pat<(HLO_AddOp HLO_FpTensor:$l, HLO_FpTensor:$r, IsNullAttr:$broadcast_dimensions), (AddFOp $l, $r), [(IsSameSizeConstraint $l, $r)]>; -def : Pat<(XLA_SubOp XLA_FpTensor:$l, XLA_FpTensor:$r, +def : Pat<(HLO_SubOp HLO_FpTensor:$l, HLO_FpTensor:$r, IsNullAttr:$broadcast_dimensions), (SubFOp $l, $r), [(IsSameSizeConstraint $l, $r)]>; -def : Pat<(XLA_MulOp XLA_FpTensor:$l, XLA_FpTensor:$r, +def : Pat<(HLO_MulOp HLO_FpTensor:$l, HLO_FpTensor:$r, IsNullAttr:$broadcast_dimensions), (MulFOp $l, $r), [(IsSameSizeConstraint $l, $r)]>; -def : Pat<(XLA_DivOp XLA_FpTensor:$l, XLA_FpTensor:$r, +def : Pat<(HLO_DivOp HLO_FpTensor:$l, HLO_FpTensor:$r, IsNullAttr:$broadcast_dimensions), (DivFOp $l, $r), [(IsSameSizeConstraint $l, $r)]>; -def : Pat<(XLA_AddOp XLA_IntTensor:$l, XLA_IntTensor:$r, +def : Pat<(HLO_AddOp HLO_IntTensor:$l, HLO_IntTensor:$r, IsNullAttr:$broadcast_dimensions), (AddIOp $l, $r), [(IsSameSizeConstraint $l, $r)]>; -def : Pat<(XLA_SubOp XLA_IntTensor:$l, XLA_IntTensor:$r, +def : Pat<(HLO_SubOp HLO_IntTensor:$l, HLO_IntTensor:$r, IsNullAttr:$broadcast_dimensions), (SubIOp $l, $r), [(IsSameSizeConstraint $l, $r)]>; -def : Pat<(XLA_MulOp XLA_IntTensor:$l, XLA_IntTensor:$r, +def : Pat<(HLO_MulOp HLO_IntTensor:$l, HLO_IntTensor:$r, IsNullAttr:$broadcast_dimensions), (MulIOp $l, $r), [(IsSameSizeConstraint $l, $r)]>; -def : Pat<(XLA_DivOp XLA_IntTensor:$l, XLA_IntTensor:$r, +def : Pat<(HLO_DivOp HLO_IntTensor:$l, HLO_IntTensor:$r, IsNullAttr:$broadcast_dimensions), (DivISOp $l, $r), [(IsSameSizeConstraint $l, $r)]>; - diff --git a/tensorflow/compiler/mlir/xla/transforms/passes.h b/tensorflow/compiler/mlir/xla/transforms/passes.h index 2ed045396e7..3eb97dd6a0f 100644 --- a/tensorflow/compiler/mlir/xla/transforms/passes.h +++ b/tensorflow/compiler/mlir/xla/transforms/passes.h @@ -16,18 +16,23 @@ limitations under the License. 
#ifndef TENSORFLOW_COMPILER_MLIR_XLA_TRANSFORMS_PASSES_H_ #define TENSORFLOW_COMPILER_MLIR_XLA_TRANSFORMS_PASSES_H_ +#include + namespace mlir { class FunctionPassBase; -namespace XLA { +namespace xla_hlo { /// Lowers from TF dialect to XLA dialect. -FunctionPassBase *createLegalizeTFPass(); +std::unique_ptr createLegalizeTFPass(); -// Lowers from XLA dialect to Standard dialect. -FunctionPassBase *createLegalizeToStdPass(); +/// Lowers XLA control flow ops to the Standard dialect. +std::unique_ptr createLegalizeControlFlowPass(); -} // end namespace XLA +/// Lowers from XLA dialect to Standard dialect. +std::unique_ptr createLegalizeToStdPass(); + +} // end namespace xla_hlo } // end namespace mlir #endif // TENSORFLOW_COMPILER_MLIR_XLA_TRANSFORMS_PASSES_H_ diff --git a/tensorflow/compiler/mlir/xla/type_to_shape.cc b/tensorflow/compiler/mlir/xla/type_to_shape.cc index 40c896fef9c..e64182889cb 100644 --- a/tensorflow/compiler/mlir/xla/type_to_shape.cc +++ b/tensorflow/compiler/mlir/xla/type_to_shape.cc @@ -17,7 +17,6 @@ limitations under the License. #include -#include "absl/base/integral_types.h" #include "mlir/IR/AffineMap.h" // TF:local_config_mlir #include "mlir/IR/Diagnostics.h" // TF:local_config_mlir #include "mlir/IR/Location.h" // TF:local_config_mlir @@ -25,11 +24,13 @@ limitations under the License. #include "mlir/Support/DebugStringHelper.h" // TF:local_config_mlir #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/types.h" using mlir::IntegerType; using mlir::MemRefType; using mlir::RankedTensorType; using mlir::VectorType; +using tensorflow::int64; using xla::PrimitiveType; using xla::ShapeUtil; diff --git a/tensorflow/compiler/mlir/xla/type_to_shape_test.cc b/tensorflow/compiler/mlir/xla/type_to_shape_test.cc index 9a77be947d5..57922fe1532 100644 --- a/tensorflow/compiler/mlir/xla/type_to_shape_test.cc +++ b/tensorflow/compiler/mlir/xla/type_to_shape_test.cc @@ -15,20 +15,48 @@ limitations under the License. #include "tensorflow/compiler/mlir/xla/type_to_shape.h" +#include + #include "mlir/IR/Builders.h" // TF:local_config_mlir #include "mlir/IR/MLIRContext.h" // TF:local_config_mlir #include "mlir/IR/StandardTypes.h" // TF:local_config_mlir #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/platform/protobuf.h" using mlir::Builder; using mlir::MLIRContext; -using ::testing::EqualsProto; namespace xla { namespace { +// Simple implementation of a proto matcher comparing string representations. +// Only works as ShapeProto's textual representation is deterministic. 
+class ProtoStringMatcher { + public: + explicit ProtoStringMatcher(const tensorflow::protobuf::Message& expected) + : expected_(expected.SerializeAsString()) {} + + template + bool MatchAndExplain(const Message& p, testing::MatchResultListener*) const { + return p.SerializeAsString() == expected_; + } + + void DescribeTo(::std::ostream* os) const { *os << expected_; } + void DescribeNegationTo(::std::ostream* os) const { + *os << "not equal to expected message: " << expected_; + } + + private: + const std::string expected_; +}; + +inline ::testing::PolymorphicMatcher EqualsProto( + const tensorflow::protobuf::Message& x) { + return ::testing::MakePolymorphicMatcher(ProtoStringMatcher(x)); +} + TEST(TypeToShapeTest, ConvertPrimitiveTypes) { MLIRContext context; Builder b(&context); diff --git a/tensorflow/compiler/mlir/xla/xla_mlir_translate.cc b/tensorflow/compiler/mlir/xla/xla_mlir_translate.cc index 9804858c084..ad7e4724d90 100644 --- a/tensorflow/compiler/mlir/xla/xla_mlir_translate.cc +++ b/tensorflow/compiler/mlir/xla/xla_mlir_translate.cc @@ -15,7 +15,6 @@ limitations under the License. #include "tensorflow/compiler/mlir/xla/xla_mlir_translate.h" -#include "google/protobuf/text_format.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/ToolOutputFile.h" #include "mlir/IR/Module.h" // TF:local_config_mlir @@ -26,6 +25,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/protobuf.h" using stream_executor::port::Status; using stream_executor::port::StatusOr; // NOLINT TODO(b/130822468) fix this @@ -34,13 +34,13 @@ namespace xla { namespace { // Error collector that simply ignores errors reported. 
-class NoOpErrorCollector : public ::proto2::io::ErrorCollector { +class NoOpErrorCollector : public tensorflow::protobuf::io::ErrorCollector { public: void AddError(int line, int column, const string& message) override {} }; bool LoadHloProto(const std::string& contents, HloProto* hlo_proto) { - ::proto2::TextFormat::Parser parser; + tensorflow::protobuf::TextFormat::Parser parser; NoOpErrorCollector collector; parser.RecordErrorsTo(&collector); return hlo_proto->ParseFromString(contents) || @@ -114,8 +114,8 @@ static mlir::LogicalResult MlirHloToHloTranslateFunction( if (!module) return mlir::failure(); std::error_code error; - auto result = llvm::make_unique(output_filename, error, - llvm::sys::fs::F_None); + auto result = std::make_unique(output_filename, error, + llvm::sys::fs::F_None); if (error) { LOG(ERROR) << error.message(); return mlir::failure(); @@ -147,8 +147,8 @@ static mlir::LogicalResult MlirHloToHloTextTranslateFunction( if (!module) return mlir::failure(); std::error_code error; - auto result = llvm::make_unique(output_filename, error, - llvm::sys::fs::F_None); + auto result = std::make_unique(output_filename, error, + llvm::sys::fs::F_None); if (error) { LOG(ERROR) << error.message(); return mlir::failure(); diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD index 15bb0a863d1..307eb1d3213 100644 --- a/tensorflow/compiler/tests/BUILD +++ b/tensorflow/compiler/tests/BUILD @@ -4,7 +4,7 @@ load("//tensorflow/compiler/aot:tfcompile.bzl", "tf_library") load("//tensorflow/compiler/tests:build_defs.bzl", "tf_xla_py_test") load("//tensorflow/compiler/tests:build_defs.bzl", "generate_backend_suites") load( - "//tensorflow/core:platform/default/build_config_root.bzl", + "//tensorflow/core/platform:default/build_config_root.bzl", "tf_cuda_tests_tags", ) @@ -36,7 +36,6 @@ py_library( srcs_version = "PY2AND3", visibility = [":friends"], deps = [ - "//tensorflow/contrib/compiler:compiler_py", "//tensorflow/core:protos_all_py", "//tensorflow/python:array_ops", "//tensorflow/python:client", @@ -46,6 +45,7 @@ py_library( "//tensorflow/python:random_seed", "//tensorflow/python:session", "//tensorflow/python:variables", + "//tensorflow/python/compiler/xla:compiler_py", "//third_party/py/numpy", ], ) @@ -665,6 +665,19 @@ tf_xla_py_test( ], ) +tf_xla_py_test( + name = "matrix_diag_ops_test", + size = "medium", + timeout = "long", + srcs = ["matrix_diag_ops_test.py"], + deps = [ + ":xla_test", + "//tensorflow/python:array_ops", + "//tensorflow/python:dtypes", + "//tensorflow/python:platform_test", + ], +) + tf_xla_py_test( name = "momentum_test", size = "small", @@ -1024,7 +1037,10 @@ tf_xla_py_test( name = "unary_ops_test", size = "medium", srcs = ["unary_ops_test.py"], - tags = ["notap"], # b/136030724 + tags = [ + "noguitar", # TODO(b/140174740): Re-enable when fixed. 
+ "notap", # b/136030724 + ], deps = [ ":xla_test", "//tensorflow/python:array_ops", @@ -1179,6 +1195,7 @@ cuda_py_test( "//tensorflow/python:framework", "//tensorflow/python:math_ops", ], + xla_enable_strict_auto_jit = False, ) cuda_py_test( @@ -1187,7 +1204,7 @@ cuda_py_test( srcs = ["jit_test.py"], additional_deps = [ ":test_utils", - "//tensorflow/contrib/compiler:compiler_py", + "//tensorflow/python/compiler/xla:compiler_py", "//tensorflow/core:protos_all_py", "//tensorflow/python:array_ops", "//tensorflow/python:client", @@ -1204,21 +1221,23 @@ cuda_py_test( "nogpu", "no_cuda_on_cpu_tap", ], + xla_enable_strict_auto_jit = False, ) cuda_py_test( name = "dense_layer_test", - size = "small", + size = "medium", srcs = ["dense_layer_test.py"], additional_deps = [ ":test_utils", - "//tensorflow/contrib/compiler:compiler_py", + "//tensorflow/python/compiler/xla:compiler_py", "//tensorflow/core:protos_all_py", "//tensorflow/python:array_ops", "//tensorflow/python:client_testlib", "//tensorflow/python:layers", "//tensorflow/python:variables", ], + xla_enable_strict_auto_jit = False, ) cc_library( @@ -1304,6 +1323,7 @@ cuda_py_test( "//tensorflow/python:platform", "//tensorflow/python:variables", ], + xla_enable_strict_auto_jit = False, ) # An example of ahead-of-time compilation using tfcompile. The @@ -1382,3 +1402,20 @@ tf_xla_py_test( "@absl_py//absl/testing:parameterized", ], ) + +tf_xla_py_test( + name = "conv_node_name_test", + size = "medium", + srcs = ["conv_node_name_test.py"], + shard_count = 5, + deps = [ + ":xla_test", + "//tensorflow/python:array_ops", + "//tensorflow/python:framework", + "//tensorflow/python:layers", + "//tensorflow/python:nn", + "//tensorflow/python:nn_ops", + "//tensorflow/python:nn_ops_gen", + "//tensorflow/python:platform_test", + ], +) diff --git a/tensorflow/compiler/tests/adagrad_da_test.py b/tensorflow/compiler/tests/adagrad_da_test.py index 369d0097a0f..e08435b5713 100644 --- a/tensorflow/compiler/tests/adagrad_da_test.py +++ b/tensorflow/compiler/tests/adagrad_da_test.py @@ -56,9 +56,9 @@ class AdagradDAOptimizerTest(xla_test.XLATestCase): # Run a step of AdagradDA update.run() - # Let g to be gradient accumulator, gg to be gradient squared - # accumulator, T be the global step, lr is the learning rate, and k the - # initial gradient squared accumulator value. + # Let g be the gradient accumulator, gg be the gradient squared + # accumulator, T be the global step, lr be the learning rate, + # and k the initial gradient squared accumulator value. # w = \dfrac{sign(-g)*lr*|g - l1*T|_{+}}{l2*T*lr + \sqrt{k+gg})} # For -0.1*3.0*(0.1 - 0)/(0 + sqrt(0.1 + 0.1*0.1)) = -0.904534 # similarly for others. 
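A quick plain-numpy sketch of the AdagradDA weight formula quoted in the adagrad_da_test.py comment above. It is independent of the TensorFlow optimizer implementation; the concrete values (lr = 3.0, l1 = l2 = 0, initial accumulator k = 0.1, a single step with gradient 0.1) are assumptions chosen to reproduce the -0.904534 figure, and |g - l1*T|_{+} is read as max(|g| - l1*T, 0).

import numpy as np

def adagrad_da_weight(g, gg, lr, l1, l2, t, k):
    # w = sign(-g) * lr * max(|g| - l1*t, 0) / (l2*t*lr + sqrt(k + gg)),
    # where g and gg are the accumulated gradient and squared gradient after
    # t global steps and k is the initial squared-gradient accumulator value.
    numerator = np.sign(-g) * lr * np.maximum(np.abs(g) - l1 * t, 0.0)
    return numerator / (l2 * t * lr + np.sqrt(k + gg))

# One step with gradient 0.1 under the assumed hyperparameters:
print(adagrad_da_weight(g=0.1, gg=0.01, lr=3.0, l1=0.0, l2=0.0, t=1, k=0.1))
# prints roughly -0.904534, matching the value quoted in the comment above.
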
diff --git a/tensorflow/compiler/tests/binary_ops_test.py b/tensorflow/compiler/tests/binary_ops_test.py index 0171be42148..14af571d62f 100644 --- a/tensorflow/compiler/tests/binary_ops_test.py +++ b/tensorflow/compiler/tests/binary_ops_test.py @@ -28,7 +28,6 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.ops import array_ops from tensorflow.python.ops import bitwise_ops -from tensorflow.python.ops import gen_array_ops from tensorflow.python.ops import gen_math_ops from tensorflow.python.ops import gen_nn_ops from tensorflow.python.ops import math_ops @@ -1464,53 +1463,6 @@ class BinaryOpsTest(xla_test.XLATestCase): np.array([4, 5, 6], dtype=np.int32), expected=None) - def testMatrixSetDiag(self): - # TODO(penporn): Once XLA supports MatrixSetDiagV2, change the call to - # gen_array_ops.matrix_set_diag (V1) to array_ops.matrix_set_diag (V2). - for dtype in self.numeric_types: - # Square - self._testBinary( - gen_array_ops.matrix_set_diag, - np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 1.0], [1.0, 1.0, 1.0]], - dtype=dtype), - np.array([1.0, 2.0, 3.0], dtype=dtype), - expected=np.array([[1.0, 1.0, 0.0], [1.0, 2.0, 1.0], [1.0, 1.0, 3.0]], - dtype=dtype)) - - self._testBinary( - gen_array_ops.matrix_set_diag, - np.array([[[1.0, 0.0, 3.0], [0.0, 2.0, 0.0], [1.0, 0.0, 3.0]], - [[4.0, 0.0, 4.0], [0.0, 5.0, 0.0], [2.0, 0.0, 6.0]]], - dtype=dtype), - np.array([[-1.0, 0.0, -3.0], [-4.0, -5.0, -6.0]], dtype=dtype), - expected=np.array( - [[[-1.0, 0.0, 3.0], [0.0, 0.0, 0.0], [1.0, 0.0, -3.0]], - [[-4.0, 0.0, 4.0], [0.0, -5.0, 0.0], [2.0, 0.0, -6.0]]], - dtype=dtype)) - - # Rectangular - self._testBinary( - gen_array_ops.matrix_set_diag, - np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 1.0]], dtype=dtype), - np.array([3.0, 4.0], dtype=dtype), - expected=np.array([[3.0, 1.0, 0.0], [1.0, 4.0, 1.0]], dtype=dtype)) - - self._testBinary( - gen_array_ops.matrix_set_diag, - np.array([[0.0, 1.0], [1.0, 0.0], [1.0, 1.0]], dtype=dtype), - np.array([3.0, 4.0], dtype=dtype), - expected=np.array([[3.0, 1.0], [1.0, 4.0], [1.0, 1.0]], dtype=dtype)) - - self._testBinary( - gen_array_ops.matrix_set_diag, - np.array([[[1.0, 0.0, 3.0], [0.0, 2.0, 0.0]], - [[4.0, 0.0, 4.0], [0.0, 5.0, 0.0]]], dtype=dtype), - np.array([[-1.0, -2.0], [-4.0, -5.0]], - dtype=dtype), - expected=np.array([[[-1.0, 0.0, 3.0], [0.0, -2.0, 0.0]], - [[-4.0, 0.0, 4.0], [0.0, -5.0, 0.0]]], - dtype=dtype)) - def testBroadcastTo(self): for dtype in self.all_types: x = np.random.randint(0, high=100, size=[2, 3]) diff --git a/tensorflow/compiler/tests/build_defs.bzl b/tensorflow/compiler/tests/build_defs.bzl index 96d389a81f2..a3b17e42fb0 100644 --- a/tensorflow/compiler/tests/build_defs.bzl +++ b/tensorflow/compiler/tests/build_defs.bzl @@ -3,7 +3,7 @@ load("@local_config_cuda//cuda:build_defs.bzl", "cuda_is_configured") load("//tensorflow/compiler/tests:plugin.bzl", "plugins") load( - "//tensorflow/core:platform/default/build_config_root.bzl", + "//tensorflow/core/platform:default/build_config_root.bzl", "tf_cuda_tests_tags", "tf_exec_compatible_with", ) diff --git a/tensorflow/compiler/tests/conv_node_name_test.py b/tensorflow/compiler/tests/conv_node_name_test.py new file mode 100644 index 00000000000..85e8bce8617 --- /dev/null +++ b/tensorflow/compiler/tests/conv_node_name_test.py @@ -0,0 +1,115 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for Convolution node name match via the XLA JIT. + +The canned results in these tests are created by running each test using the +Tensorflow CPU device and saving the output. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.compiler.tests import xla_test +from tensorflow.python.framework import ops +from tensorflow.python.layers import layers +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.platform import googletest + + +class ConvolutionNodeNameTest(xla_test.XLATestCase): + """Verify convolution node name match. + + Verify convolution node names on TPU and CPU match with dilation > 1. + """ + + def _verifyNodeNameMatch(self, layer, input_sizes, filter_sizes, strides, + dilations): + + def _GetNodeNames(use_xla): + with self.session(): + input_tensor = array_ops.placeholder(np.float32, shape=input_sizes) + + if use_xla: + with self.test_scope(): + # pylint: disable=protected-access + graph = ops.get_default_graph() + graph._set_control_flow_context( + control_flow_ops.XLAControlFlowContext()) + # pylint: enable=protected-access + conv2d_op = layer( + filters=64, + kernel_size=filter_sizes, + dilation_rate=dilations, + padding="same") + _ = conv2d_op(input_tensor) + return [n.name for n in ops.get_default_graph().as_graph_def().node] + else: + with ops.device("CPU"): + conv2d_op = layer( + filters=64, + kernel_size=filter_sizes, + dilation_rate=dilations, + padding="same") + _ = conv2d_op(input_tensor) + names = [ + n.name for n in ops.get_default_graph().as_graph_def().node + ] + # filter out space to depth ops. 
+ return [ + name for name in names + if "space" not in name and "Space" not in name + ] + + xla_names = _GetNodeNames(use_xla=True) + no_xla_names = _GetNodeNames(use_xla=False) + self.assertListEqual( + xla_names, + no_xla_names, + ) + + def testConv1DNodeNameMatch(self): + input_sizes = [8, 16, 3] + filter_sizes = [7] + strides = 1 + dilations = [2] + layer = layers.Conv1D + self._verifyNodeNameMatch(layer, input_sizes, filter_sizes, strides, + dilations) + + def testConv2DNodeNameMatch(self): + input_sizes = [8, 16, 16, 3] + filter_sizes = [7, 7] + strides = 1 + dilations = [2, 2] + layer = layers.Conv2D + self._verifyNodeNameMatch(layer, input_sizes, filter_sizes, strides, + dilations) + + def testConv3DNodeNameMatch(self): + input_sizes = [8, 16, 16, 16, 3] + filter_sizes = [7, 7, 7] + strides = 1 + dilations = [2, 2, 2] + layer = layers.Conv3D + self._verifyNodeNameMatch(layer, input_sizes, filter_sizes, strides, + dilations) + + +if __name__ == "__main__": + googletest.main() diff --git a/tensorflow/compiler/tests/dense_layer_test.py b/tensorflow/compiler/tests/dense_layer_test.py index 74f16292334..8020aa28ce4 100644 --- a/tensorflow/compiler/tests/dense_layer_test.py +++ b/tensorflow/compiler/tests/dense_layer_test.py @@ -22,8 +22,8 @@ import os import numpy as np from tensorflow.compiler.tests import test_utils -from tensorflow.contrib.compiler import jit from tensorflow.core.protobuf import config_pb2 +from tensorflow.python.compiler.xla import jit from tensorflow.python.layers import layers from tensorflow.python.ops import array_ops from tensorflow.python.ops import variables diff --git a/tensorflow/compiler/tests/depthwise_conv_op_test.py b/tensorflow/compiler/tests/depthwise_conv_op_test.py index c55bc23cf47..a49985f0446 100644 --- a/tensorflow/compiler/tests/depthwise_conv_op_test.py +++ b/tensorflow/compiler/tests/depthwise_conv_op_test.py @@ -25,6 +25,7 @@ from tensorflow.compiler.tests import xla_test from tensorflow.python.framework import constant_op from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops +from tensorflow.python.ops import nn_impl from tensorflow.python.ops import nn_ops import tensorflow.python.ops.nn_grad # pylint: disable=unused-import from tensorflow.python.platform import test @@ -87,6 +88,32 @@ def ConfigsToTest(): yield i, f, o, s, p +def ConfigsWithDilationsToTest(): + """Iterator for different convolution shapes, strides and paddings. + + Yields: + Tuple (input_size, filter_size, out_size, stride, dilation, padding), the + depthwise + convolution parameters. + """ + input_sizes = [[4, 6, 6, 48], [4, 8, 8, 84], [4, 36, 36, 2], [4, 148, 148, 2], + [3, 300, 300, 3]] + filter_sizes = [[1, 1, 48, 2], [1, 3, 84, 1], [5, 5, 2, 1], [4, 4, 2, 8], + [2, 2, 3, 8]] + out_sizes = [[4, 6, 6, 96], [4, 8, 8, 84], [4, 36, 36, 2], [4, 74, 74, 16], + [3, 296, 296, 24]] + strides = [1, 1, 2, 2, 1] + dilations = [2, 2, 4, 2, 4] + # pylint: disable=invalid-name + VALID = "VALID" + SAME = "SAME" + # pylint: enable=invalid-name + paddings = [SAME, SAME, SAME, SAME, VALID] + for i, f, o, s, d, p in zip(input_sizes, filter_sizes, out_sizes, strides, + dilations, paddings): + yield i, f, o, s, d, p + + def CheckGradConfigsToTest(): """Iterator for different convolution shapes, strides and paddings. @@ -315,6 +342,118 @@ class DepthwiseConv2DTest(xla_test.XLATestCase): padding="VALID", expected=expected_output) + # This is testing that depthwise_conv2d with dilation produces + # the same results between CPU and TPU. 
It also tests that NCHW + # and NWHC formats agree. + def _VerifyValuesWithDilation(self, + tensor_in_sizes, + filter_in_sizes, + stride, + dilation, + padding, + data_type, + data_format="NHWC"): + """Verifies the output values of the convolution function. + + Args: + tensor_in_sizes: Input tensor dimensions in [batch, input_rows, + input_cols, input_depth]. + filter_in_sizes: Filter tensor dimensions in [filter_rows, filter_cols, + input_depth, depth_multiplier]. + stride: Stride. + dilation: Dilation. + padding: Padding type. + data_type: The data type to use. + data_format: The data_format of the input. "NHWC" or "NCHW". + """ + total_size_1 = 1 + total_size_2 = 1 + for s in tensor_in_sizes: + total_size_1 *= s + for s in filter_in_sizes: + total_size_2 *= s + # Initializes the input and filter tensor with numbers incrementing from 1. + x1 = np.array([f * 1.0 for f in range(1, total_size_1 + 1)], + dtype=data_type).reshape(tensor_in_sizes) + x2 = np.array([f * 1.0 for f in range(1, total_size_2 + 1)], + dtype=data_type).reshape(filter_in_sizes) + with self.session() as sess: + if data_type == np.float32: + # TODO(b/64210055): Tolerance for TPU is high. + tolerance = 1e-2 + else: + self.assertEqual(data_type, np.float64) + tolerance = 1e-8 + + t1 = array_ops.placeholder(shape=tensor_in_sizes, dtype=data_type) + t2 = array_ops.placeholder(shape=filter_in_sizes, dtype=data_type) + + native_t1 = t1 + strides = [1, stride, stride, 1] + dilations = [dilation, dilation] + if data_format == "NCHW": + # Transpose from NWHC input to NCHW + # Ex. [4, 5, 5, 48] to [4, 48, 5, 5] + native_t1 = array_ops.transpose(t1, [0, 3, 1, 2]) + strides = [1, 1, stride, stride] + + with self.test_scope(): + conv_native = nn_impl.depthwise_conv2d( + native_t1, + t2, + strides=strides, + rate=dilations, + data_format=data_format, + padding=padding) + + if data_format == "NCHW": + # Transpose back from NCHW to NHWC + conv_native = array_ops.transpose(conv_native, [0, 2, 3, 1]) + + with ops.device("CPU"): + # CPU only support NHWC format + strides = [1, stride, stride, 1] + conv_interface = nn_impl.depthwise_conv2d( + t1, t2, strides=strides, rate=dilations, padding=padding) + + native_result = sess.run(conv_native, {t1: x1, t2: x2}) + interface_result = sess.run(conv_interface, {t1: x1, t2: x2}) + + print("data_type:", data_type, "max diff = ", + np.amax(np.absolute(native_result - interface_result))) + self.assertAllClose( + np.ravel(native_result), np.ravel(interface_result), rtol=tolerance) + + def testDilationDepthwiseConv2DWith(self): + for index, (input_size, filter_size, _, stride, dilation, + padding) in enumerate(ConfigsWithDilationsToTest()): + print("Testing DilationDepthwiseConv2D,", index, "th config:", input_size, + "*", filter_size, "stride:", stride, "dilation: ", dilation, + "padding:", padding) + for data_type in self.float_types: + # TODO(phawkins): the reference implementation only supports float32. + if data_type == np.float32: + self._VerifyValuesWithDilation(input_size, filter_size, stride, + dilation, padding, data_type) + + def testDilationDepthwiseConv2DWithFormat(self): + for index, (input_size, filter_size, _, stride, dilation, + padding) in enumerate(ConfigsWithDilationsToTest()): + print("Testing DilationDepthwiseConv2DFormat,", index, "th config:", + input_size, "*", filter_size, "stride:", stride, "dilation:", + dilation, "padding:", padding) + for data_type in self.float_types: + # TODO(phawkins): the reference implementation only supports float32. 
+ if data_type == np.float32: + self._VerifyValuesWithDilation( + input_size, + filter_size, + stride, + dilation, + padding, + data_type, + data_format="NCHW") + def _CompareBackpropInput(self, input_sizes, filter_sizes, output_sizes, stride, padding): x1 = np.random.rand(*filter_sizes).astype(np.float32) @@ -420,5 +559,139 @@ class DepthwiseConv2DTest(xla_test.XLATestCase): padding, data_format="NCHW") + def _CompareBackpropInputWithDilation(self, input_sizes, filter_sizes, + output_sizes, stride, dilation, + padding): + x1 = np.random.rand(*filter_sizes).astype(np.float32) + x2 = np.random.rand(*output_sizes).astype(np.float32) + + def _GetVal(use_xla): + with self.session(): + t1 = array_ops.placeholder(np.float32, shape=filter_sizes) + t2 = array_ops.placeholder(np.float32, shape=output_sizes) + if use_xla: + with self.test_scope(): + t0 = constant_op.constant(input_sizes, shape=[len(input_sizes)]) + backprop = nn_ops.depthwise_conv2d_native_backprop_input( + t0, + t1, + t2, + strides=[1, stride, stride, 1], + dilations=[1, dilation, dilation, 1], + padding=padding) + else: + # TODO(wangtao): figure out gradient with stride > 1. + # depthwise_conv2d_native_backprop_input on CPU doesn't support + # dilation. + t3 = array_ops.space_to_batch( + t2, block_size=dilation, paddings=[[0, 0], [0, 0]]) + input_sizes_transform = [ + input_sizes[0] * dilation * dilation, input_sizes[1] // dilation, + input_sizes[2] // dilation, input_sizes[3] + ] + t0 = constant_op.constant( + input_sizes_transform, shape=[len(input_sizes)]) + backprop_naive = nn_ops.depthwise_conv2d_native_backprop_input( + t0, t1, t3, strides=[1, stride, stride, 1], padding=padding) + backprop = array_ops.batch_to_space( + backprop_naive, [[0, 0], [0, 0]], block_size=dilation) + + ret = backprop.eval({t1: x1, t2: x2}) + self.assertShapeEqual(ret, backprop) + return ret + + gpu_value = _GetVal(use_xla=True) + cpu_value = _GetVal(use_xla=False) + + # TODO (b/64210055): Tolerance for TPU is high. + self.assertAllClose(cpu_value, gpu_value, rtol=1e-2, atol=1e-3) + + def testDilationDepthwiseConv2DInputGradWithCompare(self): + for index, (input_size, filter_size, output_size, stride, dilation, + padding) in enumerate(ConfigsWithDilationsToTest()): + print("Testing DilationDepthwiseConv2DInputGradWithDilationCompare,", + index, "th config:", input_size, "*", filter_size, "stride:", + stride, "dilation:", dilation, "padding:", padding) + # TODO(wangtao): implement CPU grad computation with stride > 1. + if stride == 1: + self._CompareBackpropInputWithDilation(input_size, filter_size, + output_size, stride, dilation, + padding) + + def _CompareBackpropFilterWithDilation(self, + input_sizes, + filter_sizes, + output_sizes, + stride, + dilation, + padding, + data_format="NHWC"): + x0 = np.random.rand(*input_sizes).astype(np.float32) + x2 = np.random.rand(*output_sizes).astype(np.float32) + + def _GetVal(use_xla): + with self.session(): + t0 = array_ops.placeholder(np.float32, shape=input_sizes) + t1 = constant_op.constant(filter_sizes, shape=[len(filter_sizes)]) + t2 = array_ops.placeholder(np.float32, shape=output_sizes) + native_t0 = t0 + native_t2 = t2 + strides = [1, stride, stride, 1] + dilations = [1, dilation, dilation, 1] + + if use_xla: + if data_format == "NCHW": + # Transpose from NWHC input to NCHW + # Ex. 
[4, 5, 5, 48] to [4, 48, 5, 5] + native_t0 = array_ops.transpose(t0, [0, 3, 1, 2]) + native_t2 = array_ops.transpose(t2, [0, 3, 1, 2]) + strides = [1, 1, stride, stride] + dilations = [1, 1, dilation, dilation] + with self.test_scope(): + backprop = nn_ops.depthwise_conv2d_native_backprop_filter( + native_t0, + t1, + native_t2, + strides=strides, + padding=padding, + dilations=dilations, + data_format=data_format) + else: + # For CPU, the format NCHW is not supported. Therefore we always use + # NHWC here. + # depthwise_conv2d_native_backprop_filter on CPU doesn't support + # dilation. + native_t3 = array_ops.space_to_batch( + native_t2, block_size=dilation, paddings=[[0, 0], [0, 0]]) + native_t0_transform = array_ops.space_to_batch( + native_t0, block_size=dilation, paddings=[[0, 0], [0, 0]]) + backprop = nn_ops.depthwise_conv2d_native_backprop_filter( + native_t0_transform, + t1, + native_t3, + strides=strides, + padding=padding) + ret = backprop.eval({t0: x0, t2: x2}) + self.assertShapeEqual(ret, backprop) + return ret + + gpu_value = _GetVal(use_xla=True) + cpu_value = _GetVal(use_xla=False) + # TODO(b/64210055): Tolerance for TPU is high. + self.assertAllClose(cpu_value, gpu_value, rtol=1e-3, atol=1e-4) + + def testDilationDepthwiseConv2DFilterGradCompare(self): + for index, (input_size, filter_size, output_size, stride, dilation, + padding) in enumerate(ConfigsWithDilationsToTest()): + print("Testing DilationDepthwiseConv2DFilterGradCompare,", index, + "th config:", input_size, "*", filter_size, "producing output", + output_size, "stride:", stride, "dilation:", dilation, "padding:", + padding) + if stride == 1: + # TODO(wangtao): implement CPU grad computation with stride > 1. + self._CompareBackpropFilterWithDilation(input_size, filter_size, + output_size, stride, dilation, + padding) + if __name__ == "__main__": test.main() diff --git a/tensorflow/compiler/tests/eager_test.py b/tensorflow/compiler/tests/eager_test.py index d2c459bf1ec..a03980f20ba 100644 --- a/tensorflow/compiler/tests/eager_test.py +++ b/tensorflow/compiler/tests/eager_test.py @@ -693,8 +693,7 @@ class EagerFunctionTest(xla_test.XLATestCase): return x, y wholly_compiled_f = def_function.function(f) - op_by_op_f = function.defun_with_attributes( - f, attributes={'_XlaCompile': False}) + op_by_op_f = def_function.function(f, experimental_compile=False) x = constant_op.constant([0.0, 2.0], name='data') diff --git a/tensorflow/compiler/tests/image_ops_test.py b/tensorflow/compiler/tests/image_ops_test.py index fb4b2711905..5889a011296 100644 --- a/tensorflow/compiler/tests/image_ops_test.py +++ b/tensorflow/compiler/tests/image_ops_test.py @@ -514,6 +514,27 @@ class ResizeNearestNeighborTest(xla_test.XLATestCase): [7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9]], dtype=np.float32)) + def testAlignCorners3x3To12x12_uint8(self): + # TODO(b/72099414): enable the test for TPU when the issue is fixed. 
+ if (self.device not in ["XLA_GPU", "XLA_CPU"]): + return + # Ensure that resize with convolution works on XLA/GPU for integer types + self._assertForwardOpMatchesExpected( + np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.uint8), [12, 12], + expected=np.array([[1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3], + [1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3], + [1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3], + [4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6], + [4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6], + [4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6], + [4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6], + [4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6], + [4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6], + [7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9], + [7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9], + [7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9]], + dtype=np.uint8)) + class ResizeBilinearTest(parameterized.TestCase, xla_test.XLATestCase): diff --git a/tensorflow/compiler/tests/jit_test.py b/tensorflow/compiler/tests/jit_test.py index 29444c19014..109a7932c20 100644 --- a/tensorflow/compiler/tests/jit_test.py +++ b/tensorflow/compiler/tests/jit_test.py @@ -22,10 +22,10 @@ import os import numpy as np from tensorflow.compiler.tests import test_utils -from tensorflow.contrib.compiler import jit from tensorflow.core.protobuf import config_pb2 from tensorflow.core.protobuf import rewriter_config_pb2 from tensorflow.python.client import session as session_lib +from tensorflow.python.compiler.xla import jit from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import function diff --git a/tensorflow/compiler/tests/matrix_diag_ops_test.py b/tensorflow/compiler/tests/matrix_diag_ops_test.py new file mode 100644 index 00000000000..6437c2749af --- /dev/null +++ b/tensorflow/compiler/tests/matrix_diag_ops_test.py @@ -0,0 +1,655 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for XLA matrix diag ops.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.compiler.tests import xla_test +from tensorflow.python.compat import compat +from tensorflow.python.ops import array_ops +from tensorflow.python.platform import googletest + + +# Test cases shared by MatrixDiagV2, MatrixDiagPartV2, and MatrixSetDiagV2. 
+# Copied from //third_party/tensorflow/python/kernel_tests/diag_op_test.py +def square_cases(): + # pyformat: disable + mat = np.array([[[1, 2, 3, 4, 5], + [6, 7, 8, 9, 1], + [3, 4, 5, 6, 7], + [8, 9, 1, 2, 3], + [4, 5, 6, 7, 8]], + [[9, 1, 2, 3, 4], + [5, 6, 7, 8, 9], + [1, 2, 3, 4, 5], + [6, 7, 8, 9, 1], + [2, 3, 4, 5, 6]]]) + tests = dict() + # tests[d_lower, d_upper] = (compact_diagonals, padded_diagnals) + tests[-1, -1] = (np.array([[6, 4, 1, 7], + [5, 2, 8, 5]]), + np.array([[[0, 0, 0, 0, 0], + [6, 0, 0, 0, 0], + [0, 4, 0, 0, 0], + [0, 0, 1, 0, 0], + [0, 0, 0, 7, 0]], + [[0, 0, 0, 0, 0], + [5, 0, 0, 0, 0], + [0, 2, 0, 0, 0], + [0, 0, 8, 0, 0], + [0, 0, 0, 5, 0]]])) + tests[-4, -3] = (np.array([[[8, 5], + [4, 0]], + [[6, 3], + [2, 0]]]), + np.array([[[0, 0, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0], + [8, 0, 0, 0, 0], + [4, 5, 0, 0, 0]], + [[0, 0, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0], + [6, 0, 0, 0, 0], + [2, 3, 0, 0, 0]]])) + tests[-2, 1] = (np.array([[[2, 8, 6, 3, 0], + [1, 7, 5, 2, 8], + [6, 4, 1, 7, 0], + [3, 9, 6, 0, 0]], + [[1, 7, 4, 1, 0], + [9, 6, 3, 9, 6], + [5, 2, 8, 5, 0], + [1, 7, 4, 0, 0]]]), + np.array([[[1, 2, 0, 0, 0], + [6, 7, 8, 0, 0], + [3, 4, 5, 6, 0], + [0, 9, 1, 2, 3], + [0, 0, 6, 7, 8]], + [[9, 1, 0, 0, 0], + [5, 6, 7, 0, 0], + [1, 2, 3, 4, 0], + [0, 7, 8, 9, 1], + [0, 0, 4, 5, 6]]])) + tests[2, 4] = (np.array([[[5, 0, 0], + [4, 1, 0], + [3, 9, 7]], + [[4, 0, 0], + [3, 9, 0], + [2, 8, 5]]]), + np.array([[[0, 0, 3, 4, 5], + [0, 0, 0, 9, 1], + [0, 0, 0, 0, 7], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0]], + [[0, 0, 2, 3, 4], + [0, 0, 0, 8, 9], + [0, 0, 0, 0, 5], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0]]])) + # pyformat: enable + return (mat, tests) + + +def tall_cases(): + # pyformat: disable + mat = np.array([[[1, 2, 3], + [4, 5, 6], + [7, 8, 9], + [9, 8, 7], + [6, 5, 4]], + [[3, 2, 1], + [1, 2, 3], + [4, 5, 6], + [7, 8, 9], + [9, 8, 7]]]) + tests = dict() + # tests[d_lower, d_upper] = (compact_diagonals, padded_diagnals) + tests[0, 0] = (np.array([[1, 5, 9], + [3, 2, 6]]), + np.array([[[1, 0, 0], + [0, 5, 0], + [0, 0, 9], + [0, 0, 0]], + [[3, 0, 0], + [0, 2, 0], + [0, 0, 6], + [0, 0, 0]]])) + tests[-4, -3] = (np.array([[[9, 5], + [6, 0]], + [[7, 8], + [9, 0]]]), + np.array([[[0, 0, 0], + [0, 0, 0], + [0, 0, 0], + [9, 0, 0], + [6, 5, 0]], + [[0, 0, 0], + [0, 0, 0], + [0, 0, 0], + [7, 0, 0], + [9, 8, 0]]])) + tests[-2, -1] = (np.array([[[4, 8, 7], + [7, 8, 4]], + [[1, 5, 9], + [4, 8, 7]]]), + np.array([[[0, 0, 0], + [4, 0, 0], + [7, 8, 0], + [0, 8, 7], + [0, 0, 4]], + [[0, 0, 0], + [1, 0, 0], + [4, 5, 0], + [0, 8, 9], + [0, 0, 7]]])) + tests[-2, 1] = (np.array([[[2, 6, 0], + [1, 5, 9], + [4, 8, 7], + [7, 8, 4]], + [[2, 3, 0], + [3, 2, 6], + [1, 5, 9], + [4, 8, 7]]]), + np.array([[[1, 2, 0], + [4, 5, 6], + [7, 8, 9], + [0, 8, 7], + [0, 0, 4]], + [[3, 2, 0], + [1, 2, 3], + [4, 5, 6], + [0, 8, 9], + [0, 0, 7]]])) + tests[1, 2] = (np.array([[[3, 0], + [2, 6]], + [[1, 0], + [2, 3]]]), + np.array([[[0, 2, 3], + [0, 0, 6], + [0, 0, 0], + [0, 0, 0], + [0, 0, 0]], + [[0, 2, 1], + [0, 0, 3], + [0, 0, 0], + [0, 0, 0], + [0, 0, 0]]])) + # pyformat: enable + return (mat, tests) + + +def fat_cases(): + # pyformat: disable + mat = np.array([[[1, 2, 3, 4], + [5, 6, 7, 8], + [9, 1, 2, 3]], + [[4, 5, 6, 7], + [8, 9, 1, 2], + [3, 4, 5, 6]]]) + tests = dict() + # tests[d_lower, d_upper] = (compact_diagonals, padded_diagnals) + tests[0, 0] = (np.array([[1, 6, 2], + [4, 9, 5]]), + np.array([[[1, 0, 0, 0], + [0, 6, 0, 0], + [0, 0, 2, 0]], + [[4, 0, 0, 0], + [0, 9, 0, 0], + [0, 0, 
5, 0]]])) + tests[2, 2] = (np.array([[3, 8], + [6, 2]]), + np.array([[[0, 0, 3, 0], + [0, 0, 0, 8], + [0, 0, 0, 0]], + [[0, 0, 6, 0], + [0, 0, 0, 2], + [0, 0, 0, 0]]])) + tests[-2, 0] = (np.array([[[1, 6, 2], + [5, 1, 0], + [9, 0, 0]], + [[4, 9, 5], + [8, 4, 0], + [3, 0, 0]]]), + np.array([[[1, 0, 0, 0], + [5, 6, 0, 0], + [9, 1, 2, 0]], + [[4, 0, 0, 0], + [8, 9, 0, 0], + [3, 4, 5, 0]]])) + tests[-1, 1] = (np.array([[[2, 7, 3], + [1, 6, 2], + [5, 1, 0]], + [[5, 1, 6], + [4, 9, 5], + [8, 4, 0]]]), + np.array([[[1, 2, 0, 0], + [5, 6, 7, 0], + [0, 1, 2, 3]], + [[4, 5, 0, 0], + [8, 9, 1, 0], + [0, 4, 5, 6]]])) + tests[0, 3] = (np.array([[[4, 0, 0], + [3, 8, 0], + [2, 7, 3], + [1, 6, 2]], + [[7, 0, 0], + [6, 2, 0], + [5, 1, 6], + [4, 9, 5]]]), + np.array([[[1, 2, 3, 4], + [0, 6, 7, 8], + [0, 0, 2, 3]], + [[4, 5, 6, 7], + [0, 9, 1, 2], + [0, 0, 5, 6]]])) + # pyformat: enable + return (mat, tests) + + +class MatrixDiagTest(xla_test.XLATestCase): + + def _assertOpOutputMatchesExpected(self, + params, + solution, + rtol=1e-3, + atol=1e-5): + """Verifies that matrix_diag produces `solution` when fed `params`. + + Args: + params: dictionary containing input parameters to matrix_diag. + solution: numpy array representing the expected output of matrix_diag. + rtol: relative tolerance for equality test. + atol: absolute tolerance for equality test. + """ + diagonal = params["diagonal"] + with self.session() as session: + for dtype in self.numeric_types - {np.int8, np.uint8}: + expected = solution.astype(dtype) + with self.test_scope(): + params["diagonal"] = array_ops.placeholder( + dtype, diagonal.shape, name="diagonal") + output = array_ops.matrix_diag(**params) + result = session.run(output, + {params["diagonal"]: diagonal.astype(dtype)}) + self.assertEqual(output.dtype, expected.dtype) + self.assertAllCloseAccordingToType( + expected, result, rtol=rtol, atol=atol, bfloat16_rtol=0.03) + + # Generic tests applicable to both v1 and v2 ops. + # Originally from unary_ops_tests.py. + def testV1(self): + # pyformat: disable + vecs1 = np.array([[1, 2], + [3, 4]]) + solution1 = np.array([[[1, 0], [0, 2]], + [[3, 0], [0, 4]]]) + vecs2 = np.array([1, 2, 3, 4]) + solution2 = np.array([[1, 0, 0, 0], + [0, 2, 0, 0], + [0, 0, 3, 0], + [0, 0, 0, 4]]) + vecs3 = np.array([[[1, 2, 3], + [4, 5, 6]], + [[7, 8, 9], # pylint: disable=bad-whitespace + [10, 11, 12]]]) + solution3 = np.array([[[[1, 0, 0], + [0, 2, 0], + [0, 0, 3]], + [[4, 0, 0], + [0, 5, 0], + [0, 0, 6]]], + [[[7, 0, 0], + [0, 8, 0], + [0, 0, 9]], + [[10, 0, 0], + [0, 11, 0], + [0, 0, 12]]]]) + # pyformat: enable + self._assertOpOutputMatchesExpected({"diagonal": vecs1}, solution1) + self._assertOpOutputMatchesExpected({"diagonal": vecs2}, solution2) + self._assertOpOutputMatchesExpected({"diagonal": vecs3}, solution3) + + # From here onwards are v2-only tests. 
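The fixtures above pair each band `(d_lower, d_upper)` with a compact form (diagonals stacked into rows, upper-most diagonal first, shorter diagonals zero-padded) and a padded form (the full matrix with zeros off the band). As a rough reference for what the op is expected to produce for a single diagonal index, here is a minimal NumPy sketch; `matrix_diag_reference` is a hypothetical helper written only for illustration and is not part of this test file:

```python
import numpy as np

def matrix_diag_reference(diagonal, k, num_rows, num_cols, padding_value=0):
  """Places a 1-D `diagonal` onto band `k` of a num_rows x num_cols matrix."""
  out = np.full((num_rows, num_cols), padding_value, dtype=diagonal.dtype)
  for i, v in enumerate(diagonal):
    # k >= 0 is a superdiagonal (shift the column), k < 0 a subdiagonal
    # (shift the row).
    r, c = (i, i + k) if k >= 0 else (i - k, i)
    out[r, c] = v
  return out

print(matrix_diag_reference(np.array([1, 2, 3]), k=1, num_rows=4, num_cols=4,
                            padding_value=9))
# [[9 1 9 9]
#  [9 9 2 9]
#  [9 9 9 3]
#  [9 9 9 9]]
```

The v2 tests below drive `array_ops.matrix_diag` with the same keyword arguments exercised here (`diagonal`, `k`, `num_rows`, `num_cols`, `padding_value`) and compare the result against the padded fixtures.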
+ def testSquare(self): + # LINT.IfChange + if compat.forward_compatible(2019, 8, 31): + # LINT.ThenChange(//tensorflow/python/ops/array_ops.py) + for _, tests in [square_cases()]: + for diag_index, (vecs, solution) in tests.items(): + self._assertOpOutputMatchesExpected( + { + "diagonal": vecs[0], + "k": diag_index + }, solution[0]) + + def testSquareBatch(self): + # LINT.IfChange + if compat.forward_compatible(2019, 8, 31): + # LINT.ThenChange(//tensorflow/python/ops/array_ops.py) + for _, tests in [square_cases()]: + for diag_index, (vecs, solution) in tests.items(): + self._assertOpOutputMatchesExpected( + { + "diagonal": vecs, + "k": diag_index + }, solution) + + def testRectangularBatch(self): + # LINT.IfChange + if not compat.forward_compatible(2019, 8, 31): + # LINT.ThenChange(//tensorflow/python/ops/array_ops.py) + return + + # Stores expected num_rows and num_cols (when the other is given). + # expected[(d_lower, d_upper)] = (expected_num_rows, expected_num_cols) + test_list = list() + + # Square cases: + expected = { + (-1, -1): (5, 4), + (-4, -3): (5, 2), + (-2, 1): (5, 5), + (2, 4): (3, 5), + } + test_list.append((expected, square_cases())) + + # Tall cases + expected = { + (0, 0): (3, 3), + (-4, -3): (5, 2), + (-2, -1): (4, 3), + (-2, 1): (3, 3), + (1, 2): (2, 3) + } + test_list.append((expected, tall_cases())) + + # Fat cases + expected = { + (2, 2): (2, 4), + (-2, 0): (3, 3), + (-1, 1): (3, 3), + (0, 3): (3, 3) + } + test_list.append((expected, fat_cases())) + + # Giving both num_rows and num_cols + for _, tests in [tall_cases(), fat_cases()]: + for diag_index, (vecs, solution) in tests.items(): + self._assertOpOutputMatchesExpected( + { + "diagonal": vecs, + "k": diag_index, + "num_rows": solution.shape[-2], + "num_cols": solution.shape[-1] + }, solution) + + # Giving just num_rows or num_cols. + for expected, (_, tests) in test_list: + for diag_index, (new_num_rows, new_num_cols) in expected.items(): + vecs, solution = tests[diag_index] + solution_given_num_rows = solution.take( + indices=range(new_num_cols), axis=-1) + self._assertOpOutputMatchesExpected( + { + "diagonal": vecs, + "k": diag_index, + "num_rows": solution_given_num_rows.shape[-2] + }, solution_given_num_rows) + solution_given_num_cols = solution.take( + indices=range(new_num_rows), axis=-2) + self._assertOpOutputMatchesExpected( + { + "diagonal": vecs, + "k": diag_index, + "num_cols": solution_given_num_cols.shape[-1] + }, solution_given_num_cols) + + def testPadding(self): + # LINT.IfChange + if compat.forward_compatible(2019, 8, 31): + # LINT.ThenChange(//tensorflow/python/ops/array_ops.py) + for padding_value in [555, -11]: + for _, tests in [square_cases(), tall_cases(), fat_cases()]: + for diag_index, (vecs, solution) in tests.items(): + mask = (solution == 0) + solution = solution + (mask * padding_value) + self._assertOpOutputMatchesExpected( + { + "diagonal": vecs, + "k": diag_index, + "num_rows": solution.shape[-2], + "num_cols": solution.shape[-1], + "padding_value": padding_value + }, solution) + + +class MatrixSetDiagTest(xla_test.XLATestCase): + + def _assertOpOutputMatchesExpected(self, + params, + solution, + rtol=1e-3, + atol=1e-5): + """Verifies that matrix_set_diag produces `solution` when fed `params`. + + Args: + params: dictionary containing input parameters to matrix_set_diag. + solution: numpy array representing the expected output of matrix_set_diag. + rtol: relative tolerance for equality test. + atol: absolute tolerance for equality test. 
+ """ + input = params["input"] # pylint: disable=redefined-builtin + diagonal = params["diagonal"] + with self.session() as session: + for dtype in self.numeric_types - {np.int8, np.uint8}: + expected = solution.astype(dtype) + with self.test_scope(): + params["input"] = array_ops.placeholder( + dtype, input.shape, name="input") + params["diagonal"] = array_ops.placeholder( + dtype, diagonal.shape, name="diagonal") + output = array_ops.matrix_set_diag(**params) + result = session.run( + output, { + params["input"]: input.astype(dtype), + params["diagonal"]: diagonal.astype(dtype) + }) + self.assertEqual(output.dtype, expected.dtype) + self.assertAllCloseAccordingToType( + expected, result, rtol=rtol, atol=atol, bfloat16_rtol=0.03) + + # Generic tests applicable to both v1 and v2 ops. + # Originally from binary_ops_tests.py. + def testV1(self): + test_cases = list() + + # pyformat: disable + # pylint: disable=bad-whitespace + # Square cases. + input = np.array([[0, 1, 0], # pylint: disable=redefined-builtin + [1, 0, 1], + [1, 1, 1]]) + diag = np.array([1, 2, 3]) + solution = np.array([[1, 1, 0], + [1, 2, 1], + [1, 1, 3]]) + test_cases.append(({"input": input, "diagonal": diag}, solution)) + + input = np.array([[[1, 0, 3], + [0, 2, 0], + [1, 0, 3]], + [[4, 0, 4], + [0, 5, 0], + [2, 0, 6]]]) + diag = np.array([[-1, 0, -3], + [-4, -5, -6]]) + solution = np.array([[[-1, 0, 3], + [ 0, 0, 0], + [ 1, 0, -3]], + [[-4, 0, 4], + [ 0, -5, 0], + [ 2, 0, -6]]]) + test_cases.append(({"input": input, "diagonal": diag}, solution)) + + # Rectangular cases. + input = np.array([[0, 1, 0], + [1, 0, 1]]) + diag = np.array([3, 4]) + solution = np.array([[3, 1, 0], + [1, 4, 1]]) + test_cases.append(({"input": input, "diagonal": diag}, solution)) + + input = np.array([[0, 1], + [1, 0], + [1, 1]]) + diag = np.array([3, 4]) + solution = np.array([[3, 1], + [1, 4], + [1, 1]]) + test_cases.append(({"input": input, "diagonal": diag}, solution)) + + input = np.array([[[1, 0, 3], + [0, 2, 0]], + [[4, 0, 4], + [0, 5, 0]]]) + diag = np.array([[-1, -2], [-4, -5]]) + solution = np.array([[[-1, 0, 3], + [ 0, -2, 0]], + [[-4, 0, 4], + [ 0, -5, 0]]]) + test_cases.append(({"input": input, "diagonal": diag}, solution)) + # pylint: enable=bad-whitespace + # pyformat: enable + + for test in test_cases: + self._assertOpOutputMatchesExpected(test[0], test[1]) + + # From here onwards are v2-only tests. 
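For readers unfamiliar with the op, `matrix_set_diag` keeps the input matrix and only overwrites the requested diagonal band. A minimal NumPy sketch of the single-diagonal case, matching the first rectangular v1 case above; `matrix_set_diag_reference` is an illustrative name, not part of the test file:

```python
import numpy as np

def matrix_set_diag_reference(matrix, diagonal, k=0):
  """Returns a copy of `matrix` with diagonal `k` overwritten by `diagonal`."""
  out = np.array(matrix, copy=True)
  for i, v in enumerate(diagonal):
    r, c = (i, i + k) if k >= 0 else (i - k, i)
    out[r, c] = v
  return out

mat = np.array([[0, 1, 0],
                [1, 0, 1]])
print(matrix_set_diag_reference(mat, np.array([3, 4])))
# [[3 1 0]
#  [1 4 1]]
```

The v2 tests below build their expected outputs the same way in spirit: a mask marks the off-band entries of the padded fixture, and the solution is `input * mask + banded`.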
+ def testSingleMatrix(self): + # LINT.IfChange + if compat.forward_compatible(2019, 8, 31): + # LINT.ThenChange(//tensorflow/python/ops/array_ops.py) + for _, tests in [square_cases(), tall_cases(), fat_cases()]: + for diag_index, (vecs, banded_mat) in tests.items(): + mask = (banded_mat[0] == 0) + input_mat = np.random.randint(10, size=mask.shape) + solution = input_mat * mask + banded_mat[0] + self._assertOpOutputMatchesExpected( + { + "input": input_mat, + "diagonal": vecs[0], + "k": diag_index + }, solution) + + def testBatch(self): + # LINT.IfChange + if compat.forward_compatible(2019, 8, 31): + # LINT.ThenChange(//tensorflow/python/ops/array_ops.py) + for _, tests in [square_cases(), tall_cases(), fat_cases()]: + for diag_index, (vecs, banded_mat) in tests.items(): + mask = (banded_mat == 0) + input_mat = np.random.randint(10, size=mask.shape) + solution = input_mat * mask + banded_mat + self._assertOpOutputMatchesExpected( + { + "input": input_mat, + "diagonal": vecs, + "k": diag_index + }, solution) + + +class MatrixDiagPartTest(xla_test.XLATestCase): + + def _assertOpOutputMatchesExpected(self, + params, + solution, + rtol=1e-3, + atol=1e-5): + """Verifies that matrix_diag_part produces `solution` when fed `params`. + + Args: + params: dictionary containing input parameters to matrix_diag_part. + solution: numpy array representing the expected output. + rtol: relative tolerance for equality test. + atol: absolute tolerance for equality test. + """ + input = params["input"] # pylint: disable=redefined-builtin + with self.session() as session: + for dtype in self.numeric_types - {np.int8, np.uint8}: + expected = solution.astype(dtype) + with self.test_scope(): + params["input"] = array_ops.placeholder( + dtype, input.shape, name="input") + output = array_ops.matrix_diag_part(**params) + result = session.run(output, { + params["input"]: input.astype(dtype), + }) + self.assertEqual(output.dtype, expected.dtype) + self.assertAllCloseAccordingToType( + expected, result, rtol=rtol, atol=atol, bfloat16_rtol=0.03) + + # Generic tests applicable to both v1 and v2 ops. + # Originally from unary_ops_tests.py. + def testV1(self): + matrices = np.arange(3 * 2 * 4).reshape([3, 2, 4]) + solution = np.array([[0, 5], [8, 13], [16, 21]]) + self._assertOpOutputMatchesExpected({"input": matrices}, solution) + + # From here onwards are v2-only tests. 
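`matrix_diag_part` is the inverse direction: it reads a diagonal band out of a matrix into the compact layout used by `square_cases`/`tall_cases`/`fat_cases`. A sketch for a single diagonal index, consistent with the v1 case above (`arange(3*2*4)` yielding `[[0, 5], [8, 13], [16, 21]]`); the helper name is hypothetical:

```python
import numpy as np

def matrix_diag_part_reference(matrix, k=0):
  """Extracts diagonal `k` of a 2-D `matrix` (k > 0: above the main diagonal)."""
  rows, cols = matrix.shape
  length = min(rows + min(k, 0), cols - max(k, 0))
  return np.array([matrix[i - min(k, 0), i + max(k, 0)] for i in range(length)])

mat = np.arange(8).reshape(2, 4)              # [[0, 1, 2, 3], [4, 5, 6, 7]]
print(matrix_diag_part_reference(mat))        # [0 5]
print(matrix_diag_part_reference(mat, k=2))   # [2 7]
```

When a whole band `(d_lower, d_upper)` is requested, the v2 op stacks such per-index results into the compact form, zero-padding the shorter diagonals, which is exactly what the shared fixtures encode.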
+ def testSingleMatrix(self): + # LINT.IfChange + if compat.forward_compatible(2019, 8, 31): + # LINT.ThenChange(//tensorflow/python/ops/array_ops.py) + for mat, tests in [square_cases(), tall_cases(), fat_cases()]: + for diag_index, (solution, _) in tests.items(): + self._assertOpOutputMatchesExpected({ + "input": mat[0], + "k": diag_index + }, solution[0]) + + def testBatch(self): + # LINT.IfChange + if compat.forward_compatible(2019, 8, 31): + # LINT.ThenChange(//tensorflow/python/ops/array_ops.py) + for mat, tests in [square_cases(), tall_cases(), fat_cases()]: + for diag_index, (solution, _) in tests.items(): + self._assertOpOutputMatchesExpected({ + "input": mat, + "k": diag_index + }, solution) + + def testPadding(self): + # LINT.IfChange + if compat.forward_compatible(2019, 8, 31): + # LINT.ThenChange(//tensorflow/python/ops/array_ops.py) + for padding_value in [555, -11]: + for mat, tests in [square_cases(), tall_cases(), fat_cases()]: + for diag_index, (solution, _) in tests.items(): + mask = (solution == 0) + solution = solution + (mask * padding_value) + self._assertOpOutputMatchesExpected( + { + "input": mat, + "k": diag_index, + "padding_value": padding_value + }, solution) + + +if __name__ == "__main__": + googletest.main() diff --git a/tensorflow/compiler/tests/stateful_random_ops_test.py b/tensorflow/compiler/tests/stateful_random_ops_test.py index a54cd60cfd7..343969c40d7 100644 --- a/tensorflow/compiler/tests/stateful_random_ops_test.py +++ b/tensorflow/compiler/tests/stateful_random_ops_test.py @@ -278,10 +278,11 @@ class StatefulRandomOpsTest(xla_test.XLATestCase, parameterized.TestCase): maxval = 1 if dtype.is_integer: maxval = 100 - x = gen.uniform(shape=[n], maxval=maxval, dtype=dtype).numpy() + t = gen.uniform(shape=[n], maxval=maxval, dtype=dtype) + x = t.numpy().astype(float) if maxval > 1: # Normalize y to range [0, 1). - x = x.astype(float) / maxval + x = x / maxval # Tests that the values are distributed amongst 10 bins with equal # probability. 16.92 is the Chi^2 value for 9 degrees of freedom with # p=0.05. This test is probabilistic and would be flaky if the random diff --git a/tensorflow/compiler/tests/stateless_random_ops_test.py b/tensorflow/compiler/tests/stateless_random_ops_test.py index 8eba83e285d..6576e274300 100644 --- a/tensorflow/compiler/tests/stateless_random_ops_test.py +++ b/tensorflow/compiler/tests/stateless_random_ops_test.py @@ -86,9 +86,9 @@ class StatelessRandomOpsTest(xla_test.XLATestCase): x = stateless.stateless_random_uniform( shape=[n], seed=seed_t, maxval=maxval, dtype=dtype) y = sess.run(x, {seed_t: [565656, 121212]}) - if maxval > 1: - # Normalize y to range [0, 1). - y = y.astype(float) / maxval + # Convert y to float and normalize its value to range [0, 1) when + # maxval != 1. + y = y.astype(float) / maxval # Tests that the values are distributed amongst 10 bins with equal # probability. 16.92 is the Chi^2 value for 9 degrees of freedom with # p=0.05. 
This test is probabilistic and would be flaky if the random diff --git a/tensorflow/compiler/tests/tensor_list_ops_test.py b/tensorflow/compiler/tests/tensor_list_ops_test.py index b24e807b034..7d2425ee205 100644 --- a/tensorflow/compiler/tests/tensor_list_ops_test.py +++ b/tensorflow/compiler/tests/tensor_list_ops_test.py @@ -19,6 +19,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function import os +from absl.testing import parameterized import numpy as np from tensorflow.compiler.tests import xla_test from tensorflow.python.framework import constant_op @@ -29,7 +30,7 @@ from tensorflow.python.ops import list_ops from tensorflow.python.platform import test -class ListOpsTest(xla_test.XLATestCase): +class ListOpsTest(parameterized.TestCase, xla_test.XLATestCase): def testElementShape(self): with self.session() as sess, self.test_scope(): @@ -204,6 +205,20 @@ class ListOpsTest(xla_test.XLATestCase): self.assertAllEqual(t.shape.as_list(), [None]) self.assertAllEqual(t, [1.0, 2.0]) + @parameterized.named_parameters( + ("FlatList", [1.0, 2.0, 3.0], [], [0, 2], [1.0, 3.0]), + ("NestedList", [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0] + ], [2], [1], [[3.0, 4.0]]), + ("EmptyIndices", [1.0, 2.0, 3.0], [], [], []), + ) + def testGather(self, input_list, element_shape, indices, output): + with self.session(), self.test_scope(): + tensor_list = list_ops.tensor_list_from_tensor( + input_list, element_shape=element_shape) + gather_t = list_ops.tensor_list_gather( + tensor_list, indices, element_dtype=dtypes.float32) + self.assertAllEqual(gather_t, output) + def testStackWithUninitializedTensors(self): with self.session(), self.test_scope(): l = list_ops.tensor_list_reserve( @@ -224,6 +239,6 @@ class ListOpsTest(xla_test.XLATestCase): self.assertAllEqual(z, [0.0, 0.0]) if __name__ == "__main__": - os.environ['TF_XLA_FLAGS'] = ('--tf_xla_min_cluster_size=2 ' + - os.environ.get('TF_XLA_FLAGS', '')) + os.environ["TF_XLA_FLAGS"] = ("--tf_xla_min_cluster_size=2 " + + os.environ.get("TF_XLA_FLAGS", "")) test.main() diff --git a/tensorflow/compiler/tests/unary_ops_test.py b/tensorflow/compiler/tests/unary_ops_test.py index bac30b63bf8..349dabbb393 100644 --- a/tensorflow/compiler/tests/unary_ops_test.py +++ b/tensorflow/compiler/tests/unary_ops_test.py @@ -27,7 +27,6 @@ from tensorflow.compiler.tests import xla_test from tensorflow.python.framework import dtypes from tensorflow.python.ops import array_ops from tensorflow.python.ops import bitwise_ops -from tensorflow.python.ops import gen_array_ops from tensorflow.python.ops import gen_nn_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn_ops @@ -108,31 +107,6 @@ class UnaryOpsTest(xla_test.XLATestCase): np.array([[-1, 1]], dtype=dtype), expected=np.array([[-1, 1]], dtype=dtype)) - # TODO(penporn): Once XLA supports MatrixDiagV2, change the call to - # gen_array_ops.matrix_diag* (V1) to array_ops.matrix_diag* (V2). 
- self._assertOpOutputMatchesExpected( - gen_array_ops.matrix_diag, np.array([[1, 2], [3, 4]], dtype=dtype), - np.array([[[1, 0], [0, 2]], [[3, 0], [0, 4]]], dtype=dtype)) - self._assertOpOutputMatchesExpected( - gen_array_ops.matrix_diag, np.array([1, 2, 3, 4], dtype=dtype), - np.array( - [[1, 0, 0, 0], [0, 2, 0, 0], [0, 0, 3, 0], [0, 0, 0, 4]], - dtype=dtype)) - self._assertOpOutputMatchesExpected( - gen_array_ops.matrix_diag, - np.array( - [[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]], dtype=dtype), - np.array( - [[[[1, 0, 0], [0, 2, 0], [0, 0, 3]], [[4, 0, 0], [0, 5, 0], [ - 0, 0, 6 - ]]], [[[7, 0, 0], [0, 8, 0], [0, 0, 9]], [[10, 0, 0], [0, 11, 0], - [0, 0, 12]]]], - dtype=dtype)) - self._assertOpOutputMatchesExpected( - gen_array_ops.matrix_diag_part, - np.arange(3 * 2 * 4).reshape([3, 2, 4]).astype(dtype), - np.array([[0, 5], [8, 13], [16, 21]], dtype=dtype)) - self._assertOpOutputMatchesExpected( array_ops.prevent_gradient, np.array([[-1, 1]], dtype=dtype), @@ -323,11 +297,12 @@ class UnaryOpsTest(xla_test.XLATestCase): self._assertOpOutputMatchesExpected( math_ops.tanh, - np.array([[1, 1, 1, 1], [1, 2, 3, 4]], dtype=dtype), - expected=np.array( - [[0.76159418, 0.76159418, 0.76159418, 0.76159418], - [0.76159418, 0.96402758, 0.99505478, 0.99932933]], - dtype=dtype)) + np.array( + [[1, 2, 3, 4], [np.inf, -np.inf, np.nan, 20], [19, -19, 22, -22]], + dtype=dtype), + expected=np.array([[0.76159418, 0.96402758, 0.99505478, 0.99932933], + [1.0, -1.0, np.nan, 1.0], [1.0, -1.0, 1.0, -1.0]], + dtype=dtype)) self._assertOpOutputMatchesExpected( nn_ops.log_softmax, diff --git a/tensorflow/compiler/tests/xla_test.py b/tensorflow/compiler/tests/xla_test.py index 59e46b06d68..d6e02ecc827 100644 --- a/tensorflow/compiler/tests/xla_test.py +++ b/tensorflow/compiler/tests/xla_test.py @@ -25,12 +25,12 @@ import re import numpy as np -from tensorflow.python.eager import context -from tensorflow.contrib.compiler import jit -from tensorflow.core.protobuf import rewriter_config_pb2 from tensorflow.core.framework import types_pb2 from tensorflow.core.protobuf import config_pb2 +from tensorflow.core.protobuf import rewriter_config_pb2 from tensorflow.python.client import session +from tensorflow.python.compiler.xla import jit +from tensorflow.python.eager import context from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import random_seed diff --git a/tensorflow/compiler/tf2tensorrt/BUILD b/tensorflow/compiler/tf2tensorrt/BUILD index bfaae215709..79afa0b82dd 100644 --- a/tensorflow/compiler/tf2tensorrt/BUILD +++ b/tensorflow/compiler/tf2tensorrt/BUILD @@ -17,19 +17,13 @@ load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test") load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library") load("//tensorflow:tensorflow.bzl", "tf_py_wrap_cc") load( - "//tensorflow/core:platform/default/build_config.bzl", + "//tensorflow/core/platform:default/build_config.bzl", "tf_additional_all_protos", "tf_proto_library", ) load("@local_config_tensorrt//:build_defs.bzl", "if_tensorrt") # Placeholder for Google-internal load statements. -# NOTE: we always assume that if_static returns "otherwise" list in open source. 
-load( - "//tensorflow/core:platform/default/build_config_root.bzl", - "if_static", -) - package( default_visibility = ["//visibility:public"], licenses = ["notice"], # Apache 2.0 @@ -53,10 +47,10 @@ cc_library( alias( name = "tensorrt_lib", - actual = if_static( - "@local_config_tensorrt//:tensorrt", - ":tensorrt_stub", - ), + actual = select({ + "//tensorflow:oss": ":tensorrt_stub", + "//conditions:default": "@local_config_tensorrt//:tensorrt", + }), visibility = ["//visibility:private"], ) @@ -97,10 +91,17 @@ cc_library( ":utils", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", + "@local_config_cuda//cuda:cuda_headers", + "//tensorflow/core:core_cpu_lib_no_ops", + "//tensorflow/core:framework", "//tensorflow/core:gpu_headers_lib", + "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", "//tensorflow/core:lib_proto_parsing", + "//tensorflow/core:stream_executor", "//tensorflow/core:stream_executor_headers_lib", "//tensorflow/core/grappler/costs:graph_properties", + "//tensorflow/stream_executor/lib", ] + if_tensorrt([":tensorrt_lib"]) + tf_custom_op_library_additional_deps(), alwayslink = 1, ) @@ -168,8 +169,12 @@ tf_cuda_cc_test( ":trt_op_kernels", ":trt_op_libs", ":trt_resources", + ":trt_conversion", + ":utils", "@com_google_googletest//:gtest", + "@com_google_absl//absl/strings", "//tensorflow/cc:cc_ops", + "//tensorflow/cc:function_ops", "//tensorflow/cc:ops", "//tensorflow/cc:scope", "//tensorflow/core:framework", @@ -235,12 +240,10 @@ tf_custom_op_py_library( tf_cuda_library( name = "trt_resources", srcs = [ - "utils/calibration_resource.cc", "utils/trt_int8_calibrator.cc", "utils/trt_lru_cache.cc", ], hdrs = [ - "utils/calibration_resource.h", "utils/trt_int8_calibrator.h", "utils/trt_lru_cache.h", ], @@ -250,6 +253,9 @@ tf_cuda_library( ":utils", "//tensorflow/core:framework_headers_lib", "//tensorflow/core:framework_lite", + "//tensorflow/core/grappler:op_types", + "//tensorflow/core:graph", + "//tensorflow/core:gpu_runtime", "//tensorflow/core:lib_proto_parsing", ] + if_tensorrt([":tensorrt_lib"]), ) @@ -320,11 +326,13 @@ tf_cuda_library( "//tensorflow/core/grappler/optimizers:custom_graph_optimizer", "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry", "//tensorflow/core/grappler:grappler_item", + "//tensorflow/core/grappler:op_types", "//tensorflow/core/grappler:utils", "//tensorflow/core:framework", "//tensorflow/core:framework_lite", "//tensorflow/core:gpu_runtime", "//tensorflow/core:graph", + "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:protos_all_cc", "//tensorflow/core/grappler:devices", @@ -489,7 +497,10 @@ cc_library( srcs = ["utils/py_utils.cc"], hdrs = ["utils/py_utils.h"], copts = tf_copts(), - deps = if_tensorrt([":tensorrt_lib"]), + deps = if_tensorrt([ + ":tensorrt_lib", + "//tensorflow/stream_executor/platform:dso_loader", + ]), ) tf_py_wrap_cc( diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc index fb5dda9953e..cd5c7d126c6 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc @@ -28,7 +28,6 @@ limitations under the License. 
#include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" #include "tensorflow/compiler/tf2tensorrt/convert/utils.h" #include "tensorflow/compiler/tf2tensorrt/segment/segment.h" -#include "tensorflow/compiler/tf2tensorrt/utils/calibration_resource.h" #include "tensorflow/core/common_runtime/gpu/gpu_id.h" #include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h" #include "tensorflow/core/common_runtime/gpu/gpu_process_state.h" @@ -125,46 +124,37 @@ Status GetEngineInfo(const Graph* g, ++it) { const Node* node = *it; if (segment_nodes.count(node) == 0) continue; - auto node_device = node->requested_device(); - if (!node_device.empty()) { - // If device is set, it means device placement may have been done before, - // so we need to assign a device for the TRTEngineOp to maintain the - // invariance. - // If the device is CPU in this case, it tries to find the first available - // GPU and use it as the device. - DeviceNameUtils::ParsedName parsed_name; - const bool parse_succeeded = - DeviceNameUtils::ParseFullName(node_device, &parsed_name); - if (!parse_succeeded || (parse_succeeded && parsed_name.type == "CPU")) { - string msg; - if (!parse_succeeded) { - msg = StrCat("Failed to parse assigned device of node ", node->name(), - ". "); - } else { - msg = StrCat("Node ", node->name(), " was assigned to the CPU. "); - } - VLOG(1) << msg << "Attempting to place on GPU."; - TfGpuId tf_gpu_id; - PlatformGpuId platform_gpu_id; - std::tie(tf_gpu_id, platform_gpu_id) = GetFirstValidDeviceId(); - if (tf_gpu_id.value() >= 0) { - parsed_name.type = "GPU"; - parsed_name.id = tf_gpu_id.value(); - segment_devices.insert(DeviceNameUtils::FullName( - parsed_name.job, parsed_name.replica, parsed_name.task, - parsed_name.type, parsed_name.id)); - } - } else { - segment_devices.insert(node_device); - } + + std::string device_name; + if (!node->requested_device().empty()) { + device_name = node->requested_device(); } else if (node->has_assigned_device_name()) { // It appears that nodes will not have assigned devices at this point in // execution. - segment_devices.insert(node->assigned_device_name()); + device_name = node->assigned_device_name(); } else { VLOG(2) << "Node " << node->name() << " neither have requested device nor assigned device"; } + + if (!device_name.empty()) { + // If device is set, it means device placement may have been done before, + // so we need to assign a device for the TRTEngineOp if the assigned + // device is a GPU device. + DeviceNameUtils::ParsedName parsed_name; + const bool parse_succeeded = + DeviceNameUtils::ParseFullName(device_name, &parsed_name); + if (!parse_succeeded) { + VLOG(1) << "Failed to parse " + << (node->requested_device().empty() ? "assigned" : "requested") + << " device " << device_name << " of node " << node->name(); + } else if (parsed_name.type != "GPU") { + VLOG(1) << "Node " << node->name() + << " was assigned to a non-GPU device " << device_name; + } else { + segment_devices.insert(device_name); + } + } subgraph_nodes.push_back(node); const int node_id = node->id(); @@ -269,8 +259,20 @@ Status GetEngineInfo(const Graph* g, << ") devices for the segment. Picking first one to continue."; info->device = *segment_devices.begin(); } else { - VLOG(1) << "No device is assigned to the segment. 
" - << "A device will be assigned during graph execution (inference)."; + TfGpuId tf_gpu_id; + PlatformGpuId platform_gpu_id; + std::tie(tf_gpu_id, platform_gpu_id) = GetFirstValidDeviceId(); + if (tf_gpu_id.value() >= 0) { + DeviceNameUtils::ParsedName parsed_name; + parsed_name.type = "GPU"; + parsed_name.has_type = true; + parsed_name.id = tf_gpu_id.value(); + parsed_name.has_id = true; + info->device = DeviceNameUtils::ParsedNameToString(parsed_name); + } else { + VLOG(1) << "No device is assigned to the segment. A device will be " + "assigned during graph execution (inference)."; + } } return Status::OK(); } @@ -325,8 +327,6 @@ Status CreateTRTNode(const ConversionParams& params, nvinfer1::IGpuAllocator* alloc, std::vector* engine_nodes) { const auto& info = infos.at(pos); - std::vector output_shape_protos; - std::vector input_shape_protos; std::vector input_shapes; std::vector inputs; std::vector input_nodes; @@ -360,25 +360,16 @@ Status CreateTRTNode(const ConversionParams& params, } else { // Data edges if (!conn.is_input_edge) { - // Set the shapes and data types of output edge. - TensorShapeProto out_shape; - // shape of the output node inside segment - conn.inside_shape.AsProto(&out_shape); - if (output_shape_protos.size() <= conn.port_number) { - output_shape_protos.resize(conn.port_number + 1); + // Set the data types of output edge. + if (out_types.size() <= conn.port_number) { out_types.resize(conn.port_number + 1); } - output_shape_protos.at(conn.port_number) = out_shape; out_types.at(conn.port_number) = conn.connection_type; } else { // Set the shapes and data types of input edge. - TensorShapeProto in_shape; - conn.outside_shape.AsProto(&in_shape); - if (input_shape_protos.size() <= conn.port_number) { - input_shape_protos.resize(conn.port_number + 1); + if (input_shapes.size() <= conn.port_number) { input_shapes.resize(conn.port_number + 1); } - input_shape_protos.at(conn.port_number) = in_shape; input_shapes.at(conn.port_number) = conn.outside_shape; // Shape must be fully defined (excluding batch dimension) for static // mode. @@ -440,8 +431,6 @@ Status CreateTRTNode(const ConversionParams& params, TrtUniquePtrType engine_data(engine->serialize()); segment_string = string(static_cast(engine_data->data()), engine_data->size()); - } else { - segment_string = info.segment_graph_def.SerializeAsString(); } string prec_string; @@ -461,15 +450,13 @@ Status CreateTRTNode(const ConversionParams& params, } NodeDef trt_node; + NameAttrList function; + function.set_name(StrCat(info.engine_name, "_native_segment")); Status status = - node_builder.Attr("input_shapes", input_shape_protos) - .Attr("output_shapes", output_shape_protos) + node_builder .Attr("static_engine", info.engine_type == EngineInfo::EngineType::TRTStatic) - .Attr("segment_funcdef_name", - params.use_function_backup - ? StrCat(info.engine_name, "_native_segment") - : "") + .Attr("segment_func", function) .Attr("serialized_segment", segment_string) .Attr("calibration_data", "") .Attr("max_cached_engines_count", info.maximum_cached_engines) @@ -538,103 +525,27 @@ Status CreateTRTNode(const ConversionParams& params, return Status::OK(); } -// Function to construct a funcdef from the segment and add it to the graph. 
-Status RegisterSegmentFunctionToFunctionLibrary(Graph* graph, - const GraphDef& segment, - const string& engine_name) { - Graph sgraph(graph->flib_def()); - GraphConstructorOptions gcopts; - TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(gcopts, segment, &sgraph)); - std::map io_nodes; - int num_inputs = 0; - for (auto n : sgraph.op_nodes()) { - if (absl::StartsWith(n->name(), kInputPHName)) { - num_inputs++; - io_nodes.insert({n->name(), n}); - } else if (absl::StartsWith(n->name(), kOutputPHName)) { - io_nodes.insert({n->name(), n}); - } - } - - for (int i = 0; i < num_inputs; ++i) { - auto name = StrCat(kInputPHName, i); - auto node = io_nodes[name]; - NodeDef nd; - NodeDefBuilder node_builder(StrCat(name, "_Arg"), - FunctionLibraryDefinition::kArgOp); - VLOG(1) << "Adding " << StrCat(name, "_Arg"); - TF_RETURN_IF_ERROR(node_builder.Attr("T", node->output_type(0)) - .Attr("index", i) - .Finalize(&nd)); - Status s; - auto node_arg = sgraph.AddNode(nd, &s); - if (!s.ok()) { - LOG(ERROR) << "Couldn't add _Arg node for " << name; - } - for (auto edge : node->out_edges()) { - sgraph.AddEdge(node_arg, 0, edge->dst(), edge->dst_input()); - VLOG(1) << "Updating funcdef input " << node_arg->name() << ":" << 0 - << " - > " << edge->dst()->name() << ":" << edge->dst_input(); - if (!s.ok()) { - LOG(ERROR) << "Failed to update edge from " << node_arg->name() - << " to " << edge->dst()->name() << ":" << edge->dst_input(); - } - } - sgraph.RemoveNode(node); - } - - for (int i = 0; i < io_nodes.size() - num_inputs; ++i) { - auto name = StrCat(kOutputPHName, i); - auto node = io_nodes[name]; - NodeDef nd; - NodeDefBuilder node_builder(StrCat(name, "_Ret"), - FunctionLibraryDefinition::kRetOp); - auto edge = *(node->in_edges().begin()); - NodeDefBuilder::NodeOut nout(edge->src()->name(), edge->src_output(), - edge->src()->output_type(edge->src_output())); - VLOG(1) << " input " << nout.node << ":" << nout.index - << " dtype=" << DataTypeString(nout.data_type); - // nvcc complains that Input() is - // ambiguous, so do not use Input({nout}). - node_builder.Input(nout); - TF_RETURN_IF_ERROR(node_builder.Attr("T", node->output_type(0)) - .Attr("index", i) - .Finalize(&nd)); - if (VLOG_IS_ON(3)) { - VLOG(3) << nd.DebugString(); - } - Status s; - auto node_ret = sgraph.AddNode(nd, &s); - if (!s.ok()) { - LOG(ERROR) << "Couldn't add _Ret node for " << name; - } - VLOG(1) << "Update edge from " << edge->src()->name() << ":" - << edge->src_output() << " - > " << node_ret->name() << ":" << 0; - sgraph.AddEdge(edge->src(), edge->src_output(), node_ret, 0); - s = sgraph.UpdateEdge(edge->src(), edge->src_output(), node_ret, 0); - if (!s.ok()) { - LOG(ERROR) << "Failed to update edge from " << edge->src()->name() << ":" - << edge->src_output() << " - > " << node_ret->name() << ":" - << 0; - } - sgraph.RemoveNode(node); - } - FunctionDefLibrary fdeflib; - auto native_segment = fdeflib.add_function(); +Status RegisterGraphToFunctionLibrary(const GraphDef& segment_graph_def, + Graph* graph, const string& engine_name) { + Graph segment_graph(graph->flib_def()); + TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(GraphConstructorOptions(), + segment_graph_def, &segment_graph)); + FunctionDefLibrary library; + auto segment_func = library.add_function(); TF_RETURN_IF_ERROR(GraphToFunctionDef( - sgraph, StrCat(engine_name, "_native_segment"), native_segment)); + segment_graph, StrCat(engine_name, "_native_segment"), segment_func)); // Set kIntsonDeviceAttr to true so that all TRTEngineOp outputs are always on // a GPU device as expected. 
Otherwise, some of the tensors of type DT_INT32 // would be on host if the op generating the tensor has host memory tag set. - (*native_segment - ->mutable_attr())[FunctionLibraryDefinition::kIntsOnDeviceAttr] + (*segment_func->mutable_attr())[FunctionLibraryDefinition::kIntsOnDeviceAttr] .set_b(true); if (VLOG_IS_ON(7)) { VLOG(7) << engine_name << " Function_Def "; - VLOG(7) << native_segment->DebugString(); + VLOG(7) << segment_func->DebugString(); } - VLOG(1) << "Adding funcdef to graphlib"; - TF_RETURN_IF_ERROR(graph->AddFunctionLibrary(fdeflib)); + VLOG(1) << "Adding funcdef " << segment_func->signature().name() + << " to graphlib"; + TF_RETURN_IF_ERROR(graph->AddFunctionLibrary(library)); return Status::OK(); } @@ -691,16 +602,10 @@ std::pair GetDeviceAndAllocator(const ConversionParams& params, // Entry function from optimization pass. Status ConvertAfterShapes(const ConversionParams& params) { // Sanity checks. - if (params.precision_mode == TrtPrecisionMode::INT8) { - if (params.use_calibration && !params.use_function_backup) { - return errors::InvalidArgument( - "Calibration requires enabling fallback to TF function execution."); - } - } else { - if (params.use_calibration) { - return errors::InvalidArgument( - "Calibration with FP32 or FP16 is not supported."); - } + if (params.precision_mode != TrtPrecisionMode::INT8 && + params.use_calibration) { + return errors::InvalidArgument( + "Calibration with FP32 or FP16 is not supported."); } // Convert graphdef to graph. @@ -761,14 +666,14 @@ Status ConvertAfterShapes(const ConversionParams& params) { : EngineInfo::EngineType::TRTStatic); curr_engine.use_calibration = params.use_calibration; curr_engine.maximum_cached_engines = params.max_cached_engines; - if (params.use_function_backup) { - status = RegisterSegmentFunctionToFunctionLibrary( - &graph, curr_engine.segment_graph_def, curr_engine.engine_name); - if (!status.ok()) { - LOG(WARNING) << "Failed to register segment graphdef as a function " - << t << ": " << status; - continue; - } + + status = RegisterGraphToFunctionLibrary(curr_engine.segment_graph_def, + &graph, curr_engine.engine_name); + + if (!status.ok()) { + LOG(WARNING) << "Failed to register segment graphdef to the library " << t + << ": " << status; + continue; } engine_bytes_size.push_back(curr_engine.segment_graph_def.ByteSizeLong()); diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h index d7f1df5a102..9288829574e 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h @@ -18,6 +18,7 @@ limitations under the License. #include #include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" +#include "tensorflow/core/framework/function.pb.h" #include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/grappler/clusters/cluster.h" #include "tensorflow/core/grappler/costs/graph_properties.h" @@ -46,8 +47,6 @@ struct ConversionParams { // maximum number of cached engines int max_cached_engines = 1; bool use_calibration = true; - // Whether to use function fallback for TRTEngineOp - bool use_function_backup = true; }; // Method to call from optimization pass @@ -57,6 +56,11 @@ Status ConvertAfterShapes(const ConversionParams& params); std::pair GetDeviceAndAllocator(const ConversionParams& params, const EngineInfo& engine); +// Helper method that registers `segment_graph` as a function to the function +// library in `graph`. 
+Status RegisterGraphToFunctionLibrary(const GraphDef& segment_graph_def, + Graph* graph, const string& engine_name); + } // namespace convert } // namespace tensorrt } // namespace tensorflow diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc index c068c4cc06c..43f920b9ccc 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc @@ -29,7 +29,6 @@ limitations under the License. #include "absl/strings/str_cat.h" #include "absl/strings/string_view.h" #include "tensorflow/compiler/tf2tensorrt/convert/utils.h" -#include "tensorflow/compiler/tf2tensorrt/utils/calibration_resource.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h" #include "tensorflow/core/framework/node_def.pb.h" // NOLINT #include "tensorflow/core/framework/node_def_builder.h" @@ -40,6 +39,7 @@ limitations under the License. #include "tensorflow/core/graph/algorithm.h" #include "tensorflow/core/graph/graph.h" #include "tensorflow/core/graph/graph_constructor.h" +#include "tensorflow/core/grappler/op_types.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/strings/numbers.h" @@ -76,18 +76,15 @@ limitations under the License. namespace tensorflow { namespace tensorrt { -// TODO(aaroey): put these constants into some class. -const char* const kInputPHName = "TensorRTInputPH_"; -const char* const kOutputPHName = "TensorRTOutputPH_"; +namespace convert { bool IsEngineInput(absl::string_view name) { - return absl::StartsWith(name, kInputPHName); + return absl::StartsWith(name, IONamePrefixes::kInputPHName); } bool IsEngineOutput(absl::string_view name) { - return absl::StartsWith(name, kOutputPHName); + return absl::StartsWith(name, IONamePrefixes::kOutputPHName); } -namespace convert { using absl::StrAppend; using absl::StrCat; @@ -620,7 +617,7 @@ bool AreDimsStaticWithDifferentSize(const nvinfer1::Dims& lhs, } static std::vector> CreateSamePadding( - const nvinfer1::DimsHW& stride, const nvinfer1::DimsHW& kernel, + const nvinfer1::Dims& stride, const nvinfer1::Dims& kernel, const std::vector& input_dims) { std::vector> padding(input_dims.size()); CHECK_EQ(stride.nbDims, input_dims.size()); // TODO(jie): N+C? NC+? 
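The hunk above widens `CreateSamePadding` from `nvinfer1::DimsHW` to `nvinfer1::Dims`, so the same helper can serve the 3-D convolution support added later in this patch. For orientation, the per-dimension arithmetic involved is the standard TensorFlow SAME rule (output size rounded up, any shortfall padded with the extra element at the end, and the kernel enlarged by the dilation rate, as the Conv3D SAME-deconvolution path further down also computes). The Python below is only an illustrative sketch under that assumption, not the C++ helper itself, and `same_padding_1d` is a hypothetical name:

```python
def same_padding_1d(input_size, kernel_size, stride, dilation=1):
  """TF-style SAME padding for one spatial dimension, as (pad_before, pad_after)."""
  effective_kernel = kernel_size + (kernel_size - 1) * (dilation - 1)
  output_size = -(-input_size // stride)  # ceil division
  total = max((output_size - 1) * stride + effective_kernel - input_size, 0)
  before = total // 2
  return before, total - before

print(same_padding_1d(4, 3, 2))  # (0, 1): the odd leftover goes after the data
```

Asymmetric results like `(0, 1)` are why the Conv3D transpose path below rejects SAME padding whenever `padding[i].first != padding[i].second`.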
@@ -779,7 +776,9 @@ class TRT_TensorOrWeights::SimpleITensor : public nvinfer1::ITensor { nvinfer1::TensorFormats getAllowedFormats() const override { return 1; } - bool isShape() const override { return false; } + bool isShapeTensor() const override { return false; } + + bool isExecutionTensor() const override { return true; } #endif private: @@ -847,6 +846,30 @@ string TRT_TensorOrWeights::DebugString() const { return output; } +// Perform 5 dimensional reorder of data on CPU +// This is done once at convert time and does not affect GPU inference perf +// Example: reorder NDHWC (Tensorflow) -> NCDHW (TensorRT) +template +void Reorder5(const nvinfer1::Dims& shape, const T* idata, + const nvinfer1::Dims& istrides, T* odata, + const nvinfer1::Dims& ostrides) { + for (int k = 0; k < shape.d[0]; ++k) { + for (int c = 0; c < shape.d[1]; ++c) { + for (int d = 0; d < shape.d[2]; ++d) { + for (int r = 0; r < shape.d[3]; ++r) { + for (int s = 0; s < shape.d[4]; ++s) { + odata[k * ostrides.d[0] + c * ostrides.d[1] + d * ostrides.d[2] + + r * ostrides.d[3] + s * ostrides.d[4]] = + idata[k * istrides.d[0] + c * istrides.d[1] + + d * istrides.d[2] + r * istrides.d[3] + + s * istrides.d[4]]; + } + } + } + } + } +} + // TODO(jie): reorder4 & reorder2 should be merged? // TODO(aaroey): fix the order of parameters. template @@ -945,6 +968,67 @@ void ReorderRSCKToKCRS(const TRT_ShapedWeights& iweights, } } +// Initialize a Dims object with arbitrary dimension +nvinfer1::Dims InitDimsN(std::initializer_list list) { + nvinfer1::Dims dim; + dim.nbDims = list.size(); + std::copy(list.begin(), list.end(), dim.d); + return dim; +} + +// Reorder 3D convolution weights from TF to TRT +void ReorderDRSCKToKCDRS(const TRT_ShapedWeights& iweights, + TRT_ShapedWeights* oweights, const int num_groups) { + DCHECK(iweights.TrtDType() == oweights->TrtDType()); + CHECK_EQ(iweights.size_bytes(), oweights->size_bytes()); + // K indexes over output channels, C over input channels, and R, S, D over the + // height, width, depth + const int d = iweights.shape_.d[0]; + const int r = iweights.shape_.d[1]; + const int s = iweights.shape_.d[2]; + // TRT requires GKcRS, while TF depthwise has RSCK where c=1, C=G + const int c = iweights.shape_.d[3] / num_groups; + const int k = iweights.shape_.d[4] * num_groups; + + VLOG(2) << "num_groups: " << num_groups << ", c: " << iweights.shape_.d[3] + << " becomes " << c << ", k: " << iweights.shape_.d[4] << " becomes " + << k << ", d: " << d << ", r: " << r << ", s: " << s; + + oweights->shape_.d[0] = iweights.shape_.d[4]; // k / num_groups; + oweights->shape_.d[1] = iweights.shape_.d[3]; // c * num_groups; + oweights->shape_.d[2] = d; + oweights->shape_.d[3] = r; + oweights->shape_.d[4] = s; + + nvinfer1::Dims shape = + InitDimsN({k, c, d, r, s}); // KCDRS shape (same as output) + + nvinfer1::Dims ostrides = + InitDimsN({c * d * r * s, d * r * s, r * s, s, + 1}); // Output = KCDRS = k*CDRS + c*DRS + d*RS + r*S + s + + nvinfer1::Dims istrides = + InitDimsN({1, k, r * s * c * k, s * c * k, + c * k}); // Input = DRSCK = k*1 + c*K + d*RSCK + r*SCK + s*CK + + switch (iweights.TrtDType()) { + case nvinfer1::DataType::kFLOAT: { + Reorder5(shape, static_cast(iweights.GetValues()), istrides, + static_cast(oweights->GetValues()), ostrides); + break; + } + case nvinfer1::DataType::kHALF: { + Reorder5(shape, static_cast(iweights.GetValues()), + istrides, static_cast(oweights->GetValues()), + ostrides); + break; + } + default: + LOG(FATAL) << "Unsupported type, expected fp32 or fp16 but got " + << 
DebugString(iweights.TrtDType()); + } +} + TRT_ShapedWeights TrtWeightStore::GetTempWeights(nvinfer1::DataType trt_dtype, const nvinfer1::Dims& dims) { TensorShape shape; @@ -1453,6 +1537,15 @@ bool IsClipOrRelu(const nvinfer1::ILayer* layer) { #endif } +bool IsAdd(const nvinfer1::ILayer* layer) { + if (layer->getType() != nvinfer1::LayerType::kELEMENTWISE) { + return false; + } + auto operation = + static_cast(layer)->getOperation(); + return operation == nvinfer1::ElementWiseOperation::kSUM; +} + } // namespace void Converter::MaybeApplyQuantizationRanges() { @@ -1508,11 +1601,25 @@ void Converter::MaybeApplyQuantizationRanges() { } } // Identify fused tensors. + // Conv+BiasAdd+Add+Activation(Clip or Relu), Conv+BiasAdd+Add, // Conv+BiasAdd+Activation(Clip or Relu), Conv+BiasAdd, // Conv+Activation(Clip or Relu) are fused. std::set fused_tensors; typedef std::function matcher; const std::vector>> fused_patterns = { + {"Fused Conv+Bias+Add+Activation", + { + IsConvolution, + IsScale, + IsAdd, + IsClipOrRelu, + }}, + {"Fused Conv+Bias+Add", + { + IsConvolution, + IsScale, + IsAdd, + }}, {"Fused Conv+Bias+Activation", { IsConvolution, @@ -2600,6 +2707,203 @@ Status ConvertConv2DBackpropInput(OpConverterParams* params) { return ConvertConv2DHelper(params, 1, /*is_conv2d_backprop_input=*/true); } +#if IS_TRT_VERSION_GE(6, 0, 0, 0) +Status ConvertConv3DHelper(OpConverterParams* params, int group, + bool is_conv3d_backprop_input = false) { + const int kNumDims = 5; + const auto& inputs = params->inputs; + const auto& node_def = params->node_def; + TRT_TensorOrWeights backprop_output_size; + nvinfer1::ITensor* tensor = nullptr; + if (is_conv3d_backprop_input) { + // In the case when Conv3dBackpropInput is used for conv3d_transpose, these + // inputs correspond to: output size, filter, and input. + TF_RETURN_IF_ERROR(CheckInputsWeights( + *params, + {{"input_sizes", true}, {"filter", true}, {"out_backprop", false}})); + backprop_output_size = inputs.at(0); + tensor = inputs.at(2).tensor(); + } else { + TF_RETURN_IF_ERROR( + CheckInputsWeights(*params, {{"input", false}, {"filter", true}})); + tensor = inputs.at(0).tensor(); + } + TF_RETURN_IF_ERROR( + AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF})); + const TRT_ShapedWeights weights_drsck = inputs.at(1).weights(); + if (weights_drsck.shape_.nbDims != kNumDims) { + return errors::InvalidArgument("Conv3D expects kernel of dimension 5, at ", + node_def.name()); + } + TFAttrs attrs(node_def); + auto data_format = attrs.get("data_format"); + const bool is_ndhwc = (data_format == "NDHWC"); // Or NCDHW 01234 - > 02341 + const int d_index = is_ndhwc ? 1 : 2; + const int h_index = is_ndhwc ? 2 : 3; + const int w_index = is_ndhwc ? 3 : 4; + const int c_index = is_ndhwc ? 
4 : 1; + auto tf_dilations = attrs.get>("dilations"); + if (tf_dilations.size() != kNumDims) { + return errors::InvalidArgument( + "Convolution dilations field must specify 5 dimensions, at ", + node_def.name()); + } + if (tf_dilations[0] != 1 || tf_dilations[c_index] != 1) { + return errors::Unimplemented( + "Dilation rate must be 1 for batch and channel dimensions, at ", + node_def.name()); + } + + const nvinfer1::Dims3 dilation_dhw( + tf_dilations[d_index], tf_dilations[h_index], tf_dilations[w_index]); + if (is_conv3d_backprop_input && + (dilation_dhw.d[0] != 1 || dilation_dhw.d[1] != 1 || + dilation_dhw.d[2] != 1)) { + return errors::Unimplemented( + "Dilation with Conv3DBackpropInputV2 (conv3d_transpose) is not " + "supported", + ", at ", node_def.name()); + } + + const auto tf_stride = attrs.get>("strides"); + if (tf_stride.size() != kNumDims) { + return errors::InvalidArgument( + "Convolution strides field must specify 5 dimensions, at ", + node_def.name()); + } + if (tf_stride[0] != 1 || tf_stride[c_index] != 1) { + return errors::Unimplemented( + "Stride must be 1 for batch and channel dimensions, at ", + node_def.name()); + } + + const nvinfer1::Dims3 stride_dhw(tf_stride[d_index], tf_stride[h_index], + tf_stride[w_index]); + const auto tensor_dim = tensor->getDimensions(); + + // Asymmetric padding on Deconv not supported for now + if (is_conv3d_backprop_input && attrs.get("padding") == "SAME") { + const int tensor_c_idx = c_index - 1; + const int num_groups = (group == 0) ? tensor_dim.d[tensor_c_idx] : group; + + TRT_ShapedWeights weights = + params->weight_store->GetTempWeights(weights_drsck); + + nvinfer1::Dims3 effective_kernel_size( + weights.shape_.d[0] + + (weights.shape_.d[0] - 1) * (dilation_dhw.d[0] - 1), // D + weights.shape_.d[1] + + (weights.shape_.d[1] - 1) * (dilation_dhw.d[1] - 1), // R + weights.shape_.d[2] + + (weights.shape_.d[2] - 1) * (dilation_dhw.d[2] - 1) // S + ); + + const auto output_size_weights = + static_cast(backprop_output_size.weights().GetValues()); + const std::vector input_dims = {output_size_weights[d_index], + output_size_weights[h_index], + output_size_weights[w_index]}; + + const std::vector> padding = + CreateSamePadding(stride_dhw, effective_kernel_size, input_dims); + + if (padding[0].first != padding[0].second || + padding[1].first != padding[1].second || + padding[2].first != padding[2].second) { + return errors::Unimplemented( + "Asymmetric padding with Conv3DBackpropInputV2 (conv3d_transpose) is " + "not supported, at ", + node_def.name()); + } + } + + if (params->validation_only) + return Status::OK(); // Finished validation checks + + // Transpose to NCDHW (NCDHW is required for IConvLayer). + const bool need_transpose = is_ndhwc; + if (need_transpose) { + TF_RETURN_IF_ERROR( + params->converter->TransposeTensor(tensor, {0, 4, 1, 2, 3}, &tensor)); + } + + // group == 0 signifies that this is a depthwise convolution, so set + // num_groups to size of input's channel dim. For a non-depthwise conv, + // num_groups will be 1. + const int num_groups = (group == 0) ? tensor_dim.d[0] : group; + + // For conv, TF weights are DRSCK, and TRT expects KCDRS. + // For backprop, TF weights are DRSKC, and TRT expects KCDRS. + // Therefore, this reorder will work for both cases. + TRT_ShapedWeights weights = + params->weight_store->GetTempWeights(weights_drsck); + ReorderDRSCKToKCDRS(weights_drsck, &weights, num_groups); + TRT_ShapedWeights biases(weights.TrtDType()); + const int output_axis = is_conv3d_backprop_input ? 
1 : 0; + const int noutput = weights.shape_.d[output_axis] * num_groups; + nvinfer1::Dims3 kernel_size_drs(weights.shape_.d[2], // D + weights.shape_.d[3], // R + weights.shape_.d[4] // S + ); + + // Add convolution. + nvinfer1::ILayer* conv_layer = nullptr; + if (is_conv3d_backprop_input) { + nvinfer1::IDeconvolutionLayer* layer = + params->converter->network()->addDeconvolutionNd( + *tensor, noutput, kernel_size_drs, weights.GetTrtWeights(), + biases.GetTrtWeights()); + TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + layer->setStrideNd(stride_dhw); // change to nd set stride + + // TensorRT 5.1.3 added support for padding modes. + if (attrs.get("padding") == "SAME") { + VLOG(2) << "Using SAME padding"; + // SAME_UPPER means that post padding is preferred. + layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER); + } + + layer->setName(node_def.name().c_str()); + layer->setNbGroups(num_groups); + conv_layer = layer; + } else { + nvinfer1::IConvolutionLayer* layer = + params->converter->network()->addConvolutionNd( + *tensor, noutput, kernel_size_drs, weights.GetTrtWeights(), + biases.GetTrtWeights()); + TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + layer->setStrideNd(stride_dhw); + + if (attrs.get("padding") == "SAME") { + VLOG(2) << "Using SAME padding"; + layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER); + } + + layer->setName(node_def.name().c_str()); + layer->setNbGroups(num_groups); + layer->setDilationNd(dilation_dhw); + conv_layer = layer; + } + nvinfer1::ITensor* output_tensor = conv_layer->getOutput(0); + + // Restore transpose. + if (need_transpose) { + TF_RETURN_IF_ERROR(params->converter->TransposeTensor( + output_tensor, {0, 2, 3, 4, 1}, &output_tensor)); + } + params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); + return Status::OK(); +} + +Status ConvertConv3D(OpConverterParams* params) { + return ConvertConv3DHelper(params, 1, /*is_conv3d_backprop_input=*/false); +} + +Status ConvertConv3DBackpropInputV2(OpConverterParams* params) { + return ConvertConv3DHelper(params, 1, /*is_conv3d_backprop_input=*/true); +} +#endif // #if IS_TRT_VERSION_GE(6, 0, 0, 0) + Status ConvertFusedConv2DBiasActivation(OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; @@ -3908,6 +4212,7 @@ Status ConvertPad(OpConverterParams* params) { *tensor, pre_padding, post_padding); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); nvinfer1::ITensor* output_tensor = layer->getOutput(0); + params->converter->MarkQuantizationRangesAsInferrable(tensor, output_tensor); if (!legit_pad) { TF_RETURN_IF_ERROR(params->converter->TransposeTensor( @@ -5093,6 +5398,8 @@ static void RegisterValidatableOpConverters( (*registration)["Relu6"] = ConvertRelu6; (*registration)["Reshape"] = ConvertReshape; #if IS_TRT_VERSION_GE(6, 0, 0, 0) + (*registration)["Conv3D"] = ConvertConv3D; + (*registration)["Conv3DBackpropInputV2"] = ConvertConv3DBackpropInputV2; for (auto resize_mode : {"ResizeBilinear", "ResizeNearestNeighbor"}) { (*registration)[resize_mode] = ConvertResize; } @@ -5194,26 +5501,44 @@ Status ConvertGraphDefToEngine( } // Build the network - VLOG(1) << "Starting engine conversion "; + if (VLOG_IS_ON(1)) { + string mode_str; + TF_RETURN_IF_ERROR(TrtPrecisionModeToName(precision_mode, &mode_str)); + VLOG(1) << "Starting engine conversion, precision mode: " << mode_str; + } Converter converter(trt_network.get(), precision_mode, use_calibration); std::vector output_tensors; // Graph nodes are already topologically 
sorted during construction for (const auto& node_def : gdef.node()) { - string node_name = node_def.name(); - VLOG(2) << "Converting op name=" << node_name << ", op=" << node_def.op(); - if (IsEngineInput(node_name) && (node_def.op() == "Placeholder")) { + const string& node_name = node_def.name(); + VLOG(2) << "Converting node " << node_name << ", op=" << node_def.op(); + if (IsEngineInput(node_name)) { int32 slot_number = -1; - if (!strings::safe_strto32( // non-absl ok - node_name.c_str() + strlen(kInputPHName), &slot_number)) { - return errors::InvalidArgument("Failed to parse slot number from ", - node_name); + string type_key; + if (node_def.op() == "Placeholder") { + if (!strings::safe_strto32( // non-absl ok + node_name.c_str() + strlen(IONamePrefixes::kInputPHName), + &slot_number)) { + return errors::InvalidArgument("Failed to parse slot number from ", + node_name); + } + type_key = "dtype"; + } else if (tensorflow::grappler::IsArg(node_def)) { + // Maybe remove the dependence on grappler and re-implement IsArg, + // which is pretty simple (but could change if new Arg nodes are added) + slot_number = node_def.attr().at("index").i(); + type_key = "T"; + } else { + return errors::InvalidArgument( + "Node ", node_name, + " with is neither Placeholder nor Arg, instead ", node_def.op()); } nvinfer1::DataType trt_dtype; nvinfer1::Dims trt_dims; int batch_size = -1; auto shape = input_shapes.at(slot_number); auto status = ValidateTensorProperties( - node_def.op(), node_def.attr().at("dtype").type(), shape, + node_def.op(), node_def.attr().at(type_key).type(), shape, /*validation_only=*/false, &trt_dtype, &trt_dims, &batch_size); if (!status.ok()) { const string error_message = @@ -5229,12 +5554,23 @@ Status ConvertGraphDefToEngine( // engines offline, by calling sess.run() and cache/serialize the engines. TF_RETURN_IF_ERROR( converter.AddInputTensor(node_name, trt_dtype, trt_dims, batch_size)); - } else if (IsEngineOutput(node_name) && (node_def.op() == "Identity")) { + } else if (IsEngineOutput(node_name)) { int32 slot_number = -1; - if (!strings::safe_strto32( // non-absl ok - node_name.c_str() + strlen(kOutputPHName), &slot_number)) { - return errors::InvalidArgument("Failed to parse slot number from ", - node_name); + if (node_def.op() == "Identity") { + if (!strings::safe_strto32( // non-absl ok + node_name.c_str() + strlen(IONamePrefixes::kOutputPHName), + &slot_number)) { + return errors::InvalidArgument("Failed to parse slot number from ", + node_name); + } + } else if (tensorflow::grappler::IsRetval(node_def)) { + slot_number = node_def.attr().at("index").i(); + } else { + return errors::InvalidArgument( + "Node with name ", node_name, + " starting with IONamePrefixes::kOutputPHName is " + "neither Identity nor Retval, instead ", + node_def.op()); } // Get output type that TensorFlow expects TFAttrs attrs(node_def); @@ -5247,8 +5583,6 @@ Status ConvertGraphDefToEngine( output_tensors.at(slot_number) = {node_def.input(0), node_name, trt_dtype}; } else { - VLOG(2) << "Converting node: " << node_def.name() << " , " - << node_def.op(); TF_RETURN_IF_ERROR(converter.ConvertNode(node_def)); } } @@ -5303,7 +5637,8 @@ Status ConvertSegmentToGraphDef( // Add dummy input/output nodes to the segment graphdef. 
if (connection.is_input_edge) { - const string node_name = StrCat(kInputPHName, connection.port_number); + const string node_name = + StrCat(IONamePrefixes::kInputPHName, connection.port_number); if (marker_nodes.count(node_name)) { VLOG(1) << "Reusing input " << node_name << " for the edge " << connection.outside_node_name << ":" @@ -5313,16 +5648,18 @@ Status ConvertSegmentToGraphDef( } marker_nodes.insert(node_name); auto seg_node = segment_def->add_node(); - NodeDefBuilder builder(node_name, "Placeholder"); + NodeDefBuilder builder(node_name, "_Arg"); auto status = builder.Attr("shape", partial_shape) - .Attr("dtype", dtype) + .Attr("T", dtype) + .Attr("index", connection.port_number) .Finalize(seg_node); VLOG(1) << "Constructing input " << node_name << " for the edge " << connection.outside_node_name << ":" << connection.outside_port << " -> " << connection.inside_node_name << ":" << connection.inside_port; } else { - const string node_name = StrCat(kOutputPHName, connection.port_number); + const string node_name = + StrCat(IONamePrefixes::kOutputPHName, connection.port_number); if (marker_nodes.count(node_name)) { VLOG(1) << "Reusing output " << node_name << " for the edge " << connection.inside_node_name << ":" << connection.inside_port @@ -5332,9 +5669,10 @@ Status ConvertSegmentToGraphDef( } marker_nodes.insert(node_name); auto seg_node = segment_def->add_node(); - NodeDefBuilder builder(node_name, "Identity"); + NodeDefBuilder builder(node_name, "_Retval"); auto status = - builder + builder.Attr("T", dtype) + .Attr("index", connection.port_number) .Input(connection.inside_node_name, connection.inside_port, dtype) .Finalize(seg_node); VLOG(1) << "Constructing output " << node_name << " for the edge " @@ -5360,12 +5698,12 @@ Status ConvertSegmentToGraphDef( if (connection.is_control_edge() || !connection.is_input_edge) continue; auto snode = segment_def->mutable_node(old_to_new_id_map[connection.inside_id]); - const string placeholder_name = - StrCat(kInputPHName, connection.port_number); + const string arg_name = + StrCat(IONamePrefixes::kInputPHName, connection.port_number); VLOG(1) << "Updating " << snode->name() << ":" << connection.inside_port << " from " << snode->input(connection.inside_port) << " to " - << placeholder_name; - snode->set_input(connection.inside_port, placeholder_name); + << arg_name; + snode->set_input(connection.inside_port, arg_name); } std::set subgraph_node_names; for (const Node* node : subgraph_nodes) { diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h index a6a7afe121e..9d475e25ff7 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h @@ -23,7 +23,6 @@ limitations under the License. #include #include "tensorflow/compiler/tf2tensorrt/convert/utils.h" -#include "tensorflow/compiler/tf2tensorrt/utils/calibration_resource.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h" @@ -38,8 +37,6 @@ limitations under the License. namespace tensorflow { namespace tensorrt { -extern const char* const kInputPHName; -extern const char* const kOutputPHName; namespace convert { @@ -120,8 +117,8 @@ struct EngineInfo { bool use_calibration; }; -// Constructs a graphdef from the segment in the given graph. 
Adds placeholder -// nodes for input edges (InputPH_*) and identity nodes for output edges +// Constructs a graphdef from the segment in the given graph. Adds _Arg +// nodes for input edges (InputPH_*) and _Retval nodes for output edges // (OutputPH_*). This function needs to be called before TensorRT nodes // inserted in order to correctly get sizes from the original graph. // diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc index b6a3587005c..84898108a4d 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc @@ -307,7 +307,9 @@ class FakeITensor : public nvinfer1::ITensor { nvinfer1::TensorFormats getAllowedFormats() const override { return 1; } - bool isShape() const override { return false; } + bool isShapeTensor() const override { return false; } + bool isExecutionTensor() const override { return true; } + #endif private: @@ -1158,7 +1160,7 @@ class ConvertGraphDefToEngineTest : public ::testing::Test { int batch_size = -1; for (const NodeDef& node : gdef.node()) { absl::string_view node_name(node.name()); - if (absl::ConsumePrefix(&node_name, kInputPHName)) { + if (absl::ConsumePrefix(&node_name, IONamePrefixes::kInputPHName)) { int port = -1; EXPECT_TRUE(absl::SimpleAtoi(node_name, &port)) << node.name(); if (input_shapes.size() < port + 1) input_shapes.resize(port + 1); @@ -1188,11 +1190,13 @@ class ConvertGraphDefToEngineTest : public ::testing::Test { TEST_F(ConvertGraphDefToEngineTest, IdentityGraph) { Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName(StrCat(kInputPHName, 0)), DT_FLOAT, - ops::Placeholder::Shape({1, 1})); + auto input = + ops::Placeholder(s.WithOpName(StrCat(IONamePrefixes::kInputPHName, 0)), + DT_FLOAT, ops::Placeholder::Shape({1, 1})); auto output = ops::Identity(s.WithOpName("identity1"), input); output = ops::Identity(s.WithOpName("identity2"), output); - output = ops::Identity(s.WithOpName(StrCat(kOutputPHName, 0)), output); + output = ops::Identity(s.WithOpName(StrCat(IONamePrefixes::kOutputPHName, 0)), + output); // If the converter marks the input tensor as output tensor, the conversion // below will fail with: // > TensorRTOutputPH_0 cannot be both input and output @@ -1453,6 +1457,9 @@ class OpConverterTest : public ::testing::Test { return converter_->quantization_ranges_; } + void PropagateQuantizationRanges() { + converter_->PropagateQuantizationRanges(); + } std::unique_ptr converter_; protected: @@ -3971,6 +3978,340 @@ TEST_F(OpConverterTest, ConvertConv2D) { } } +#if IS_TRT_VERSION_GE(6, 0, 0, 0) +TEST_F(OpConverterTest, ConvertConv3D) { + // Get nodedef for Conv3D layer. 
+ auto get_conv3d_nodedef = + [](std::vector strides = {1, 1, 1, 1, 1}, string padding = "SAME", + string data_format = "NCDHW", + std::vector dilations = {1, 1, 1, 1, 1}, + bool is_conv3d_backprop_input = false) -> NodeDef { + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); + auto filter = ops::Placeholder(s.WithOpName("weights"), DT_FLOAT); + + if (is_conv3d_backprop_input) { + auto input_sizes = + ops::Placeholder(s.WithOpName("input_sizes"), DT_INT32); + ops::Conv3DBackpropInputV2::Attrs attrs = + ops::Conv3DBackpropInputV2::Attrs() + .DataFormat(data_format) + .Dilations(dilations); + auto conv3d = + ops::Conv3DBackpropInputV2(s.WithOpName("my_conv3d"), input_sizes, + filter, input, strides, padding, attrs); + return conv3d.operation.node()->def(); + } else { + ops::Conv3D::Attrs attrs = + ops::Conv3D::Attrs().DataFormat(data_format).Dilations(dilations); + auto conv3d = ops::Conv3D(s.WithOpName("my_conv3d"), input, filter, + strides, padding, attrs); + return conv3d.operation.node()->def(); + } + }; + + { + // Input is weights, should fail. + Reset(); + NodeDef node_def = get_conv3d_nodedef(); + + AddTestWeights("input", {1, 2, 3}, {1, 2, 3, 4, 5, 6}); + AddTestWeights("weights", {3, 3, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); + RunValidationAndConversion( + node_def, error::UNIMPLEMENTED, + "The input \"input\" for Conv3D must be a tensor, at my_conv3d"); + } + { + // Filter is tensor, should fail. + Reset(); + NodeDef node_def = get_conv3d_nodedef(); + AddTestTensor("input", {1, 2, 3}); + AddTestTensor("weights", {3, 3, 1, 1, 3, 3, 1, 1}); + RunValidationAndConversion( + node_def, error::UNIMPLEMENTED, + "The input \"filter\" for Conv3D must be a constant, at my_conv3d"); + } + { + // Filter is not 5D, should fail. + Reset(); + NodeDef node_def = get_conv3d_nodedef(); + AddTestTensor("input", {1, 2, 3}); + AddTestWeights("weights", {3, 3, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); + RunValidationAndConversion( + node_def, error::INVALID_ARGUMENT, + "Conv3D expects kernel of dimension 5, at my_conv3d"); + } + { + // Dilations is not 5D, should fail. + Reset(); + NodeDef node_def = + get_conv3d_nodedef({1, 1, 1, 1, 1}, "SAME", "NCDHW", {1, 1, 1, 1}); + AddTestTensor("input", {1, 2, 3}); + AddTestWeights( + "weights", {3, 3, 1, 1, 1}, + {1, 2, 3, 4, 5, 6, 7, 8, 9}); // Dimensions, then values + RunValidationAndConversion( + node_def, error::INVALID_ARGUMENT, + "Convolution dilations field must specify 5 dimensions, at my_conv3d"); + } + { + // Dilation value is not 1 for channel, should fail. + Reset(); + NodeDef node_def = + get_conv3d_nodedef({1, 1, 1, 1, 1}, "SAME", "NCDHW", {1, 2, 1, 1, 1}); + AddTestTensor("input", {1, 2, 3}); + AddTestWeights("weights", {3, 3, 1, 1, 1}, + {1, 2, 3, 4, 5, 6, 7, 8, 9}); + RunValidationAndConversion(node_def, error::UNIMPLEMENTED, + "Dilation rate must be 1 for batch and channel " + "dimensions, at my_conv3d"); + } + { + // Dilation value is not 1 for channel (NDHWC), should fail. + Reset(); + NodeDef node_def = + get_conv3d_nodedef({1, 1, 1, 1, 1}, "SAME", "NDHWC", {1, 1, 1, 1, 2}); + AddTestTensor("input", {2, 3, 1}); + AddTestWeights("weights", {3, 3, 1, 1, 1}, + {1, 2, 3, 4, 5, 6, 7, 8, 9}); + RunValidationAndConversion(node_def, error::UNIMPLEMENTED, + "Dilation rate must be 1 for batch and channel " + "dimensions, at my_conv3d"); + } + { + // Dilation + Conv3DBackpropInputV2, should fail. 
+ Reset(); + NodeDef node_def = get_conv3d_nodedef({1, 1, 1, 1, 1}, "SAME", "NDHWC", + {1, 1, 2, 1, 1}, true); + AddTestTensor("input", {2, 3, 1}); + AddTestWeights("weights", {3, 3, 1, 1, 1}, + {1, 2, 3, 4, 5, 6, 7, 8, 9}); + AddTestWeights("input_sizes", {4}, {1, 2, 3, 1}); + RunValidationAndConversion(node_def, error::UNIMPLEMENTED, + "Dilation with Conv3DBackpropInputV2 " + "(conv3d_transpose) is not supported, " + "at my_conv3d"); + } + { + // Asymmetric+ Conv3DBackpropInputV2, should fail. + Reset(); + NodeDef node_def = get_conv3d_nodedef({1, 1, 1, 1, 1}, "SAME", "NDHWC", + {1, 1, 1, 1, 1}, true); + AddTestTensor("input", {1, 2, 2, 2}); + AddTestWeights("weights", {1, 1, 2, 1, 1}, {1, 1}); + AddTestWeights("input_sizes", {8}, {1, 2, 3, 4, 5, 6, 7, 8}); + RunValidationAndConversion(node_def, error::UNIMPLEMENTED, + "Asymmetric padding with Conv3DBackpropInputV2 " + "(conv3d_transpose) is not supported, at " + "my_conv3d"); + } + { + // Strides is not 5D, should fail. + Reset(); + NodeDef node_def = get_conv3d_nodedef({1, 1, 1, 1, 1, 1}, "SAME", "NCDHW", + {1, 1, 1, 1, 1}); + AddTestTensor("input", {1, 2, 2, 2}); + AddTestWeights("weights", {1, 1, 2, 1, 1}, {1, 1}); + RunValidationAndConversion( + node_def, error::INVALID_ARGUMENT, + "Convolution strides field must specify 5 dimensions, at my_conv3d"); + } + { + // Stride value is not 1 for channel, should fail. + Reset(); + NodeDef node_def = + get_conv3d_nodedef({1, 2, 1, 1, 1}, "SAME", "NCDHW", {1, 1, 1, 1, 1}); + AddTestTensor("input", {1, 2, 3}); + AddTestWeights("weights", {3, 3, 1, 1, 1}, + {1, 2, 3, 4, 5, 6, 7, 8, 9}); + RunValidationAndConversion( + node_def, error::UNIMPLEMENTED, + "Stride must be 1 for batch and channel dimensions, at my_conv3d"); + } + struct TestParams { + std::vector input_dims; + std::vector input; + std::vector filter_dims; + std::vector filter; + std::vector strides; + string padding; + string data_format; + std::vector dilations; + bool is_conv3d_backprop_input; + std::vector expected_output_dims; + std::vector expected_output; + }; + + // Start here + const int kConv3DOKCases = 8; + TestParams ok_params[kConv3DOKCases] = { + // Basic - just 1x1 conv - input = output + TestParams{ + /*input_dims=*/{1, 3, 3, 3}, // CDHW + /*input=*/{1, 2, 15, 3, 6, -3, 22, 1, 88, 56, 36, 1, 1, 105, + 1, 16, -28, 1, 42, 9, 3, 1, 7, 1, 11, 61, 5}, + /*filter_dims=*/{1, 1, 1, 1, 1}, // DRSCK + /*filter=*/{1}, + /*strides=*/{1, 1, 1, 1, 1}, + /*padding=*/"VALID", + /*data_format=*/"NCDHW", + /*dilations=*/{1, 1, 1, 1, 1}, + /*is_conv3d_backprop_input=*/false, + /*expected_output_dims=*/{1, 3, 3, 3}, + /*expected_output=*/{1, 2, 15, 3, 6, -3, 22, 1, 88, + 56, 36, 1, 1, 105, 1, 16, -28, 1, + 42, 9, 3, 1, 7, 1, 11, 61, 5}}, + // Basic - 2x1 filter + TestParams{/*input_dims=*/{1, 3, 3, 3}, // CDHW + /*input=*/{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 6}, + /*filter_dims=*/{2, 1, 1, 1, 1}, // DRSCK + /*filter=*/{1, 1}, + /*strides=*/{1, 1, 1, 1, 1}, + /*padding=*/"VALID", + /*data_format=*/"NCDHW", + /*dilations=*/{1, 1, 1, 1, 1}, + /*is_conv3d_backprop_input=*/false, + /*expected_output_dims=*/{1, 2, 3, 3}, + /*expected_output=*/ + {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 7}}, + // SAME padding (Asymmetric) + TestParams{ + /*input_dims=*/{1, 2, 3, 2}, // CDHW + /*input=*/{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}, + /*filter_dims=*/{2, 1, 1, 1, 1}, // DRSCK + /*filter=*/{-1, 1}, + /*strides=*/{1, 1, 1, 1, 1}, + /*padding=*/"SAME", + /*data_format=*/"NCDHW", + /*dilations=*/{1, 1, 
1, 1, 1}, + /*is_conv3d_backprop_input=*/false, + /*expected_output_dims=*/{1, 2, 3, 2}, + /*expected_output=*/ + {6, 6, 6, 6, 6, 6, -6, -7, -8, -9, -10, + -11} // Diff in first 2 depths is const 6 + }, + // SAME padding (Symmetric) + TestParams{ + /*input_dims=*/{1, 2, 3, 2}, // CDHW + /*input=*/{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}, + /*filter_dims=*/{3, 1, 1, 1, 1}, // DRSCK + /*filter=*/{-1, 0, 1}, + /*strides=*/{1, 1, 1, 1, 1}, + /*padding=*/"SAME", + /*data_format=*/"NCDHW", + /*dilations=*/{1, 1, 1, 1, 1}, + /*is_conv3d_backprop_input=*/false, + /*expected_output_dims=*/{1, 2, 3, 2}, + /*expected_output=*/ + {6, 7, 8, 9, 10, 11, 0, -1, -2, -3, -4, + -5} // Swaps front two depths, negates + }, + + // NDHWC (multi-channel) + TestParams{ + /*input_dims=*/{2, 3, 2, 2}, // DHWC + /*input=*/{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}, + /*filter_dims=*/{2, 1, 1, 2, 1}, // DRSCK + /*filter=*/{-1, 1, 1, -1}, + /*strides=*/{1, 1, 1, 1, 1}, + /*padding=*/"VALID", + /*data_format=*/"NDHWC", + /*dilations=*/{1, 1, 1, 1, 1}, + /*is_conv3d_backprop_input=*/false, + /*expected_output_dims=*/{1, 3, 2, 1}, + /*expected_output=*/{0, 0, 0, 0, 0, 0} // Each filter opposes the + // other + }, + + // Dilated + TestParams{ + /*input_dims=*/{1, 3, 3, 3}, // CDHW + /*input=*/{1, 1, 1, 1, 1, 1, 1, 1, 1, -10, -10, -10, -10, -10, + -10, -10, -10, -10, 7, 7, 7, 7, 7, 7, 7, 7, 7}, + /*filter_dims=*/{2, 1, 1, 1, 1}, // DRSCK + /*filter=*/{1, 1}, + /*strides=*/{1, 1, 1, 1, 1}, + /*padding=*/"VALID", + /*data_format=*/"NCDHW", + /*dilations=*/{1, 1, 2, 1, 1}, + /*is_conv3d_backprop_input=*/false, + /*expected_output_dims=*/{1, 1, 3, 3}, + /*expected_output=*/{8, 8, 8, 8, 8, 8, 8, 8, 8} // Only front depth + // is valid, skips + // neg values + }, + // Strided + TestParams{ + /*input_dims=*/{1, 3, 3, 3}, + /*input=*/{1, 0, 2, 0, 0, 0, 3, 0, 4, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 5, 0, 6, 0, 0, 0, 7, 0, 8}, + /*filter_dims=*/{1, 1, 1, 1, 1}, + /*filter=*/{1}, + /*strides=*/{1, 1, 2, 2, 2}, + /*padding=*/"VALID", + /*data_format=*/"NCDHW", + /*dilations=*/{1, 1, 1, 1, 1}, + /*is_conv3d_backprop_input=*/false, + /*expected_output_dims=*/{1, 2, 2, 2}, + /*expected_output=*/{1, 2, 3, 4, 5, 6, 7, 8} // Should only pick up + // the corners + }, + // Transpose Strided + TestParams{/*input_dims=*/{1, 2, 2, 2}, // CDHW + /*input=*/{1, 2, 3, 4, 5, 6, 7, 8}, + /*filter_dims=*/{1, 1, 1, 1, 1}, + /*filter=*/{1}, + /*strides=*/{1, 1, 2, 2, 2}, + /*padding=*/"VALID", + /*data_format=*/"NCDHW", + /*dilations=*/{1, 1, 1, 1, 1}, + /*is_conv3d_backprop_input=*/true, + /*expected_output_dims=*/{1, 3, 3, 3}, + /*expected_output=*/ + {1, 0, 2, 0, 0, 0, 3, 0, 4, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 5, 0, 6, 0, 0, 0, 7, 0, 8}}, // Cube + // expands and + // fills + // center with + // zeroes + + }; + + for (int i = 0; i < kConv3DOKCases; i++) { + Reset(); + NodeDef node_def = get_conv3d_nodedef( + ok_params[i].strides, ok_params[i].padding, ok_params[i].data_format, + ok_params[i].dilations, ok_params[i].is_conv3d_backprop_input); + AddTestTensor("input", ok_params[i].input_dims); + AddTestWeights("weights", ok_params[i].filter_dims, + ok_params[i].filter); + if (ok_params[i].is_conv3d_backprop_input) { + AddTestWeights( + "input_sizes", + {static_cast(ok_params[i].expected_output.size())}, + ok_params[i].expected_output); + } + RunValidationAndConversion(node_def); + TRT_TensorOrWeights output; + TF_EXPECT_OK(GetTensorOrWeights("my_conv3d", &output)); + ASSERT_TRUE(output.is_tensor()); + 
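The "Dilated" case above can be checked by hand: a 2x1x1x1x1 filter of {1, 1} with depth dilation 2 taps depths 0 and 2 and skips the -10 plane at depth 1, so every output element is 1 + 7 = 8. A standalone sketch of that arithmetic, assuming the CDHW layout used by the test (C = 1, 3x3 planes):

// Standalone check of the "Dilated" Conv3D expectation: with filter {1, 1}
// and depth dilation 2, output(h, w) = input(d=0, h, w) + input(d=2, h, w).
#include <iostream>
#include <vector>

int main() {
  const std::vector<float> input = {
      1,   1,   1,   1,   1,   1,   1,   1,   1,    // depth 0 (3x3 plane)
      -10, -10, -10, -10, -10, -10, -10, -10, -10,  // depth 1, skipped by dilation
      7,   7,   7,   7,   7,   7,   7,   7,   7};   // depth 2
  const int plane = 9;  // spatial positions per depth
  for (int i = 0; i < plane; ++i) {
    std::cout << input[0 * plane + i] + input[2 * plane + i] << " ";
  }
  std::cout << "\n";  // prints 8 8 8 8 8 8 8 8 8, matching expected_output
  return 0;
}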
ExpectTrtDimsEqualsArray(ok_params[i].expected_output_dims, + output.tensor()->getDimensions()); + + const DataVec input_data{ + {"input", test::AsTensor(ok_params[i].input)}}; + DataVec output_data{ + {"my_conv3d", + ConstructTensor(ok_params[i].expected_output.size())}}; + BuildAndRun(input_data, &output_data); + EXPECT_THAT(GetSpanForData(output_data[0]), + ElementsAreArray(ok_params[i].expected_output)); + } +} +#endif // IS_TRT_VERSION_GE(6, 0, 0, 0) + TEST_F(OpConverterTest, ConvertTopK) { // TODO(tmorris): This test isn't setting the input dtype properly. TopK with // int32 is unsupported by TRT. @@ -5847,6 +6188,111 @@ TEST_F(OpConverterTest, ConvertResize) { } #endif // IS_TRT_VERSION_GE(6, 0, 0, 0) +NodeDef MakePadNodeDef(std::string name, DataType dtype) { + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), dtype); + auto padding = ops::Placeholder(s.WithOpName("padding"), DT_INT32); + auto pad = ops::Pad(s.WithOpName(name), input, padding); + return pad.operation.node()->def(); +} + +template +struct PadTestParams { + std::vector input_dims; + std::vector pad_dims; + std::vector input_values; + std::vector expected_output_dims; + std::vector expected_output_values; +}; + +template +void TestConvertPad(OpConverterTest* test) { + typedef typename EnumToDataType::Type CType; + + std::vector> params{ + { + /*input_dims=*/{1, 2, 1}, // H, W, C + /*pad_dims=*/{4, 2}, // #dims, {pad_before, pad_after} + /*input_values=*/CastTestVector({2.0f, -1.0f}), + /*expected_output_dims=*/{2, 3, 1}, // H, W, C + /*expected_output_values=*/ + CastTestVector({0.0, 0.0, 0.0, 2.0f, -1.0f, 0.0}), + }, + }; + + for (int i = 0; i < params.size(); ++i) { + test->Reset(); + // Create pad node. + NodeDef node_def = MakePadNodeDef("my_pad", dtype); + // Create input tensor + test->AddTestTensor("input", params[i].input_dims, /*batch_size=*/1, + /*trt_dtype=*/TfDataTypeToTrt(dtype)); + // Create output size. + test->AddTestWeights("padding", params[i].pad_dims, + {0, 0, 1, 0, 0, 1, 0, 0}); + test->RunValidationAndConversion(node_def); + + TRT_TensorOrWeights output; + TF_EXPECT_OK(test->GetTensorOrWeights("padding", &output)); + + // Create input data for tensors. + const DataVec input_data{ + {"input", test::AsTensor(params[i].input_values)}}; + DataVec output_data{ + {"my_pad", + ConstructTensor(params[i].expected_output_values.size())}}; + + test->BuildAndRun( + input_data, &output_data, + dtype == DT_HALF ? TrtPrecisionMode::FP16 : TrtPrecisionMode::FP32); + ExpectArrayAlmostEqual(params[i].expected_output_values, + GetSpanForData(output_data[0]), CType(1e-5)); + } +} + +TEST_F(OpConverterTest, ConvertPad) { + { + // First input is weight, should fail. + Reset(); + NodeDef node_def = MakePadNodeDef("my_pad", DT_FLOAT); + AddTestWeights("input", {1, 2}, {1, 2}); + AddTestWeights("padding", {1, 2}, {1, 2}); + RunValidationAndConversion(node_def, error::UNIMPLEMENTED, + "The input \"tensor\" for Pad must be a " + "tensor"); + } + { + // padding is a tensor, should fail. + Reset(); + NodeDef node_def = MakePadNodeDef("my_pad", DT_FLOAT); + AddTestTensor("input", {1, 2}); + AddTestTensor("padding", {1, 2}); + RunValidationAndConversion(node_def, error::UNIMPLEMENTED, + "The input \"paddings\" for Pad must be a " + "constant"); + } + TestConvertPad(this); + TestConvertPad(this); + { + // Make sure that ranges are inferred across a Pad. 
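For reference, the numbers in the TestConvertPad parameters above follow from the paddings weight {0, 0, 1, 0, 0, 1, 0, 0} in NHWC order: one zero row is added before H and one zero column after W, turning the 1x2x1 input [2, -1] into the 2x3x1 expected output. A standalone sketch of that zero padding (the quantization-range check for Pad continues below):

// Standalone sketch of the Pad expectation: pad one row before H and one
// column after W of a 1x2 (HxW) input, filling with zeros.
#include <iostream>
#include <vector>

int main() {
  const std::vector<float> input = {2.0f, -1.0f};  // H=1, W=2, C=1
  const int H = 1, W = 2, pad_top = 1, pad_left = 0;
  const int out_h = H + pad_top + /*pad_bottom=*/0;
  const int out_w = W + pad_left + /*pad_right=*/1;
  std::vector<float> out(out_h * out_w, 0.0f);
  for (int h = 0; h < H; ++h) {
    for (int w = 0; w < W; ++w) {
      out[(h + pad_top) * out_w + (w + pad_left)] = input[h * W + w];
    }
  }
  for (float v : out) std::cout << v << " ";
  std::cout << "\n";  // prints 0 0 0 2 -1 0, i.e. the 2x3x1 expected output
  return 0;
}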
+ Reset(); + NodeDef node_def = MakePadNodeDef("my_pad", DT_FLOAT); + AddTestTensor("input", {1, 2, 1}); + AddTestWeights("padding", {4, 2}, {0, 0, 1, 0, 0, 1, 0, 0}); + TRT_TensorOrWeights input; + TRT_TensorOrWeights output; + RunValidationAndConversion(node_def); + TF_EXPECT_OK(GetTensorOrWeights("input", &input)); + TF_EXPECT_OK(GetTensorOrWeights("my_pad", &output)); + converter_->ProvideQuantizationRange(input.tensor(), -5.0f, 5.0f); + // Input range should be inferred across pad. + PropagateQuantizationRanges(); + auto ranges = quantization_ranges(); + EXPECT_EQ(5.0f, ranges[input.tensor()]); + EXPECT_EQ(5.0f, ranges[output.tensor()]); + } +} + } // namespace convert } // namespace tensorrt } // namespace tensorflow diff --git a/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc index 6af483d37cf..35a8c6340f8 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc @@ -67,9 +67,6 @@ Status TRTOptimizationPass::Init( if (params.count("use_calibration")) { use_calibration_ = params.at("use_calibration").b(); } - if (params.count("use_function_backup")) { - use_function_backup_ = params.at("use_function_backup").b(); - } return Status::OK(); } @@ -193,31 +190,30 @@ Status TRTOptimizationPass::Optimize(grappler::Cluster* cluster, LOG(INFO) << CurrentStackTrace(); PrintDebugInfo(cluster, item); } - int max_dim = -1; - if (!item.feed.empty()) { - for (const auto& f : item.feed) { - const auto& shape = f.second.shape(); - if (shape.dims() > 0) { - if (shape.dim_size(0) > max_dim) max_dim = shape.dim_size(0); + if (!is_dynamic_op_) { + int max_batch_dim = -1; + if (!item.feed.empty()) { + for (const auto& f : item.feed) { + const auto& shape = f.second.shape(); + if (shape.dims() > 0) { + if (shape.dim_size(0) > max_batch_dim) + max_batch_dim = shape.dim_size(0); + VLOG(2) << "Setting max_batch_dim to " << max_batch_dim + << " using batch dimension of " << f.first << " with shape " + << shape; + } } } - } - if (maximum_batch_size_ < 0) { // automatic batch size from input - if (max_dim > 0) { - maximum_batch_size_ = max_dim; - VLOG(1) << "Setting maximum batch size to " << max_dim; - } else { - maximum_batch_size_ = 128; - LOG(WARNING) << "Maximum batch size is not set" - " and can't be deduced from inputs setting it to" - << maximum_batch_size_ - << ". Suggest configuring it from configuration parameters"; - } - } else { - if (max_dim > maximum_batch_size_) { - LOG(WARNING) << "Configured batch size " << maximum_batch_size_ - << " is less than input batch size " << max_dim - << " adjusting maximum batch size to match input batch size"; + if (max_batch_dim > maximum_batch_size_) { + return errors::InvalidArgument( + "Specified max_batch_size=", maximum_batch_size_, + " is less than maximum batch dimension of inputs (", max_batch_dim, + "). ", "To continue, set max_batch_size to >= ", max_batch_dim); + } else if (max_batch_dim < maximum_batch_size_) { + LOG(INFO) << "Specified max_batch_size=" << maximum_batch_size_ + << " is larger than maximum batch dimension of inputs (" + << max_batch_dim << "). 
" + << "This can result in poor performance."; } } grappler::GraphProperties static_graph_properties(item); @@ -259,7 +255,6 @@ Status TRTOptimizationPass::Optimize(grappler::Cluster* cluster, cp.is_dyn_op = is_dynamic_op_; cp.max_cached_engines = max_cached_batches_; cp.use_calibration = use_calibration_; - cp.use_function_backup = use_function_backup_; auto status = ConvertAfterShapes(cp); VLOG(1) << "Returning from " << name_; return status; diff --git a/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h index d3fd914b302..35a92341ee9 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h +++ b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h @@ -40,13 +40,14 @@ class TRTOptimizationPass : public grappler::CustomGraphOptimizer { is_dynamic_op_(false), max_cached_batches_(1), max_workspace_size_bytes_(256LL << 20), - use_calibration_(true), - use_function_backup_(true) { + use_calibration_(true) { VLOG(1) << "Constructing " << name_; } string name() const override { return name_; }; + bool UsesFunctionLibrary() const override { return true; } + Status Init( const RewriterConfig_CustomGraphOptimizer* config = nullptr) override; @@ -71,8 +72,6 @@ class TRTOptimizationPass : public grappler::CustomGraphOptimizer { int64_t max_workspace_size_bytes_; bool use_calibration_; - // Whether to allow TF function fallback path in TRTEngineOp. - bool use_function_backup_; }; } // namespace convert diff --git a/tensorflow/compiler/tf2tensorrt/convert/utils.h b/tensorflow/compiler/tf2tensorrt/convert/utils.h index 91c8c660f85..eb60829d31d 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/utils.h +++ b/tensorflow/compiler/tf2tensorrt/convert/utils.h @@ -23,6 +23,12 @@ limitations under the License. namespace tensorflow { namespace tensorrt { +class IONamePrefixes { + public: + static constexpr const char* const kInputPHName = "TensorRTInputPH_"; + static constexpr const char* const kOutputPHName = "TensorRTOutputPH_"; +}; + template struct TrtDestroyer { void operator()(T* t) { diff --git a/tensorflow/compiler/tf2tensorrt/kernels/get_calibration_data_op.cc b/tensorflow/compiler/tf2tensorrt/kernels/get_calibration_data_op.cc index 2898602b879..3143b06817e 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/get_calibration_data_op.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/get_calibration_data_op.cc @@ -16,7 +16,7 @@ limitations under the License. #include #include -#include "tensorflow/compiler/tf2tensorrt/utils/calibration_resource.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/resource_mgr.h" @@ -39,28 +39,25 @@ class GetCalibrationDataOp : public OpKernel { // TODO(laigd): it will allocate the tensor on the device and copy the // serialized string to that tensor, and later sess.run() will copy it back // to host. We need to optimize this. - const string& resource_name = context->input(0).scalar()(); + const string& resource_name = context->input(0).scalar()(); // Get the resource. 
- TRTCalibrationResource* resource = nullptr; + TRTEngineCacheResource* resource = nullptr; OP_REQUIRES_OK(context, context->resource_manager()->Lookup( - std::string(kCalibrationContainerName), - resource_name, &resource)); + std::string(kTfTrtContainerName), resource_name, + &resource)); core::ScopedUnref sc(resource); // Serialize the resource as output. - string serialized_resource; - OP_REQUIRES_OK(context, resource->SerializeToString(&serialized_resource)); + string serialized_resource = resource->calib_ctx_->TerminateCalibration(); + OP_REQUIRES(context, !serialized_resource.empty(), + errors::Unknown("Calibration table is empty.")); Tensor* output = nullptr; OP_REQUIRES_OK(context, context->allocate_output(0, TensorShape({}), &output)); - // Destroy the resource. - OP_REQUIRES_OK(context, - context->resource_manager()->Delete( - std::string(kCalibrationContainerName), resource_name)); - output->scalar()() = serialized_resource; + output->scalar()() = serialized_resource; } }; diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc index ab0b21edc41..646a44f1405 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc @@ -17,18 +17,23 @@ limitations under the License. #include #include "absl/memory/memory.h" +#include "absl/strings/ascii.h" #include "absl/strings/str_cat.h" #include "absl/strings/string_view.h" #include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" #include "tensorflow/compiler/tf2tensorrt/convert/utils.h" -#include "tensorflow/compiler/tf2tensorrt/utils/calibration_resource.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h" +#include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/core/common_runtime/graph_optimizer.h" #include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/graph_to_functiondef.h" +#include "tensorflow/core/framework/node_def_builder.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/graph/algorithm.h" +#include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/lib/core/refcount.h" #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/lib/strings/strcat.h" @@ -54,6 +59,7 @@ using ::stream_executor::port::StatusOr; // A helper class to call done() when destructed for asynchronous execution. // Helps simultaneous execution of native and TRT engines. + class AsyncHelper : public core::RefCounted { public: AsyncHelper(AsyncOpKernel::DoneCallback done) : done_(done) {} @@ -87,10 +93,15 @@ class TRTEngineOp : public AsyncOpKernel { VectorTensorShapeHasher>; // Execute calibration - void ExecuteCalibration(OpKernelContext* ctx, AsyncHelper* helper); + void ExecuteCalibration(OpKernelContext* ctx, + TRTEngineCacheResource* cache_res, + AsyncHelper* helper); // Construct a function handle for executing native funcdef graph - Status ConstructFunctionHandle(OpKernelContext* ctx); + // These are the exact same function. + + Status ConstructFunctionHandle(FunctionLibraryRuntime* lib, + const string& device_name); // Execute replaced native segment as function Op. 
void ExecuteNativeSegment(OpKernelContext* ctx, AsyncHelper* helper); @@ -101,15 +112,15 @@ class TRTEngineOp : public AsyncOpKernel { // Allocate necessary resources for calibration Status AllocateCalibrationResources(OpKernelContext* ctx, - TRTEngineCacheResource* cache_res, - TRTCalibrationResource** cr); + TRTEngineCacheResource* cache_res); Status GetEngineCacheResource(OpKernelContext* ctx, TRTEngineCacheResource** cache_res); // Get engine for the input shape StatusOr GetEngine( - const std::vector& input_shapes, OpKernelContext* ctx); + const std::vector& input_shapes, OpKernelContext* ctx, + TRTEngineCacheResource* cache_res); // Verify that the input shapes are consistent and can be handled by this op. Status VerifyInputShapes(const std::vector& shapes); @@ -127,10 +138,8 @@ class TRTEngineOp : public AsyncOpKernel { // serialized protobuf segment or trt engine depending on static_engine_ flag. string serialized_segment_; - // Name of the function for TF native execution of the segment. If empty, it - // means TF native execution is not allowed, and if TRT engine fails to run - // an error will be returned. - string funcdef_name_; + // The function for TF native execution of the segment. + NameAttrList func_; // GraphDef representation of the segment. GraphDef segment_graph_; @@ -150,7 +159,7 @@ class TRTEngineOp : public AsyncOpKernel { int64 workspace_size_; mutex engine_mutex_; - FunctionLibraryRuntime::Handle native_func_; + FunctionLibraryRuntime::Handle func_handle_; // The finalized calibrator for inference. std::unique_ptr calibrator_; @@ -179,23 +188,61 @@ void* GetTensorAddress(const Tensor* tensor_ptr) { } } -Status TRTEngineOp::ConstructFunctionHandle(OpKernelContext* ctx) { +static Status FunctionDefToGraphDef(FunctionLibraryRuntime::Handle handle, + FunctionLibraryRuntime* flib_runtime, + GraphDef* graph_def) { + const FunctionLibraryDefinition* flib_def = + flib_runtime->GetFunctionLibraryDefinition(); + const FunctionBody* fbody; + fbody = flib_runtime->GetFunctionBody(handle); + if (!fbody) { + return errors::Internal( + "Function body is null when converting from FuncDef to GraphDef."); + } + std::unique_ptr graph(new Graph(flib_def)); + CopyGraph(*fbody->graph, graph.get()); + + auto replace_name = [](const char* const prefix, string* name) { + if (absl::StartsWith(*name, absl::AsciiStrToLower(prefix))) { + name->replace(0, strlen(prefix), prefix); + return true; + } + return false; + }; + graph->ToGraphDef(graph_def); + // GraphToFunctionDef() will convert all the node names to lowercase. + for (auto& node : *graph_def->mutable_node()) { + if (!replace_name(IONamePrefixes::kInputPHName, node.mutable_name())) { + if (replace_name(IONamePrefixes::kOutputPHName, node.mutable_name())) { + // Instantiation of the function will append _RetVal to the node name, + // need to remove it for backward compatibility. 
+ const char* const suffix_to_remove = "_RetVal"; + if (absl::EndsWith(node.name(), suffix_to_remove)) { + node.mutable_name()->erase(node.name().size() - + strlen(suffix_to_remove)); + } + } + } + for (auto& input : *node.mutable_input()) { + if (!replace_name(IONamePrefixes::kInputPHName, &input)) { + replace_name(IONamePrefixes::kOutputPHName, &input); + } + } + } + return Status::OK(); +} + +Status TRTEngineOp::ConstructFunctionHandle(FunctionLibraryRuntime* lib, + const string& device_name) { VLOG(1) << "Constructing function handle"; - auto lib = ctx->function_library(); if (lib == nullptr) { return errors::Internal("Context function library is null"); } - auto fdef = lib->GetFunctionLibraryDefinition()->Find(funcdef_name_); - if (fdef == nullptr) { - return errors::Internal("Native FunctionDef ", funcdef_name_, - " can't be found in function library"); - } FunctionLibraryRuntime::InstantiateOptions inst_ops; inst_ops.state_handle = ""; - inst_ops.target = ctx->device()->name(); - native_func_ = 0; - return lib->Instantiate(funcdef_name_, AttrSlice(&fdef->attr()), inst_ops, - &native_func_); + inst_ops.target = device_name; + return lib->Instantiate(func_.name(), AttrSlice(&func_.attr()), inst_ops, + &func_handle_); } TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) @@ -206,15 +253,7 @@ TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) OP_REQUIRES_OK(context, context->GetAttr("workspace_size_bytes", &workspace_size_)); OP_REQUIRES_OK(context, context->GetAttr("static_engine", &static_engine_)); - if (!static_engine_) { - OP_REQUIRES(context, segment_graph_.ParseFromString(serialized_segment_), - errors::InvalidArgument("Failed to parse segment graphdef!")); - VLOG(1) << "Size of serialized GraphDef: " - << serialized_segment_.capacity(); - string tmp; - // Swap with temporary empty string to deallocate the CPU memory. - serialized_segment_.swap(tmp); - } + VLOG(1) << "Constructing " << name(); string precision_string; OP_REQUIRES_OK(context, @@ -222,12 +261,25 @@ TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) string calibration_data; OP_REQUIRES_OK(context, context->GetAttr("calibration_data", &calibration_data)); - OP_REQUIRES_OK(context, - context->GetAttr("segment_funcdef_name", &funcdef_name_)); + OP_REQUIRES_OK(context, context->GetAttr("segment_func", &func_)); + OP_REQUIRES(context, !func_.name().empty(), + errors::InvalidArgument( + "The TF function for the TRT segment could not be empty")); OP_REQUIRES_OK(context, TrtPrecisionModeFromName(precision_string, &precision_mode_)); OP_REQUIRES_OK(context, context->GetAttr("use_calibration", &use_calibration_)); + func_handle_ = kInvalidHandle; + if (!static_engine_) { + FunctionLibraryRuntime* lib = context->function_library(); + OP_REQUIRES_OK(context, + ConstructFunctionHandle(lib, context->device()->name())); + OP_REQUIRES_OK(context, + FunctionDefToGraphDef(func_handle_, lib, &segment_graph_)); + } + // TODO(laigd): calibration_data is used in TF v1.x and we keep it only for + // backward compatibility reasons. Remove it once all known users switch to + // 2.0. 
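FunctionDefToGraphDef above undoes two naming artifacts noted in its comments: GraphToFunctionDef lower-cases node names, and function instantiation appends "_RetVal" to return nodes, so a boundary node such as TensorRTOutputPH_0 comes back as tensorrtoutputph_0_RetVal. A standalone sketch of that fix-up; RestoreName is an illustrative helper, the prefix value comes from utils.h, and plain std::string calls stand in for the absl helpers:

// Standalone sketch of the name restoration: re-apply the original prefix
// casing and strip the "_RetVal" suffix added by instantiation.
#include <algorithm>
#include <cctype>
#include <iostream>
#include <string>

constexpr const char* kOutputPHName = "TensorRTOutputPH_";  // IONamePrefixes value

std::string RestoreName(std::string name) {
  std::string lower(kOutputPHName);
  std::transform(lower.begin(), lower.end(), lower.begin(),
                 [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
  if (name.compare(0, lower.size(), lower) == 0) {
    name.replace(0, lower.size(), kOutputPHName);  // restore original casing
    const std::string suffix = "_RetVal";
    if (name.size() >= suffix.size() &&
        name.compare(name.size() - suffix.size(), suffix.size(), suffix) == 0) {
      name.erase(name.size() - suffix.size());  // drop the instantiation suffix
    }
  }
  return name;
}

int main() {
  std::cout << RestoreName("tensorrtoutputph_0_RetVal") << "\n";
  // prints: TensorRTOutputPH_0
  return 0;
}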
calibration_mode_ = (use_calibration_ && precision_mode_ == TrtPrecisionMode::INT8 && calibration_data.empty()); @@ -235,20 +287,19 @@ TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) calibrator_.reset(new TRTInt8Calibrator(calibration_data)); calibration_data.resize(0); } - native_func_ = kInvalidHandle; OP_REQUIRES_OK(context, context->GetAttr("max_cached_engines_count", &max_cached_engines_)); } void TRTEngineOp::ExecuteNativeSegment(OpKernelContext* ctx, AsyncHelper* helper) { - OP_REQUIRES_ASYNC(ctx, !funcdef_name_.empty(), - errors::Internal("Fallback path is disabled, for ", name()), - *helper); std::vector inputs; std::vector* outputs = new std::vector(); - if (native_func_ == kInvalidHandle) { - OP_REQUIRES_OK_ASYNC(ctx, ConstructFunctionHandle(ctx), *helper); + if (func_handle_ == kInvalidHandle) { + OP_REQUIRES_OK_ASYNC( + ctx, + ConstructFunctionHandle(ctx->function_library(), ctx->device()->name()), + *helper); } auto lib = ctx->function_library(); FunctionLibraryRuntime::Options opts; @@ -261,7 +312,7 @@ void TRTEngineOp::ExecuteNativeSegment(OpKernelContext* ctx, } helper->Ref(); // Increment count for calculating native graph VLOG(1) << "Executing native segment: " << name(); - lib->Run(opts, native_func_, inputs, outputs, + lib->Run(opts, func_handle_, inputs, outputs, [this, ctx, outputs, helper](const Status& s) { core::ScopedUnref sc(helper); OP_REQUIRES_OK_ASYNC(ctx, s, *helper); @@ -274,27 +325,14 @@ void TRTEngineOp::ExecuteNativeSegment(OpKernelContext* ctx, } void TRTEngineOp::ExecuteCalibration(OpKernelContext* ctx, + TRTEngineCacheResource* cache_res, AsyncHelper* helper) { VLOG(1) << "Executing TRT calibration: " << name(); helper->Ref(); core::ScopedUnref sc(helper); - // Get the cache resource outside the LookupOrCreate() below to avoid - // deadlock. - TRTEngineCacheResource* cache_res = nullptr; - OP_REQUIRES_OK_ASYNC(ctx, GetEngineCacheResource(ctx, &cache_res), *helper); - core::ScopedUnref unref_cache_res(cache_res); - TRTCalibrationResource* calib_res = nullptr; - OP_REQUIRES_OK_ASYNC( - ctx, - ctx->resource_manager()->LookupOrCreate( - std::string(kCalibrationContainerName), name(), - reinterpret_cast(&calib_res), - {[ctx, cache_res, this](TRTCalibrationResource** cr) -> Status { - return this->AllocateCalibrationResources(ctx, cache_res, cr); - }}), - *helper); - core::ScopedUnref calib_sc(calib_res); - int num_inputs = ctx->num_inputs(); + + CalibrationContext* calib_ctx = cache_res->calib_ctx_.get(); + const int num_inputs = ctx->num_inputs(); // TODO(laigd): need to check that input shape matches. // Pass input data to calibrator std::unordered_map input_data; @@ -307,9 +345,9 @@ void TRTEngineOp::ExecuteCalibration(OpKernelContext* ctx, *helper); // Check the allocated buffer is sufficient for input const auto device_tensor = - calib_res->device_tensors_.at(i).AccessTensor(ctx); + calib_ctx->device_tensors_.at(i).AccessTensor(ctx); CHECK_EQ(t.TotalBytes(), device_tensor->TotalBytes()); - input_data.emplace(StrCat(kInputPHName, i), data_address); + input_data.emplace(StrCat(IONamePrefixes::kInputPHName, i), data_address); } VLOG(2) << "Filled map for sending"; // copied from cuda_kernel_helper since it seems only valid in *.cu.cc files @@ -326,7 +364,7 @@ void TRTEngineOp::ExecuteCalibration(OpKernelContext* ctx, // until setDone() is called later by the calibration thread in // AllocateCalibrationResources(). In that case, this setBatch() will always // be able to detect the error and return false. 
- OP_REQUIRES_ASYNC(ctx, calib_res->calibrator_->setBatch(input_data, *stream), + OP_REQUIRES_ASYNC(ctx, calib_ctx->calibrator_->setBatch(input_data, *stream), errors::Internal("Failed to feed calibration data"), *helper); VLOG(2) << "Passed calibration data"; @@ -354,9 +392,8 @@ Status TRTEngineOp::VerifyInputShapes(const std::vector& shapes) { return Status::OK(); } -Status TRTEngineOp::GetEngineInputShapes( - const CacheType& cache, const std::vector& actual_input_shapes, - std::vector* engine_input_shapes) { +bool AreShapesCompatible(const std::vector& actual_shapes, + const std::vector& cached_shapes) { auto match_shape = [](const TensorShape& actual_shape, const TensorShape& cached_shape) { // Match the rank. @@ -369,16 +406,17 @@ Status TRTEngineOp::GetEngineInputShapes( } return true; }; - auto match_shapes = [&](const std::vector& actual_shapes, - const std::vector& cached_shapes) { - for (int i = 0; i < actual_shapes.size(); ++i) { - if (!match_shape(actual_shapes[i], cached_shapes[i])) { - return false; - } + for (int i = 0; i < actual_shapes.size(); ++i) { + if (!match_shape(actual_shapes[i], cached_shapes[i])) { + return false; } - return true; - }; + } + return true; +} +Status TRTEngineOp::GetEngineInputShapes( + const CacheType& cache, const std::vector& actual_input_shapes, + std::vector* engine_input_shapes) { // VerifyInputShapes() already ensured that all input shapes have same // batch size, and are not scalars. *engine_input_shapes = actual_input_shapes; @@ -392,7 +430,7 @@ Status TRTEngineOp::GetEngineInputShapes( ", cached size: ", cached_input_shapes.size(), " vs. actual size: ", actual_input_shapes.size()); } - if (match_shapes(actual_input_shapes, cached_input_shapes)) { + if (AreShapesCompatible(actual_input_shapes, cached_input_shapes)) { const int cached_batch_size = cached_input_shapes[0].dim_size(0); if (min_matched_batch_size > cached_batch_size) { min_matched_batch_size = cached_batch_size; @@ -407,10 +445,44 @@ void TRTEngineOp::ComputeAsync(OpKernelContext* ctx, AsyncOpKernel::DoneCallback done) { auto helper = new AsyncHelper(done); core::ScopedUnref sc(helper); - if (calibration_mode_) { - ExecuteCalibration(ctx, helper); + + // Get TRT resource. + TRTEngineCacheResource* cache_res = nullptr; + OP_REQUIRES_OK_ASYNC(ctx, GetEngineCacheResource(ctx, &cache_res), *helper); + core::ScopedUnref unref_cache_res(cache_res); + + // Run calibration if in int8+calibration mode. + // * Logic in TF 1.x: + // - During conversion: calibration_mode_ is true and cache size is 0, so it + // will run calibration. + // - During inference: calibration_data will be set, so calibration_mode_ is + // false and it won't trigger calibration. + // * Logic in TF 2.0: + // - During conversion: similar to 1.x. + // - During inference: calibration_data will still be empty, but cache will + // contain the the calibrated engine, so it won't trigger calibration. + // + // TODO(laigd): consider the following alternatives: + // 1. Serialize the state (calibration or inference) using + // TRTEngineInstance proto (or a new proto), so we know which mode we're + // in and don't run calibration during inference (which is invalid). + // 2. Reuse the calibration_data attribute or use a new attribute in the + // NodeDef to indicate whether it's in calibration mode. + if (calibration_mode_ && cache_res->cache_.size() == 0) { + if (!cache_res->calib_ctx_) { + // TODO(laigd): better encapsulation. 
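AreShapesCompatible above encodes the rule hinted at by the surrounding comments: an actual input shape can be served by a cached engine shape when the ranks agree, the batch (first) dimension of the input is no larger than the engine's, and every other dimension matches exactly. A standalone sketch of that rule; MatchShape is an illustrative stand-in and plain vectors replace TensorShape:

// Standalone sketch of the per-shape compatibility check used when looking
// up a cached engine: same rank, input batch <= engine batch, other dims equal.
#include <iostream>
#include <vector>

using Shape = std::vector<long long>;

bool MatchShape(const Shape& actual, const Shape& cached) {
  if (actual.size() != cached.size()) return false;  // rank must match
  if (actual[0] > cached[0]) return false;           // batch dim may only shrink
  for (size_t i = 1; i < actual.size(); ++i) {
    if (actual[i] != cached[i]) return false;        // non-batch dims must be exact
  }
  return true;
}

int main() {
  std::cout << std::boolalpha;
  std::cout << MatchShape({5, 224, 224, 3}, {8, 224, 224, 3}) << "\n";  // true
  std::cout << MatchShape({9, 224, 224, 3}, {8, 224, 224, 3}) << "\n";  // false
  std::cout << MatchShape({8, 224, 112, 3}, {8, 224, 224, 3}) << "\n";  // false
  return 0;
}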
+ mutex_lock lock(engine_mutex_); + if (!cache_res->calib_ctx_) { + OP_REQUIRES_OK_ASYNC(ctx, AllocateCalibrationResources(ctx, cache_res), + *helper); + } + } + // TODO(laigd): check that the input shapes match the shapes of the + // persistent tensor in the calibration resource. + ExecuteCalibration(ctx, cache_res, helper); return; } + // Get shapes of inputs to engine. std::vector input_shapes; input_shapes.reserve(ctx->num_inputs()); @@ -418,8 +490,9 @@ void TRTEngineOp::ComputeAsync(OpKernelContext* ctx, input_shapes.push_back(ctx->input(i).shape()); } OP_REQUIRES_OK_ASYNC(ctx, VerifyInputShapes(input_shapes), *helper); - StatusOr status = GetEngine(input_shapes, ctx); + StatusOr status = GetEngine(input_shapes, ctx, cache_res); OP_REQUIRES_OK_ASYNC(ctx, status.status(), *helper); + EngineContext* engine_context = status.ValueOrDie(); if (!engine_context->cuda_engine) { VLOG(1) << "Engine retrieval for input shapes: " @@ -446,9 +519,11 @@ bool TRTEngineOp::ExecuteTrtEngine(OpKernelContext* ctx, // input. const int num_batch = ctx->input(0).shape().dim_size(0); const int num_binding = ctx->num_inputs() + ctx->num_outputs(); + std::vector buffers(num_binding); + for (int i = 0; i < ctx->num_inputs(); i++) { - const string input_name = StrCat(kInputPHName, i); + const string input_name = StrCat(IONamePrefixes::kInputPHName, i); const int binding_index = cuda_engine->getBindingIndex(input_name.c_str()); if (binding_index == -1) { const string msg = @@ -490,7 +565,7 @@ bool TRTEngineOp::ExecuteTrtEngine(OpKernelContext* ctx, for (int i = 0; i < ctx->num_outputs(); i++) { // Create an output tensor - const string output_name = StrCat(kOutputPHName, i); + const string output_name = StrCat(IONamePrefixes::kOutputPHName, i); const int binding_index = cuda_engine->getBindingIndex(output_name.c_str()); Tensor* output_tensor = nullptr; @@ -580,7 +655,7 @@ Status TRTEngineOp::GetEngineCacheResource(OpKernelContext* ctx, // Get engine cache. return ctx->resource_manager()->LookupOrCreate( - "TF-TRT-Engine-Cache", string(resource_name), cache_res, + std::string(kTfTrtContainerName), std::string(resource_name), cache_res, {[this, ctx](TRTEngineCacheResource** cr) -> Status { *cr = new TRTEngineCacheResource(ctx, this->max_cached_engines_); return Status::OK(); @@ -588,14 +663,13 @@ Status TRTEngineOp::GetEngineCacheResource(OpKernelContext* ctx, } StatusOr TRTEngineOp::GetEngine( - const std::vector& input_shapes, OpKernelContext* ctx) { + const std::vector& input_shapes, OpKernelContext* ctx, + TRTEngineCacheResource* cache_res) { static EngineContext empty_context; - TRTEngineCacheResource* cache_res = nullptr; - TF_RETURN_IF_ERROR(GetEngineCacheResource(ctx, &cache_res)); - core::ScopedUnref sc(cache_res); mutex_lock lock(engine_mutex_); - // TODO(tmorris): using first input to get batch size - is this reliable? + // Using first input to get batch size is reliable - VerifyInputShapes() has + // verified that. const int batch_size = input_shapes[0].dim_size(0); auto& cache = cache_res->cache_; auto allocator = cache_res->allocator_.get(); @@ -607,9 +681,7 @@ StatusOr TRTEngineOp::GetEngine( // single element containing the only engine. if (static_engine_) { if (cache.size()) { - // Batch size of engine must be >= the input batch size - // TODO(tmorris): use match compatible function? 
- if (cache.begin()->first[0].dim_size(0) >= batch_size) { + if (AreShapesCompatible(input_shapes, cache.begin()->first)) { return cache.begin()->second.get(); } return &empty_context; @@ -648,9 +720,7 @@ StatusOr TRTEngineOp::GetEngine( return cache.at(engine_input_shapes).get(); } // static_engine_ - // Handle the dynamic engine case. - // See if there is a compatible engine cached. The batch size should be <= the - // cached batch size. + // Handle the dynamic engine case. See if there is a compatible engine cached. std::vector engine_input_shapes; TF_RETURN_IF_ERROR( GetEngineInputShapes(cache, input_shapes, &engine_input_shapes)); @@ -694,17 +764,19 @@ StatusOr TRTEngineOp::GetEngine( return cache.at(engine_input_shapes).get(); } +// TODO(hinsu): Move this allocation to CalibrationContext constructor, if +// possible. Status TRTEngineOp::AllocateCalibrationResources( - OpKernelContext* ctx, TRTEngineCacheResource* cache_res, - TRTCalibrationResource** cr) { - auto cres = new TRTCalibrationResource(); - *cr = cres; + OpKernelContext* ctx, TRTEngineCacheResource* cache_res) { + cache_res->calib_ctx_ = absl::make_unique(); + auto* cres = cache_res->calib_ctx_.get(); + // Get the input shapes. const int batch_size = ctx->input(0).dim_size(0); const int num_inputs = ctx->num_inputs(); std::vector shapes; cres->device_tensors_.resize(num_inputs); - VLOG(1) << " Constructing calibrator"; + VLOG(1) << "Constructing calibrator"; for (int i = 0; i < num_inputs; i++) { // allocate workspace on device for inputs const Tensor& t = ctx->input(i); @@ -719,7 +791,7 @@ Status TRTEngineOp::AllocateCalibrationResources( "Unsupported data type encountered in input ", i); } cres->device_buffers_.emplace( - StrCat(kInputPHName, i), + StrCat(IONamePrefixes::kInputPHName, i), std::pair(device_address, device_tensor->TotalBytes())); } cres->calibrator_.reset( @@ -733,55 +805,52 @@ Status TRTEngineOp::AllocateCalibrationResources( } cache_res->Ref(); - cres->thr_.reset( - new std::thread([this, cres, shapes, platform_gpu_id, cache_res]() { - core::ScopedUnref sc(cache_res); + cres->thr_.reset(new std::thread([this, cres, shapes, platform_gpu_id, + cache_res]() { + core::ScopedUnref sc(cache_res); - LOG(INFO) << "Starting calibration thread on device " << platform_gpu_id - << ", Calibration Resource @ " << cres; - auto err = cudaSetDevice(platform_gpu_id); - if (err != cudaSuccess) { - // TODO(aaroey): should return error here. - LOG(ERROR) << "Couldn't set cuda device to " << platform_gpu_id - << " in calibration thread"; - } - std::vector partial_shapes(shapes.begin(), - shapes.end()); - // ConvertGraphDefToEngine() will try to build the engine. This thread - // will loop inside buildCudaEngine() consuming the calibration data - // that is set by the TF op, and drive the builder until calibrator - // returns false. Engine is discarded after calibration table is - // generated - // - // TODO(aaroey): maybe setting the max batch size using the python - // calibration wrapper class. 
- auto s = convert::ConvertGraphDefToEngine( - this->segment_graph_, TrtPrecisionMode::INT8, - cres->calibrator_->getBatchSize(), this->workspace_size_, - partial_shapes, &cres->logger_, cache_res->allocator_.get(), - cres->calibrator_.get(), &cres->engine_, - /*use_calibration=*/true, - /*convert_successfully=*/nullptr); - if (!s.ok()) { - LOG(ERROR) << "Calibration failed: " << s; - cres->calibrator_->setDone(); // Ignore further pushes - } + LOG(INFO) << "Starting calibration thread on device " << platform_gpu_id + << ", Calibration Resource @ " << cres; + auto err = cudaSetDevice(platform_gpu_id); + if (err != cudaSuccess) { + // TODO(aaroey): should return error here. + LOG(ERROR) << "Couldn't set cuda device to " << platform_gpu_id + << " in calibration thread"; + } + std::vector partial_shapes(shapes.begin(), + shapes.end()); + // ConvertGraphDefToEngine() will try to build the engine. This thread + // will loop inside buildCudaEngine() consuming the calibration data + // that is set by the TF op, and drive the builder until calibrator + // returns false. Engine is discarded after calibration table is + // generated + // + // TODO(aaroey): maybe setting the max batch size using the python + // calibration wrapper class. + auto s = convert::ConvertGraphDefToEngine( + this->segment_graph_, TrtPrecisionMode::INT8, + cres->calibrator_->getBatchSize(), this->workspace_size_, + partial_shapes, &cache_res->GetLogger(), cache_res->allocator_.get(), + cres->calibrator_.get(), &cres->engine_, + /*use_calibration=*/true, + /*convert_successfully=*/nullptr); + if (!s.ok()) { + LOG(ERROR) << "Calibration failed: " << s; + cres->calibrator_->setDone(); // Ignore further pushes + } else { + // Transfer the ownership of the engine to the engine cache, so we can + // dump it out during conversion for TF 2.0. + mutex_lock lock(this->engine_mutex_); + this->calibrator_ = std::move(cres->calibrator_); + TrtUniquePtrType exec_context( + cres->engine_->createExecutionContext()); + cache_res->cache_.emplace( + shapes, absl::make_unique(std::move(cres->engine_), + std::move(exec_context))); + } - // Transfer the ownership of the engine to the engine cache, so we can - // dump it out during conversion for TF 2.0. - if (cache_res) { - mutex_lock lock(this->engine_mutex_); - cres->SetCalibrationTable(); - this->calibrator_ = std::move(cres->calibrator_); - TrtUniquePtrType exec_context( - cres->engine_->createExecutionContext()); - cache_res->cache_.emplace( - shapes, absl::make_unique( - std::move(cres->engine_), std::move(exec_context))); - } - - VLOG(1) << "Calibration loop terminated " << this->name(); - })); + VLOG(1) << "Calibration loop terminated " << this->name(); + })); VLOG(1) << "initialized calibrator resource"; return Status::OK(); } diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc index d859d5f957f..4228136e0c8 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc @@ -22,11 +22,17 @@ limitations under the License. 
#include #include +#include "absl/strings/str_cat.h" +#include "tensorflow/cc/ops/function_ops.h" #include "tensorflow/cc/ops/standard_ops.h" -#include "tensorflow/compiler/tf2tensorrt/utils/calibration_resource.h" +#include "tensorflow/compiler/tf2tensorrt/convert/convert_graph.h" +#include "tensorflow/compiler/tf2tensorrt/convert/utils.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h" #include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/graph_to_functiondef.h" #include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/kernels/ops_testutil.h" @@ -39,6 +45,7 @@ limitations under the License. namespace tensorflow { namespace tensorrt { +using ::absl::StrCat; using ::testing::ElementsAre; class TRTEngineOpTestBase : public OpsTestBase { @@ -50,25 +57,32 @@ class TRTEngineOpTestBase : public OpsTestBase { // Create simple TF graph. Scope s = Scope::NewRootScope(); - auto feed = ops::Placeholder(s.WithOpName("TensorRTInputPH_0"), dtype, - ops::Placeholder::Shape({-1, -1})); + auto feed = ops::_Arg(s.WithOpName("TensorRTInputPH_0"), dtype, 0); auto add = ops::Add(s.WithOpName("add"), feed, feed); - ops::Identity(s.WithOpName("TensorRTOutputPH_0"), add); + ops::_Retval(s.WithOpName("TensorRTOutputPH_0"), add, 0); // Serialize the graph. TRTEngineOp will convert it using dynamic mode. GraphDef graph_def; TF_ASSERT_OK(s.ToGraphDef(&graph_def)); + Graph* graph = s.graph(); + const char* op_name = "myop"; + TF_ASSERT_OK( + convert::RegisterGraphToFunctionLibrary(graph_def, graph, op_name)); + TF_ASSERT_OK(flib_def_->AddLibrary(graph->flib_def())); + PartialTensorShape shape({-1, -1}); // Create the op. OpsTestBase::SetDevice(DEVICE_GPU, std::move(device)); - TF_ASSERT_OK(NodeDefBuilder("myop", "TRTEngineOp") + NameAttrList function; + function.set_name(StrCat(op_name, "_native_segment")); + TF_ASSERT_OK(NodeDefBuilder(op_name, "TRTEngineOp") .Input(FakeInput(1, dtype)) .Attr("input_shapes", {shape}) .Attr("output_shapes", {shape}) .Attr("static_engine", false) - .Attr("segment_funcdef_name", "") // no native fallback - .Attr("serialized_segment", graph_def.SerializeAsString()) + .Attr("segment_func", function) + .Attr("serialized_segment", "") .Attr("calibration_data", "") .Attr("max_cached_engines_count", max_cached_engines_count) .Attr("workspace_size_bytes", 1 << 20) @@ -76,7 +90,7 @@ class TRTEngineOpTestBase : public OpsTestBase { .Attr("use_calibration", false) .Attr("OutT", {dtype}) .Finalize(OpsTestBase::node_def())); - TF_ASSERT_OK(OpsTestBase::InitOp()); + TF_ASSERT_OK(InitOpWithFunctionLibrary()); } template @@ -90,9 +104,20 @@ class TRTEngineOpTestBase : public OpsTestBase { inputs_.clear(); gtl::STLDeleteElements(&tensors_); } + + private: + Status InitOpWithFunctionLibrary() { + OpKernel* kernel = nullptr; + Status status = CreateOpKernel(device_type_, device_, allocator(), + pflr_->GetFLR(device_->name()), node_def_, + TF_GRAPH_DEF_VERSION, &kernel); + kernel_ = std::unique_ptr(kernel); + if (kernel_ != nullptr) input_types_ = kernel_->input_types(); + return status; + } }; -TEST_F(TRTEngineOpTestBase, dynamic_shapes) { +TEST_F(TRTEngineOpTestBase, DynamicShapes) { TRTEngineOpTestBase::AddSimpleTrtOp(DT_FLOAT, /*max_cached_engines_count=*/4); // Execute the op with batch size > 1. 
@@ -101,8 +126,8 @@ TEST_F(TRTEngineOpTestBase, dynamic_shapes) { // Get the engine cache. TRTEngineCacheResource* cache_resource = nullptr; - TF_ASSERT_OK(device_->resource_manager()->Lookup("TF-TRT-Engine-Cache", - "myop", &cache_resource)); + TF_ASSERT_OK( + device_->resource_manager()->Lookup("TF-TRT", "myop", &cache_resource)); core::ScopedUnref sc(cache_resource); // It should contain only one engine. diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops.cc index 8f6f08710d1..891b75be824 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops.cc @@ -25,6 +25,7 @@ limitations under the License. #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/refcount.h" #include "tensorflow/core/lib/io/record_reader.h" #include "tensorflow/core/lib/io/record_writer.h" @@ -40,11 +41,9 @@ namespace tensorflow { namespace tensorrt { using ::nvinfer1::IRuntime; -class CreateTRTEngineCacheHandle : public OpKernel { +class CreateTRTResourceHandle : public OpKernel { public: - explicit CreateTRTEngineCacheHandle(OpKernelConstruction* ctx) - : OpKernel(ctx) { - OP_REQUIRES_OK(ctx, ctx->GetAttr("container", &container_)); + explicit CreateTRTResourceHandle(OpKernelConstruction* ctx) : OpKernel(ctx) { OP_REQUIRES_OK(ctx, ctx->GetAttr("resource_name", &resource_name_)); } @@ -57,12 +56,11 @@ class CreateTRTEngineCacheHandle : public OpKernel { OP_REQUIRES_OK(ctx, ctx->allocate_temp(DT_RESOURCE, TensorShape({}), &handle_, attr)); - VLOG(1) << "Creating TRT engine cache resource handle for container " - << container_ << " and op " << resource_name_ << " on device " - << ctx->device()->name(); + VLOG(1) << "Creating TRT engine cache resource handle for op " + << resource_name_ << " on device " << ctx->device()->name(); handle_.scalar()() = - MakeResourceHandle(ctx, container_, - resource_name_); + MakeResourceHandle( + ctx, std::string(kTfTrtContainerName), resource_name_); initialized_ = true; } } @@ -70,23 +68,22 @@ class CreateTRTEngineCacheHandle : public OpKernel { } private: - string container_; string resource_name_; Tensor handle_; mutex mutex_; bool initialized_ GUARDED_BY(mutex_) = false; - TF_DISALLOW_COPY_AND_ASSIGN(CreateTRTEngineCacheHandle); + TF_DISALLOW_COPY_AND_ASSIGN(CreateTRTResourceHandle); }; -REGISTER_KERNEL_BUILDER(Name("CreateTRTEngineCacheHandle") +REGISTER_KERNEL_BUILDER(Name("CreateTRTResourceHandle") .Device(DEVICE_GPU) - .HostMemory("engine_cache_handle"), - CreateTRTEngineCacheHandle); + .HostMemory("resource_handle"), + CreateTRTResourceHandle); -class PopulateTRTEngineCache : public OpKernel { +class InitializeTRTResource : public OpKernel { public: - explicit PopulateTRTEngineCache(OpKernelConstruction* ctx) : OpKernel(ctx) { + explicit InitializeTRTResource(OpKernelConstruction* ctx) : OpKernel(ctx) { OP_REQUIRES_OK( ctx, ctx->GetAttr("max_cached_engines_count", &max_cached_engines_)); } @@ -112,7 +109,7 @@ class PopulateTRTEngineCache : public OpKernel { resource->cache_.size(), " entries.")); // Get the file name. 
- const string& filename = ctx->input(1).scalar()(); + const string& filename = ctx->input(1).scalar()(); OP_REQUIRES(ctx, !filename.empty(), errors::InvalidArgument("filename cannot be empty.")); @@ -124,7 +121,7 @@ class PopulateTRTEngineCache : public OpKernel { uint64 offset = 0; int num_loaded_engine = 0; do { - string record; + tstring record; Status status = reader->ReadRecord(&offset, &record); if (errors::IsOutOfRange(status)) break; @@ -150,48 +147,51 @@ class PopulateTRTEngineCache : public OpKernel { raw_engine->createExecutionContext()))); ++num_loaded_engine; } while (1); - VLOG(1) << "Loaded " << num_loaded_engine << " TRT engines to container " - << handle.container() << " for op " << handle.name() - << " on device " << ctx->device()->name() << " from file " - << filename; + VLOG(1) << "Loaded " << num_loaded_engine << " TRT engines for op " + << handle.name() << " on device " << ctx->device()->name() + << " from file " << filename; } private: // Maximum number of cached engines int max_cached_engines_; - TF_DISALLOW_COPY_AND_ASSIGN(PopulateTRTEngineCache); + TF_DISALLOW_COPY_AND_ASSIGN(InitializeTRTResource); }; -REGISTER_KERNEL_BUILDER(Name("PopulateTRTEngineCache") +REGISTER_KERNEL_BUILDER(Name("InitializeTRTResource") .Device(DEVICE_GPU) - .HostMemory("engine_cache_handle"), - PopulateTRTEngineCache); + .HostMemory("resource_handle"), + InitializeTRTResource); -class DumpTRTEngineCache : public OpKernel { +class SerializeTRTResource : public OpKernel { public: - explicit DumpTRTEngineCache(OpKernelConstruction* ctx) : OpKernel(ctx) { - OP_REQUIRES_OK(ctx, ctx->GetAttr("delete_cache_after_dump", - &delete_cache_after_dump_)); + explicit SerializeTRTResource(OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("delete_resource", &delete_resource_)); } void Compute(OpKernelContext* ctx) override { - const string& container = ctx->input(0).scalar()(); - const string& resource_name = ctx->input(1).scalar()(); - const string& filename = ctx->input(2).scalar()(); + const string& resource_name = ctx->input(0).scalar()(); + const string& filename = ctx->input(1).scalar()(); OP_REQUIRES(ctx, !filename.empty(), errors::InvalidArgument("filename cannot be empty.")); + // Lookup engine cache resource. TRTEngineCacheResource* resource = nullptr; - OP_REQUIRES_OK(ctx, ctx->resource_manager()->Lookup( - container, resource_name, &resource)); + OP_REQUIRES_OK( + ctx, ctx->resource_manager()->Lookup(std::string(kTfTrtContainerName), + resource_name, &resource)); core::ScopedUnref unref_me(resource); + // Terminate the calibration if any. + if (resource->calib_ctx_) resource->calib_ctx_->TerminateCalibration(); + // Serialize the engines and write them to file. std::unique_ptr file; OP_REQUIRES_OK(ctx, ctx->env()->NewWritableFile(filename, &file)); auto writer = absl::make_unique(file.get()); + int num_serialized_engines = 0; for (const auto& pair : resource->cache_) { // Ignore engines that failed to build. 
const std::unique_ptr& engine = pair.second; @@ -211,30 +211,29 @@ class DumpTRTEngineCache : public OpKernel { OP_REQUIRES_OK(ctx, writer->WriteRecord(engine_instance.SerializeAsString())); + ++num_serialized_engines; } - VLOG(1) << "Serialized " << resource->cache_.size() - << " TRT engines in container " << container << " for op " + VLOG(1) << "Serialized " << num_serialized_engines << " TRT engines for op " << resource_name << " on device " << ctx->device()->name() << " to file " << filename; - if (delete_cache_after_dump_) { - VLOG(1) << "Destroying TRT engine cache resource in container " - << container << " for op " << resource_name << " on device " - << ctx->device()->name(); + if (delete_resource_) { + VLOG(1) << "Destroying TRT engine cache resource for op " << resource_name + << " on device " << ctx->device()->name(); OP_REQUIRES_OK(ctx, ctx->resource_manager()->Delete( - container, resource_name)); + std::string(kTfTrtContainerName), resource_name)); } } private: - bool delete_cache_after_dump_ = false; + bool delete_resource_ = false; - TF_DISALLOW_COPY_AND_ASSIGN(DumpTRTEngineCache); + TF_DISALLOW_COPY_AND_ASSIGN(SerializeTRTResource); }; -REGISTER_KERNEL_BUILDER(Name("DumpTRTEngineCache").Device(DEVICE_GPU), - DumpTRTEngineCache); +REGISTER_KERNEL_BUILDER(Name("SerializeTRTResource").Device(DEVICE_GPU), + SerializeTRTResource); } // namespace tensorrt } // namespace tensorflow diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops_test.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops_test.cc index b3e541aab40..d27a67582d8 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops_test.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops_test.cc @@ -92,11 +92,10 @@ TEST_F(TRTEngineResourceOpsTest, Basic) { SetDevice(DEVICE_GPU, std::move(device)); // Create the resource handle. - const string container = "mycontainer"; + const string container(kTfTrtContainerName); const string resource_name = "myresource"; Reset(); - TF_ASSERT_OK(NodeDefBuilder("op", "CreateTRTEngineCacheHandle") - .Attr("container", container) + TF_ASSERT_OK(NodeDefBuilder("op", "CreateTRTResourceHandle") .Attr("resource_name", resource_name) .Finalize(node_def())); TF_ASSERT_OK(InitOp()); @@ -108,7 +107,7 @@ TEST_F(TRTEngineResourceOpsTest, Basic) { EXPECT_TRUE( errors::IsNotFound(rm->Lookup(container, resource_name, &resource))); - // Create the resouce using an empty file with PopulateTRTEngineCache. + // Create the resouce using an empty file with InitializeTRTResource. Reset(); Env* env = Env::Default(); const string filename = io::JoinPath(testing::TmpDir(), "trt_engine_file"); @@ -116,7 +115,7 @@ TEST_F(TRTEngineResourceOpsTest, Basic) { std::unique_ptr file; TF_ASSERT_OK(env->NewWritableFile(filename, &file)); } - TF_ASSERT_OK(NodeDefBuilder("op", "PopulateTRTEngineCache") + TF_ASSERT_OK(NodeDefBuilder("op", "InitializeTRTResource") .Input(FakeInput(DT_RESOURCE)) .Input(FakeInput(DT_STRING)) .Attr("max_cached_engines_count", 1) @@ -137,18 +136,16 @@ TEST_F(TRTEngineResourceOpsTest, Basic) { absl::make_unique(std::move(engine), std::move(context))); resource->Unref(); - // Serialize the engine using DumpTRTEngineCache op. + // Serialize the engine using SerializeTRTResource op. 
Reset(); - TF_ASSERT_OK(NodeDefBuilder("op", "DumpTRTEngineCache") - .Attr("delete_cache_after_dump", true) - .Input(FakeInput(DT_STRING)) + TF_ASSERT_OK(NodeDefBuilder("op", "SerializeTRTResource") + .Attr("delete_resource", true) .Input(FakeInput(DT_STRING)) .Input(FakeInput(DT_STRING)) .Finalize(node_def())); TF_ASSERT_OK(InitOp()); - AddInputFromArray(TensorShape({}), {container}); - AddInputFromArray(TensorShape({}), {resource_name}); - AddInputFromArray(TensorShape({}), {filename}); + AddInputFromArray(TensorShape({}), {resource_name}); + AddInputFromArray(TensorShape({}), {filename}); TF_ASSERT_OK(RunOpKernel()); // Make sure the cache is deleted. @@ -178,14 +175,14 @@ TEST_F(TRTEngineResourceOpsTest, Basic) { // Recreate the cache resource. Reset(); - TF_ASSERT_OK(NodeDefBuilder("op", "PopulateTRTEngineCache") + TF_ASSERT_OK(NodeDefBuilder("op", "InitializeTRTResource") .Input(FakeInput(DT_RESOURCE)) .Input(FakeInput(DT_STRING)) .Attr("max_cached_engines_count", 1) .Finalize(node_def())); TF_ASSERT_OK(InitOp()); AddInputFromArray(TensorShape({}), {handle}); - AddInputFromArray(TensorShape({}), {filename}); + AddInputFromArray(TensorShape({}), {filename}); TF_ASSERT_OK(RunOpKernel()); EXPECT_TRUE(rm->Lookup(container, resource_name, &resource).ok()); EXPECT_EQ(1, resource->cache_.size()); diff --git a/tensorflow/compiler/tf2tensorrt/ops/trt_engine_op.cc b/tensorflow/compiler/tf2tensorrt/ops/trt_engine_op.cc index b8f9058d8f6..7d8ff6dbe43 100644 --- a/tensorflow/compiler/tf2tensorrt/ops/trt_engine_op.cc +++ b/tensorflow/compiler/tf2tensorrt/ops/trt_engine_op.cc @@ -33,7 +33,7 @@ namespace tensorflow { // key to cache the instantiated functions for different executor subgraphs. REGISTER_OP("TRTEngineOp") .Attr("serialized_segment: string") - .Attr("segment_funcdef_name: string") + .Attr("segment_func: func = {}") .Attr("InT: list({int8,float16,float32,int32})") .Attr("OutT: list({int8,float16,float32,int32})") .Attr("max_cached_engines_count: int = 1") @@ -51,10 +51,11 @@ REGISTER_OP("TRTEngineOp") // inference function as a workaround. .SetShapeFn(shape_inference::UnknownShape) // Deprecated attributes. + .Attr("segment_funcdef_name: string = ''") .Attr("cached_engine_batches: list(int) >= 0 = []") .Attr("fixed_input_size: bool = true") - .Attr("input_shapes: list(shape)") - .Attr("output_shapes: list(shape)") + .Attr("input_shapes: list(shape) = []") + .Attr("output_shapes: list(shape) = []") .Attr("static_engine: bool = true"); } // namespace tensorflow diff --git a/tensorflow/compiler/tf2tensorrt/ops/trt_engine_resource_ops.cc b/tensorflow/compiler/tf2tensorrt/ops/trt_engine_resource_ops.cc index 67177efe228..01911de66ec 100644 --- a/tensorflow/compiler/tf2tensorrt/ops/trt_engine_resource_ops.cc +++ b/tensorflow/compiler/tf2tensorrt/ops/trt_engine_resource_ops.cc @@ -24,23 +24,21 @@ limitations under the License. 
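// The resource ops registered below replace the old engine-cache ops, and the
// container argument is gone: handles always live in kTfTrtContainerName
// ("TF-TRT"). A condensed sketch of how they are driven, mirroring the test
// above (NodeDefBuilder, FakeInput and node_def() come from the OpsTestBase-
// style fixture assumed there; the Reset(), InitOp(), AddInputFromArray and
// RunOpKernel() calls between the steps are elided):
//
//   // 1) Create a scalar handle for the engine cache resource.
//   TF_ASSERT_OK(NodeDefBuilder("op", "CreateTRTResourceHandle")
//                    .Attr("resource_name", "myresource")
//                    .Finalize(node_def()));
//
//   // 2) Populate the cache from a file of serialized engines.
//   TF_ASSERT_OK(NodeDefBuilder("op", "InitializeTRTResource")
//                    .Input(FakeInput(DT_RESOURCE))  // resource_handle
//                    .Input(FakeInput(DT_STRING))    // filename
//                    .Attr("max_cached_engines_count", 1)
//                    .Finalize(node_def()));
//
//   // 3) Serialize the cache back to a file, optionally deleting it.
//   TF_ASSERT_OK(NodeDefBuilder("op", "SerializeTRTResource")
//                    .Attr("delete_resource", true)
//                    .Input(FakeInput(DT_STRING))    // resource_name
//                    .Input(FakeInput(DT_STRING))    // filename
//                    .Finalize(node_def()));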
namespace tensorflow { -REGISTER_OP("CreateTRTEngineCacheHandle") - .Attr("container: string") +REGISTER_OP("CreateTRTResourceHandle") .Attr("resource_name: string") - .Output("engine_cache_handle: resource") + .Output("resource_handle: resource") .SetIsStateful() .SetShapeFn(shape_inference::ScalarShape); -REGISTER_OP("PopulateTRTEngineCache") +REGISTER_OP("InitializeTRTResource") .Attr("max_cached_engines_count: int = 1") - .Input("engine_cache_handle: resource") + .Input("resource_handle: resource") .Input("filename: string") .SetIsStateful() .SetShapeFn(shape_inference::NoOutputs); -REGISTER_OP("DumpTRTEngineCache") - .Attr("delete_cache_after_dump: bool = false") - .Input("container: string") +REGISTER_OP("SerializeTRTResource") + .Attr("delete_resource: bool = false") .Input("resource_name: string") .Input("filename: string") .SetIsStateful() diff --git a/tensorflow/compiler/tf2tensorrt/utils/calibration_resource.cc b/tensorflow/compiler/tf2tensorrt/utils/calibration_resource.cc deleted file mode 100644 index 5d6e11b536e..00000000000 --- a/tensorflow/compiler/tf2tensorrt/utils/calibration_resource.cc +++ /dev/null @@ -1,61 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/tf2tensorrt/utils/calibration_resource.h" - -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT - -namespace tensorflow { -namespace tensorrt { - -const absl::string_view kCalibrationContainerName = "TF-TRT-Calibration"; - -TRTCalibrationResource::~TRTCalibrationResource() { - VLOG(0) << "Destroying Calibration Resource " << std::endl << DebugString(); -} - -string TRTCalibrationResource::DebugString() const { - std::stringstream oss; - using std::dec; - using std::endl; - using std::hex; - oss << " Calibrator = " << hex << calibrator_.get() << dec << endl - << " Builder = " << hex << builder_.get() << dec << endl - << " Engine = " << hex << engine_.get() << dec << endl - << " Logger = " << hex << &logger_ << dec << endl - << " Thread = " << hex << thr_.get() << dec << endl; - return oss.str(); -} - -void TRTCalibrationResource::SetCalibrationTable() { - calibration_table_ = calibrator_->getCalibrationTableAsString(); -} - -Status TRTCalibrationResource::SerializeToString(string* serialized) { - calibrator_->waitAndSetDone(); - thr_->join(); - *serialized = calibration_table_; - if (serialized->empty()) { - return errors::Unknown("Calibration table is empty."); - } - return Status::OK(); -} - -} // namespace tensorrt -} // namespace tensorflow - -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA diff --git a/tensorflow/compiler/tf2tensorrt/utils/calibration_resource.h b/tensorflow/compiler/tf2tensorrt/utils/calibration_resource.h deleted file mode 100644 index e7c29e9f1ed..00000000000 --- a/tensorflow/compiler/tf2tensorrt/utils/calibration_resource.h +++ /dev/null @@ -1,72 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_RESOURCES_H_ -#define TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_RESOURCES_H_ - -#include -#include -#include -#include -#include -#include - -#include "tensorflow/compiler/tf2tensorrt/convert/utils.h" -#include "tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h" -#include "tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h" -#include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/resource_mgr.h" - -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT -#include "third_party/tensorrt/NvInfer.h" - -namespace tensorflow { -namespace tensorrt { - -ABSL_CONST_INIT extern const absl::string_view kCalibrationContainerName; - -class TRTCalibrationResource : public ResourceBase { - public: - ~TRTCalibrationResource() override; - - string DebugString() const override; - - void SetCalibrationTable(); - - Status SerializeToString(string* serialized); - - // Lookup table for temporary staging areas of input tensors for calibration. - std::unordered_map> device_buffers_; - - // Temporary staging areas for calibration inputs. - std::vector device_tensors_; - - string calibration_table_; - std::unique_ptr calibrator_; - TrtUniquePtrType builder_; - TrtUniquePtrType engine_; - Logger logger_; - // TODO(sami): Use threadpool threads! - std::unique_ptr thr_; -}; - -} // namespace tensorrt -} // namespace tensorflow - -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA -#endif // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_RESOURCES_H_ diff --git a/tensorflow/compiler/tf2tensorrt/utils/py_utils.cc b/tensorflow/compiler/tf2tensorrt/utils/py_utils.cc index 008cabb9cb4..885f58cd70c 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/py_utils.cc +++ b/tensorflow/compiler/tf2tensorrt/utils/py_utils.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/compiler/tf2tensorrt/utils/py_utils.h" #if GOOGLE_CUDA && GOOGLE_TENSORRT +#include "tensorflow/stream_executor/platform/dso_loader.h" #include "third_party/tensorrt/NvInfer.h" #endif @@ -23,13 +24,16 @@ namespace tensorflow { namespace tensorrt { bool IsGoogleTensorRTEnabled() { - // TODO(laigd): consider also checking if tensorrt shared libraries are - // accessible. We can then direct users to this function to make sure they can - // safely write code that uses tensorrt conditionally. E.g. if it does not - // check for for tensorrt, and user mistakenly uses tensorrt, they will just - // crash and burn. #if GOOGLE_CUDA && GOOGLE_TENSORRT - return true; + auto handle_or = se::internal::DsoLoader::TryDlopenTensorRTLibraries(); + if (!handle_or.ok()) { + LOG(WARNING) << "Cannot dlopen some TensorRT libraries. 
If you would like " + "to use Nvidia GPU with TensorRT, please make sure the " + "missing libraries mentioned above are installed properly."; + return false; + } else { + return true; + } #else return false; #endif diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.cc index 43dcd52b5a2..5ab6bf1a317 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.cc +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.cc @@ -30,6 +30,28 @@ limitations under the License. namespace tensorflow { namespace tensorrt { +string CalibrationContext::TerminateCalibration() { + mutex_lock l(mu_); + if (terminated_) return calibration_table_; + + TRTInt8Calibrator* raw_calibrator = calibrator_.get(); + raw_calibrator->waitAndSetDone(); + terminated_ = true; + + // At this point the calibration thread `thr_` is woken up and can + // transfer the ownership of `calibrator_` and `engine_` at any time, so + // it's not safe to use `calibrator_` below, but we can still access it + // using raw pointer. + // TODO(laigd): make TRTEngineOp::AllocateCalibrationResources() a member + // function of this class instead. + + thr_->join(); + calibration_table_ = raw_calibrator->getCalibrationTableAsString(); + return calibration_table_; +} + +const absl::string_view kTfTrtContainerName = "TF-TRT"; + Logger& TRTEngineCacheResource::GetLogger() { static Logger* logger = new Logger(); return *logger; diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h index 442e0bcfb53..8d603ac4d55 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h @@ -17,10 +17,12 @@ limitations under the License. #define TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_LRU_CACHE_H_ #include +#include #include #include "tensorflow/compiler/tf2tensorrt/convert/utils.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h" #include "tensorflow/core/framework/resource_mgr.h" #include "tensorflow/core/lib/core/errors.h" @@ -137,6 +139,31 @@ struct EngineContext { GUARDED_BY(mu); }; +// Contains the context required to build the calibration data. +class CalibrationContext { + public: + string TerminateCalibration(); + + // Lookup table for temporary staging areas of input tensors for calibration. + std::unordered_map> device_buffers_; + + // Temporary staging areas for calibration inputs. + std::vector device_tensors_; + + std::unique_ptr calibrator_; + TrtUniquePtrType builder_; + TrtUniquePtrType engine_; + // TODO(sami): Use threadpool threads! + std::unique_ptr thr_; + + private: + mutex mu_; + bool terminated_ GUARDED_BY(mu_) = false; + std::string calibration_table_ GUARDED_BY(mu_); +}; + +ABSL_CONST_INIT extern const absl::string_view kTfTrtContainerName; + class TRTEngineCacheResource : public ResourceBase { public: // According to the TensorRT API, the logger is considered a singleton by the @@ -159,6 +186,10 @@ class TRTEngineCacheResource : public ResourceBase { LRUCache, std::unique_ptr, VectorTensorShapeHasher> cache_; + + // TODO(hinsu): Use different calibration context for the available shapes and + // attach it to each item of the cache. 
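// The calib_ctx_ member declared just below owns any in-flight calibration.
// A minimal usage sketch, assuming `resource` is a TRTEngineCacheResource*
// already looked up from the resource manager, as SerializeTRTResource does
// above:
//
//   if (resource->calib_ctx_) {
//     // Wakes and joins the calibration thread; returns the cached table if
//     // calibration was already terminated, so repeated calls are safe.
//     string calibration_table =
//         resource->calib_ctx_->TerminateCalibration();
//   }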
+ std::unique_ptr calib_ctx_; }; #endif // GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD index 6a28a5acb14..f6bf672d6a0 100644 --- a/tensorflow/compiler/tf2xla/BUILD +++ b/tensorflow/compiler/tf2xla/BUILD @@ -1,6 +1,6 @@ load("//tensorflow:tensorflow.bzl", "tf_cc_binary", "tf_cc_test", "tf_cuda_cc_test") load( - "//tensorflow/core:platform/default/cuda_build_defs.bzl", + "//tensorflow/core/platform:default/cuda_build_defs.bzl", "if_cuda_is_configured", ) load("//tensorflow/compiler/xla:xla.bzl", "xla_proto_library", "xla_py_proto_library") @@ -29,6 +29,7 @@ package_group( packages = [ "//learning/brain/tools/tf_replay/...", "//tensorflow/...", + "//tensorflow_models/...", ], ) @@ -202,13 +203,15 @@ cc_library( visibility = [":friends"], deps = [ ":common", + ":frontend_attributes_util", ":host_compute_metadata_proto", + ":rearrange_function_argument", ":sharding_util", ":side_effect_util", ":tf2xla_util", "//tensorflow/compiler/jit:flags", + "//tensorflow/compiler/jit:shape_inference", "//tensorflow/compiler/jit:xla_cluster_util", - "//tensorflow/compiler/tf2xla:rearrange_function_argument", "//tensorflow/compiler/tf2xla/lib:util", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", @@ -269,6 +272,21 @@ cc_library( ], ) +cc_library( + name = "frontend_attributes_util", + srcs = ["frontend_attributes_util.cc"], + hdrs = ["frontend_attributes_util.h"], + visibility = ["//visibility:public"], + deps = [ + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "@com_google_absl//absl/types:optional", + ], +) + cc_library( name = "sharding_util", srcs = ["sharding_util.cc"], @@ -577,6 +595,7 @@ cc_library( "functionalize_while.h", ], deps = [ + ":frontend_attributes_util", ":functionalize_cond", ":functionalize_control_flow_util", ":tf2xla_util", diff --git a/tensorflow/compiler/tf2xla/const_analysis.cc b/tensorflow/compiler/tf2xla/const_analysis.cc index ad2cc7b32f0..48513a43fb3 100644 --- a/tensorflow/compiler/tf2xla/const_analysis.cc +++ b/tensorflow/compiler/tf2xla/const_analysis.cc @@ -91,7 +91,7 @@ Status GetCompileTimeConstInputs(const NodeDef& node, const OpKernel* op_kernel, FunctionLibraryRuntime* flib_runtime) { DCHECK(op_def != nullptr || op_kernel != nullptr); // TODO(b/124403063): Implement similar functionality for function call nodes. - if (node.op() == "While") { + if (node.op() == "While" || node.op() == "StatelessWhile") { // For While nodes, recurse into the body and cond graphs. const FunctionBody* fcond = nullptr; const FunctionBody* fbody = nullptr; diff --git a/tensorflow/compiler/tf2xla/frontend_attributes_util.cc b/tensorflow/compiler/tf2xla/frontend_attributes_util.cc new file mode 100644 index 00000000000..e0c70b81771 --- /dev/null +++ b/tensorflow/compiler/tf2xla/frontend_attributes_util.cc @@ -0,0 +1,41 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/compiler/tf2xla/frontend_attributes_util.h" + +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/lib/core/errors.h" + +namespace tensorflow { + +const char kXlaFrontendAttributesAttrName[] = "_XlaFrontendAttributes"; + +xla::StatusOr> +GetFrontendAttributesFromAttrSlice(const AttrSlice& attrs) { + const AttrValue* attr = attrs.Find(kXlaFrontendAttributesAttrName); + if (attr == nullptr) { + return xla::StatusOr>( + absl::nullopt); + } + xla::FrontendAttributes attributes; + if (!attributes.ParseFromString(attr->s())) { + return errors::InvalidArgument( + "Experimental _XlaFrontendAttributes attribute was not a valid encoded " + "xla::FrontendAttributes proto."); + } + return absl::optional(attributes); +} + +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/frontend_attributes_util.h b/tensorflow/compiler/tf2xla/frontend_attributes_util.h new file mode 100644 index 00000000000..421f21e71d1 --- /dev/null +++ b/tensorflow/compiler/tf2xla/frontend_attributes_util.h @@ -0,0 +1,38 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_TF2XLA_FRONTEND_ATTRIBUTES_UTIL_H_ +#define TENSORFLOW_COMPILER_TF2XLA_FRONTEND_ATTRIBUTES_UTIL_H_ + +#include + +#include "absl/types/optional.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/framework/node_def_util.h" + +namespace tensorflow { + +// Frontend Attributes Id. +extern const char kXlaFrontendAttributesAttrName[]; +// Return the FrontendAttributes stored in the AttrSlice if there are some. +// +// Return an InvalidArgument error if some attributes are present but +// cannot be parsed. +xla::StatusOr> +GetFrontendAttributesFromAttrSlice(const AttrSlice& attrs); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_FRONTEND_ATTRIBUTES_UTIL_H_ diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow_util.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow_util.cc index 54cebc61778..793a56e865d 100644 --- a/tensorflow/compiler/tf2xla/functionalize_control_flow_util.cc +++ b/tensorflow/compiler/tf2xla/functionalize_control_flow_util.cc @@ -48,6 +48,43 @@ xla::StatusOr BuildRetvalNode(Graph* graph, DataType type, int index) { return AddNodeDefToGraph(ret_def, graph); } +Status ExtractWhileLoopFrames( + const std::vector& cf_info, const Graph* graph, + std::unordered_map* frames) { + for (Node* node : graph->op_nodes()) { + const ControlFlowInfo& cf = cf_info[node->id()]; + + VLOG(2) << "node: " << node->name() << " (" << node->id() + << ") frame_name: " << cf.frame_name + << " frame: " << (cf.frame ? 
cf.frame->name() : "---") + << " parent_frame: " + << (cf.parent_frame ? cf.parent_frame->name() : "---"); + TF_RET_CHECK(cf.frame != nullptr && cf.parent_frame != nullptr); + + WhileLoopFrame& frame = (*frames)[cf.frame_name]; + WhileLoopFrame* parent = + &(*frames)[cf_info[cf.parent_frame->id()].frame_name]; + if (frame.parent == nullptr) { + frame.parent = parent; + frame.name = cf.frame_name; + ++parent->num_children; + } + + if (IsEnter(node)) { + WhileLoopArg arg; + arg.enter = node; + TF_RETURN_IF_ERROR(GetNodeAttr(arg.enter->attrs(), "is_constant", + &arg.is_loop_invariant)); + frame.args.push_back(arg); + } else if (IsLoopCond(node)) { + frame.loop_cond = node; + } + frame.nodes.insert(node); + } + + return Status::OK(); +} + // Check that the graph has no cycle containing the given node. Status CheckNodeNotInCycle(const Node* node, const int num_nodes) { std::vector ready; diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow_util.h b/tensorflow/compiler/tf2xla/functionalize_control_flow_util.h index 582b49d5116..f986376c8e3 100644 --- a/tensorflow/compiler/tf2xla/functionalize_control_flow_util.h +++ b/tensorflow/compiler/tf2xla/functionalize_control_flow_util.h @@ -18,12 +18,56 @@ limitations under the License. #include "absl/strings/str_join.h" #include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/core/graph/control_flow.h" #include "tensorflow/core/graph/graph.h" -// Utility functions shared between functionalize cond and while. +// Utility functions shared between functionalize cond and while +// or used by other graph optimization passes. namespace tensorflow { +// Information about a loop argument. +struct WhileLoopArg { + // Every loop argument has an Enter node. + Node* enter; + + // Is the loop argument a loop-invariant value? Taken from the `is_constant` + // attribute on the Enter node. + bool is_loop_invariant; + + // If 'is_loop_invariant' is true, the following are all nullptr. Non-constant + // arguments must have all of the following nodes: + Node* merge = nullptr; + Node* switch_node = nullptr; + Node* next_iteration = nullptr; + Node* exit = nullptr; +}; + +// Information about a loop frame. +struct WhileLoopFrame { + string name; + + // Pointer to the parent frame. The root frame has a pointer to itself. + WhileLoopFrame* parent = nullptr; + int num_children = 0; + + // Arguments to this loop. + std::vector args; + + // The loop condition of the loop. There should be exactly one loop condition + // in every loop. + Node* loop_cond = nullptr; + + // Set of nodes that belong to the loop frame. + std::unordered_set nodes; +}; + +// Extracts v1 while loops within a graph and creates a map of +// . +Status ExtractWhileLoopFrames( + const std::vector& cf_info, const Graph* graph, + std::unordered_map* frames); + // Check that the graph has no cycle containing the given node. Status CheckNodeNotInCycle(const Node* node, const int num_nodes); diff --git a/tensorflow/compiler/tf2xla/functionalize_while.cc b/tensorflow/compiler/tf2xla/functionalize_while.cc index e4a21f90598..74790f9ee4d 100644 --- a/tensorflow/compiler/tf2xla/functionalize_while.cc +++ b/tensorflow/compiler/tf2xla/functionalize_while.cc @@ -24,6 +24,7 @@ limitations under the License. 
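// A sketch of the intended call pattern for ExtractWhileLoopFrames inside a
// Status-returning pass (it mirrors FunctionalizeWhileLoop further below;
// BuildControlFlowInfo comes from tensorflow/core/graph/control_flow.h, which
// functionalize_control_flow_util.h now includes):
//
//   std::vector<ControlFlowInfo> cf_info;
//   TF_RETURN_IF_ERROR(BuildControlFlowInfo(graph, &cf_info));
//   // Group nodes by frame, recording Enter args and the LoopCond per frame.
//   std::unordered_map<string, WhileLoopFrame> frames;
//   TF_RETURN_IF_ERROR(ExtractWhileLoopFrames(cf_info, graph, &frames));
//   // Innermost frames (num_children == 0) are then functionalized first.
//   std::deque<WhileLoopFrame*> worklist;
//   for (auto& frame : frames) {
//     if (frame.second.num_children == 0) worklist.push_back(&frame.second);
//   }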
#include "absl/memory/memory.h" #include "absl/types/optional.h" #include "tensorflow/compiler/jit/union_find.h" +#include "tensorflow/compiler/tf2xla/frontend_attributes_util.h" #include "tensorflow/compiler/tf2xla/functionalize_cond.h" #include "tensorflow/compiler/tf2xla/functionalize_control_flow_util.h" #include "tensorflow/compiler/tf2xla/tf2xla_util.h" @@ -42,42 +43,6 @@ namespace { using xla::StatusOr; -// Information about a loop argument. -struct Arg { - // Every loop argument has an Enter node. - Node* enter; - - // Is the loop argument a loop-invariant value? Taken from the `is_constant` - // attribute on the Enter node. - bool is_loop_invariant; - - // If 'is_loop_invariant' is true, the following are all nullptr. Non-constant - // arguments must have all of the following nodes: - Node* merge = nullptr; - Node* switch_node = nullptr; - Node* next_iteration = nullptr; - Node* exit = nullptr; -}; - -// Information about a loop frame. -struct Frame { - string name; - - // Pointer to the parent frame. The root frame has a pointer to itself. - Frame* parent = nullptr; - int num_children = 0; - - // Arguments to this loop. - std::vector args; - - // The loop condition of the loop. There should be exactly one loop condition - // in every loop. - Node* loop_cond = nullptr; - - // Set of nodes that belong to the loop frame. - std::unordered_set nodes; -}; - // Copies a subgraph from `graph` to `output` by performing a reverse DFS // starting at nodes in vector `stack`. // `node_map` is a vector indexed by source node ID to dest nodes. @@ -93,7 +58,7 @@ struct Frame { // taking from the Switch node was not necessarily the first output, but _Arg // nodes only have one output. By adding the Switch node to `squash_src_outputs` // we rewrite the src_output of the corresponding edge to be 0. -Status CopySubgraph(const Graph& graph, const Frame* frame, +Status CopySubgraph(const Graph& graph, const WhileLoopFrame* frame, std::vector stack, const std::vector& squash_src_outputs, std::vector* node_map, Graph* output) { @@ -154,7 +119,7 @@ StatusOr BuildArgNode(Graph* graph, DataType type, int index) { } // Builds a graph for the loop condition. -Status BuildLoopCondition(const Graph& graph, Frame* frame, +Status BuildLoopCondition(const Graph& graph, WhileLoopFrame* frame, std::unique_ptr* cond_output) { VLOG(2) << "Building loop condition for " << frame->name; *cond_output = absl::make_unique(graph.op_registry()); @@ -166,7 +131,7 @@ Status BuildLoopCondition(const Graph& graph, Frame* frame, // Build one _Arg node for each Enter node. for (int i = 0; i < frame->args.size(); ++i) { - const Arg& arg = frame->args[i]; + const WhileLoopArg& arg = frame->args[i]; TF_ASSIGN_OR_RETURN(Node * arg_node, BuildArgNode(output, arg.enter->input_type(0), i)); @@ -190,7 +155,7 @@ Status BuildLoopCondition(const Graph& graph, Frame* frame, } // Builds a graph for the loop body. 
-Status BuildLoopBody(const Graph& graph, Frame* frame, +Status BuildLoopBody(const Graph& graph, WhileLoopFrame* frame, DataTypeVector* arg_types, std::unique_ptr* body_output) { VLOG(2) << "Building loop body for " << frame->name; @@ -206,7 +171,7 @@ Status BuildLoopBody(const Graph& graph, Frame* frame, next_iterations.reserve(frame->args.size()); arg_types->reserve(frame->args.size()); for (int i = 0; i < frame->args.size(); ++i) { - const Arg& arg = frame->args[i]; + const WhileLoopArg& arg = frame->args[i]; DataType dtype = arg.enter->input_type(0); arg_types->push_back(dtype); @@ -297,7 +262,7 @@ Status AddMissingFunctionDef(const FunctionDef& fdef, } Status FunctionalizeLoop(const FunctionLibraryDefinition* lookup_library, - Graph* graph, Frame* frame, + Graph* graph, WhileLoopFrame* frame, FunctionLibraryDefinition* library) { VLOG(2) << "Frame " << frame->name << " before: " << DumpGraphToFile("functionalize_before", *graph, library); @@ -307,8 +272,8 @@ Status FunctionalizeLoop(const FunctionLibraryDefinition* lookup_library, // shared Enter node. We clone Enter nodes with multiple successors to // maintain the invariant of a unique Enter node per argument of the final // loop. - std::vector args; - for (const Arg& arg : frame->args) { + std::vector args; + for (const WhileLoopArg& arg : frame->args) { if (arg.is_loop_invariant) { args.push_back(arg); } else { @@ -319,7 +284,7 @@ Status FunctionalizeLoop(const FunctionLibraryDefinition* lookup_library, continue; } TF_RET_CHECK(!edges[i]->IsControlEdge()) << edges[i]->src()->name(); - Arg new_arg; + WhileLoopArg new_arg; new_arg.is_loop_invariant = false; if (i == 0) { new_arg.enter = arg.enter; @@ -342,7 +307,7 @@ Status FunctionalizeLoop(const FunctionLibraryDefinition* lookup_library, frame->args = std::move(args); std::sort(frame->args.begin(), frame->args.end(), - [](const Arg& a, const Arg& b) { + [](const WhileLoopArg& a, const WhileLoopArg& b) { return NodeCmpByNameResourcesLast()(a.enter, b.enter); }); @@ -368,7 +333,7 @@ Status FunctionalizeLoop(const FunctionLibraryDefinition* lookup_library, // ^ ^ // | | // ... ... - for (Arg& arg : frame->args) { + for (WhileLoopArg& arg : frame->args) { if (!arg.is_loop_invariant) { // Follow the edge from the Enter to Merge. const Edge* enter_merge = nullptr; @@ -530,6 +495,12 @@ Status FunctionalizeLoop(const FunctionLibraryDefinition* lookup_library, builder.Attr("cond", cond_name); builder.Attr("body", body_name); string outside_compilation; + string frontend_attributes; + if (GetNodeAttr(frame->loop_cond->def(), kXlaFrontendAttributesAttrName, + &frontend_attributes) + .ok()) { + builder.Attr(kXlaFrontendAttributesAttrName, frontend_attributes); + } if (GetNodeAttr(frame->loop_cond->def(), kXlaOutsideCompilationAttrName, &outside_compilation) .ok()) { @@ -537,7 +508,7 @@ Status FunctionalizeLoop(const FunctionLibraryDefinition* lookup_library, } std::vector inputs; for (int i = 0; i < frame->args.size(); ++i) { - const Arg& arg = frame->args[i]; + const WhileLoopArg& arg = frame->args[i]; const Edge* in_edge; TF_RETURN_IF_ERROR(arg.enter->input_edge(0, &in_edge)); if (in_edge->IsControlEdge()) { @@ -553,7 +524,7 @@ Status FunctionalizeLoop(const FunctionLibraryDefinition* lookup_library, // Copies edges to the Enter nodes and from the Exit nodes onto the While. 
for (int i = 0; i < frame->args.size(); ++i) { - const Arg& arg = frame->args[i]; + const WhileLoopArg& arg = frame->args[i]; const Edge* in_edge; TF_RETURN_IF_ERROR(arg.enter->input_edge(0, &in_edge)); if (in_edge->IsControlEdge()) { @@ -613,39 +584,11 @@ Status FunctionalizeWhileLoop(const FunctionLibraryDefinition* lookup_library, } // Builds Frames, indexed by name. - std::unordered_map frames; - for (Node* node : graph->op_nodes()) { - const ControlFlowInfo& cf = cf_info[node->id()]; - - VLOG(2) << "node: " << node->name() << " (" << node->id() - << ") frame_name: " << cf.frame_name - << " frame: " << (cf.frame ? cf.frame->name() : "---") - << " parent_frame: " - << (cf.parent_frame ? cf.parent_frame->name() : "---"); - TF_RET_CHECK(cf.frame != nullptr && cf.parent_frame != nullptr); - - Frame& frame = frames[cf.frame_name]; - Frame* parent = &frames[cf_info[cf.parent_frame->id()].frame_name]; - if (frame.parent == nullptr) { - frame.parent = parent; - frame.name = cf.frame_name; - ++parent->num_children; - } - - if (IsEnter(node)) { - Arg arg; - arg.enter = node; - TF_RETURN_IF_ERROR(GetNodeAttr(arg.enter->attrs(), "is_constant", - &arg.is_loop_invariant)); - frame.args.push_back(arg); - } else if (IsLoopCond(node)) { - frame.loop_cond = node; - } - frame.nodes.insert(node); - } + std::unordered_map frames; + TF_RETURN_IF_ERROR(ExtractWhileLoopFrames(cf_info, graph, &frames)); // Adds frames with no children (i.e., the innermost frames) to a worklist. - std::deque worklist; + std::deque worklist; for (auto& frame : frames) { if (frame.second.num_children == 0) { worklist.push_back(&frame.second); @@ -654,7 +597,7 @@ Status FunctionalizeWhileLoop(const FunctionLibraryDefinition* lookup_library, // Eliminate loops from innermost to outermost. while (!worklist.empty()) { - Frame* frame = worklist.front(); + WhileLoopFrame* frame = worklist.front(); worklist.pop_front(); if (frame->parent == frame) { // Skip the root frame. diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD index 139d6709215..d60b4ca0b2b 100644 --- a/tensorflow/compiler/tf2xla/kernels/BUILD +++ b/tensorflow/compiler/tf2xla/kernels/BUILD @@ -55,8 +55,8 @@ tf_kernel_library( "lrn_ops.cc", "matmul_op.cc", "matrix_band_part_op.cc", + "matrix_diag_ops.cc", "matrix_inverse_op.cc", - "matrix_set_diag_op.cc", "matrix_triangular_solve_op.cc", "mirror_pad_op.cc", "next_after_op.cc", @@ -132,6 +132,8 @@ tf_kernel_library( ":if_op", ":tensor_list_utils", ":while_op", + "//tensorflow/compiler/jit:xla_activity_listener", + "//tensorflow/compiler/jit:xla_activity_proto_cc", "//tensorflow/compiler/tf2xla:common", "//tensorflow/compiler/tf2xla:xla_compiler", "//tensorflow/compiler/tf2xla/lib:broadcast", @@ -202,6 +204,7 @@ tf_kernel_library( "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", "@com_google_absl//absl/types:optional", "@com_google_absl//absl/types:span", ], diff --git a/tensorflow/compiler/tf2xla/kernels/diag_op.cc b/tensorflow/compiler/tf2xla/kernels/diag_op.cc index 747ec133983..1f12c7980e7 100644 --- a/tensorflow/compiler/tf2xla/kernels/diag_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/diag_op.cc @@ -20,8 +20,10 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/lib/constants.h" #include "tensorflow/compiler/xla/client/lib/matrix.h" +#include "tensorflow/compiler/xla/client/lib/pooling.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/util.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/framework/op_kernel.h" namespace tensorflow { @@ -153,52 +155,5 @@ class DiagPartOp : public XlaOpKernel { REGISTER_XLA_OP(Name("DiagPart"), DiagPartOp); -class MatrixDiagOp : public XlaOpKernel { - public: - explicit MatrixDiagOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} - - void Compile(XlaOpKernelContext* ctx) override { - OP_REQUIRES(ctx, ctx->num_inputs() >= 1, - errors::InvalidArgument("MatrixDiag op must have at an input")); - const TensorShape input_shape = ctx->InputShape(0); - - auto dims = input_shape.dim_sizes(); - OP_REQUIRES(ctx, !dims.empty(), - errors::InvalidArgument("Expected 1 <= dims, got shape ", - input_shape.DebugString())); - - - int last_dim = dims.size() - 1; - int64 last_dim_size = input_shape.dim_size(last_dim); - absl::Span other_dims(dims); - other_dims.remove_suffix(1); - - xla::XlaOp input = ctx->Input(0); - xla::XlaOp diag = CreateDiagonal(input, last_dim_size, other_dims); - ctx->SetOutput(0, diag); - } -}; - -REGISTER_XLA_OP(Name("MatrixDiag"), MatrixDiagOp); - -class MatrixDiagPartOp : public XlaOpKernel { - public: - explicit MatrixDiagPartOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} - - void Compile(XlaOpKernelContext* ctx) override { - const TensorShape input_shape = ctx->InputShape(0); - auto dims = input_shape.dim_sizes(); - - OP_REQUIRES(ctx, 2 <= dims.size(), - errors::InvalidArgument("Expected 2 <= dims, got shape ", - input_shape.DebugString())); - - xla::XlaOp input = ctx->Input(0); - ctx->SetOutput(0, xla::GetMatrixDiagonal(input)); - } -}; - -REGISTER_XLA_OP(Name("MatrixDiagPart"), MatrixDiagPartOp); - } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc b/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc index b309541a864..8e53ca162f5 100644 --- a/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc @@ -14,7 +14,10 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/compiler/tf2xla/kernels/image_resize_ops.h" +#include "absl/strings/str_format.h" #include "absl/types/span.h" +#include "tensorflow/compiler/jit/xla_activity.pb.h" +#include "tensorflow/compiler/jit/xla_activity_listener.h" #include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/tf2xla/type_util.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" @@ -255,6 +258,15 @@ xla::XlaOp ResizeUsingDilationAndConvolution( ResizeConvolutionDims dims = ComputeResizeConvolutionParameters(in_size, out_size, align_corners); + + if (dims.kernel_size[0] * dims.kernel_size[1] > + kMax2DKernelSize * kMax2DKernelSize) { + BroadcastOptimizationRemark( + XlaOptimizationRemark::SLOW_IMAGE_RESIZE_DIMENSIONS, + absl::StrFormat("%dx%d", dims.kernel_size[0], dims.kernel_size[1])) + .IgnoreError(); + } + xla::XlaOp output; // Concatenation and padding below currently assumes num_spatial_dims is 2 to diff --git a/tensorflow/compiler/tf2xla/kernels/matrix_diag_ops.cc b/tensorflow/compiler/tf2xla/kernels/matrix_diag_ops.cc new file mode 100644 index 00000000000..7eeb05a4920 --- /dev/null +++ b/tensorflow/compiler/tf2xla/kernels/matrix_diag_ops.cc @@ -0,0 +1,425 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/tf2xla/xla_helpers.h" +#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/lib/constants.h" +#include "tensorflow/compiler/xla/client/lib/matrix.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/primitive_util.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/lib/core/errors.h" + +namespace tensorflow { +namespace { + +// Reads or infers lower_diag_index and upper_diag_index from kernel's input +// parameter "k". Also validates their values. +std::pair ProcessDiagIndex(XlaOpKernelContext* context) { + int64 lower_diag_index = 0; + int64 upper_diag_index = 0; + TensorShape diag_index_shape = context->InputShape("k"); + + // Wrapping OP_REQUIRES* macros with a function because they can "return;" + // early (without values) which contradicts ProcessDiagIndex's signature. 
+ auto validate_diag_indices = [&]() { + if (diag_index_shape.dims() == 0) { + OP_REQUIRES_OK(context, + context->ConstantInputAsIntScalar("k", &lower_diag_index)); + upper_diag_index = lower_diag_index; + } else { + std::vector diag_index; + OP_REQUIRES_OK(context, + context->ConstantInputAsIntVector("k", &diag_index)); + OP_REQUIRES( + context, !diag_index.empty() && diag_index.size() <= 2, + errors::InvalidArgument( + "diag_index must have only one or two elements, received ", + diag_index.size(), " elements.")); + lower_diag_index = diag_index[0]; + upper_diag_index = + (diag_index.size() > 1) ? diag_index[1] : lower_diag_index; + } + OP_REQUIRES( + context, lower_diag_index <= upper_diag_index, + errors::InvalidArgument( + "lower_diag_index must not be larger than upper_diag_index: ", + lower_diag_index, " > ", upper_diag_index)); + }; + validate_diag_indices(); + return {lower_diag_index, upper_diag_index}; +} + +// Makes sure lower_diag_index and upper_diag_index are consistent with the +// input matrix size. +void ValidateDiagIndexWithOutputMatrixSize(XlaOpKernelContext* context, + const int64 lower_diag_index, + const int64 upper_diag_index, + const int64 num_rows, + const int64 num_cols) { + // `lower_diag_index == 0` condition is added to handle matrix shape = 0. + OP_REQUIRES(context, + (-num_rows < lower_diag_index && lower_diag_index < num_cols) || + lower_diag_index == 0, + errors::InvalidArgument( + "lower_diag_index is out of bound: ", lower_diag_index, + " It must be between ", -num_rows, " and ", num_cols)); + OP_REQUIRES(context, + (-num_rows < upper_diag_index && upper_diag_index < num_cols) || + upper_diag_index == 0, + errors::InvalidArgument( + "upper_diag_index is out of bound: ", upper_diag_index, + " It must be between ", -num_rows, " and ", num_cols)); + OP_REQUIRES(context, lower_diag_index <= upper_diag_index, + errors::InvalidArgument( + "lower_diag_index must not be larger than upper_diag_index: ", + lower_diag_index, " > ", upper_diag_index)); +} + +// Kernel to set matrix diagonals. +xla::XlaOp SetMatrixDiag(const xla::XlaOp input, const xla::XlaOp diag, + const TensorShape& input_shape, const int64 diag_rank, + const int64 num_diags, const int64 lower_diag_index, + const int64 upper_diag_index, const int64 max_diag_len, + const int64 num_rows, const int64 num_cols) { + // Creates a padding config. + const int input_rank = input_shape.dims(); + xla::PaddingConfig padding_config; + padding_config = xla::MakeNoPaddingConfig(input_rank - 1); + + // Processes one diagonal at a time: + // 1) Extracts a single diagonal (diag_slice). + // 2) Broadcasts its contents to fill the whole matrix (diag_broadcast). + // 3) Masks diag_broadcast to get the right diagonal shape. + // + // XLA can fuse multiple Broadcasts and Selects so this shouldn't be slow. + // + // For example, + // diag = [[2, 3, 0], k = (-1, 1), and num_rows = 4. + // [4, 5, 6], + // [7, 8, 9]] + // The expected output is [[4, 2, 0], + // [7, 5, 4], + // [0, 8, 6], + // [0, 0, 9]] + // The 1st diagonal is created by: + // 1) Extracting diag_slice = [1, 2, 0]. + // 2) Padding the vector to be as long as num_rows, + // diag_slice = [1, 2, 0, 0], + // then broadcasting diag_slice row-wise to a full matrix, + // diag_broadcast = [[1, 1, 1], + // [2, 2, 2], + // [0, 0, 0], + // [0, 0, 0]] + // The padding value can be anything because it will not appear in the + // results after masking. Here, we use zero. + // 3) Masking diag_broadcast with a mask of the shape of the 1st diagonal. 
+ // mask = [[0, 1, 0], --> output = [[x, 2, x], + // [0, 0, 1], [x, x, 3], + // [0, 0, 0], [x, x, x], + // [0, 0, 0]] [x, x, x]], + // where x denotes the existing input contents. + std::vector broadcast_dimensions(input_rank - 1); + absl::c_iota(broadcast_dimensions, 0); + auto output = input; + for (int64 diag_index = lower_diag_index; diag_index <= upper_diag_index; + ++diag_index) { + // Extracts a single diagonal. + auto diag_slice = diag; + if (num_diags > 1) { + const int64 mapped_diag_index = upper_diag_index - diag_index; + diag_slice = xla::Collapse( + xla::SliceInDim(diag, mapped_diag_index, mapped_diag_index + 1, 1, + diag_rank - 2), + {diag_rank - 2, diag_rank - 1}); + } + + // Pads if necessary. Always pad at the end because shorter diagonals in + // the input come padded at the end. + const int64 padding_length = + ((diag_index <= 0) ? num_cols : num_rows) - max_diag_len; + const xla::XlaOp zero = xla::ScalarLike(input, 0); + if (padding_length > 0) { + padding_config.mutable_dimensions(input_rank - 2) + ->set_edge_padding_high(padding_length); + diag_slice = xla::Pad(diag_slice, zero, padding_config); + } + + // Broadcasts column-wise for subdiagonals; row-wise for superdiagonals. + broadcast_dimensions.back() = + (diag_index <= 0) ? input_rank - 1 : input_rank - 2; + xla::XlaOp diag_broadcast = xla::BroadcastInDim( + diag_slice, input_shape.dim_sizes(), broadcast_dimensions); + const auto mask = xla::GetDiagonalMask(output, diag_index); + output = xla::Select(mask, diag_broadcast, output); + } + return output; +} + +} // namespace + +class MatrixDiagOp : public XlaOpKernel { + public: + explicit MatrixDiagOp(OpKernelConstruction* context) : XlaOpKernel(context) {} + + void Compile(XlaOpKernelContext* context) override { + OP_REQUIRES( + context, context->num_inputs() >= 1, + errors::InvalidArgument("MatrixDiag op must have at least one input")); + const TensorShape diag_shape = context->InputShape(0); + OP_REQUIRES(context, TensorShapeUtils::IsVectorOrHigher(diag_shape), + errors::InvalidArgument("Expected >= 1 dims, got shape ", + diag_shape.DebugString())); + + const DataType dtype = context->expected_output_dtype(0); + const xla::XlaOp zero = XlaHelpers::Zero(context->builder(), dtype); + + // Initializes MatrixDiagV2-specific variables. + // Input arguments providing the values of num_rows and num_cols can be + // absent (-1) and will be inferred later. + int64 lower_diag_index = 0; + int64 upper_diag_index = 0; + int64 num_rows = -1; + int64 num_cols = -1; + xla::XlaOp padding_value = zero; + + // MatrixDiag and MatrixDiagV2 both use this OpKernel. MatrixDiag only has + // one input, so we have to check the number of inputs before reading + // additional parameters for MatrixDiagV2. + if (context->num_inputs() > 1) { + std::tie(lower_diag_index, upper_diag_index) = ProcessDiagIndex(context); + OP_REQUIRES_OK(context, context->ConstantInputAsIntScalar(2, &num_rows)); + OP_REQUIRES_OK(context, context->ConstantInputAsIntScalar(3, &num_cols)); + padding_value = context->Input(4); + } + + // More size validations. 
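// Worked example for the size checks and the num_rows/num_cols inference
// below (values chosen here only for illustration): for a single diagonal of
// length max_diag_len = 3 placed at k = 1,
//   min_num_rows = 3 - min(1, 0) = 3 and min_num_cols = 3 + max(1, 0) = 4.
// With both num_rows and num_cols unspecified (-1), the output becomes the
// square max(3, 4) = 4x4 matrix; with num_rows = 3 given, num_cols defaults
// to min_num_cols = 4 and the diagonal fills positions (0,1), (1,2), (2,3)
// of the 3x4 output, satisfying num_rows == min_num_rows.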
+ const int64 diag_rank = diag_shape.dims(); + const int64 max_diag_len = diag_shape.dim_size(diag_rank - 1); + const int64 num_diags = upper_diag_index - lower_diag_index + 1; + OP_REQUIRES( + context, + num_diags == 1 || num_diags == diag_shape.dim_size(diag_rank - 2), + errors::InvalidArgument( + "The number of diagonals provided in the input does not " + "match the lower_diag_index and upper_diag_index range.")); + const int64 min_num_rows = max_diag_len - std::min(upper_diag_index, 0LL); + const int64 min_num_cols = max_diag_len + std::max(lower_diag_index, 0LL); + OP_REQUIRES(context, num_rows == -1 || num_rows >= min_num_rows, + errors::InvalidArgument("The number of rows is too small.")); + OP_REQUIRES(context, num_cols == -1 || num_cols >= min_num_cols, + errors::InvalidArgument("The number of columns is too small.")); + + // Infers num_rows and num_cols. If both are unknown, assume that the output + // is square. Otherwise, use smallest possible values. + if (num_rows == -1 && num_cols == -1) { + num_rows = std::max(min_num_rows, min_num_cols); + num_cols = num_rows; + } else if (num_rows == -1) { + num_rows = min_num_rows; + } else if (num_cols == -1) { + num_cols = min_num_cols; + } + + // At least one of num_rows and num_cols must match its minimum length. + // Otherwise, we'll have some incomplete diagonals. + OP_REQUIRES(context, num_rows == min_num_rows || num_cols == min_num_cols, + errors::InvalidArgument( + "The number of rows or columns is not consistent with " + "the specified d_lower, d_upper, and diagonal.")); + + // Actual processing. + // Initializes the output tensor with padding_value. + TensorShape output_shape = diag_shape; + output_shape.RemoveLastDims((num_diags == 1) ? 1 : 2); + output_shape.AddDim(num_rows); + output_shape.AddDim(num_cols); + xla::XlaOp output = xla::Broadcast(padding_value, output_shape.dim_sizes()); + xla::XlaOp diag = context->Input(0); + context->SetOutput( + 0, SetMatrixDiag(output, diag, output_shape, diag_rank, num_diags, + lower_diag_index, upper_diag_index, max_diag_len, + num_rows, num_cols)); + } +}; + +REGISTER_XLA_OP(Name("MatrixDiag"), MatrixDiagOp); +REGISTER_XLA_OP(Name("MatrixDiagV2") + .CompileTimeConstantInput("k") + .CompileTimeConstantInput("num_rows") + .CompileTimeConstantInput("num_cols") + .CompileTimeConstantInput("padding_value"), + MatrixDiagOp); + +class MatrixDiagPartOp : public XlaOpKernel { + public: + explicit MatrixDiagPartOp(OpKernelConstruction* context) + : XlaOpKernel(context) {} + + void Compile(XlaOpKernelContext* context) override { + const TensorShape input_shape = context->InputShape(0); + const int input_rank = input_shape.dims(); + + OP_REQUIRES(context, TensorShapeUtils::IsMatrixOrHigher(input_shape), + errors::InvalidArgument( + "input must be at least 2-dim, received shape: ", + input_shape.DebugString())); + + const DataType dtype = context->expected_output_dtype(0); + const xla::XlaOp zero = XlaHelpers::Zero(context->builder(), dtype); + + // Initializes MatrixDiagPartV2-specific variables. + int64 lower_diag_index = 0; + int64 upper_diag_index = 0; + xla::XlaOp padding_value = zero; + + // MatrixDiagPart and MatrixDiagPartV2 both use this OpKernel. + // MatrixDiagPart only has one input, so we have to check the number of + // inputs before reading additional parameters in MatrixDiagV2. + if (context->num_inputs() > 1) { + std::tie(lower_diag_index, upper_diag_index) = ProcessDiagIndex(context); + padding_value = context->Input(2); + } + + // Checks if diag sizes are consistent with input. 
+ const int64 num_rows = input_shape.dim_size(input_rank - 2); + const int64 num_cols = input_shape.dim_size(input_rank - 1); + ValidateDiagIndexWithOutputMatrixSize(context, lower_diag_index, + upper_diag_index, num_rows, num_cols); + + // Creates output shape. + TensorShape output_shape = input_shape; + output_shape.RemoveLastDims(2); + const int num_diags = upper_diag_index - lower_diag_index + 1; + if (num_diags > 1) output_shape.AddDim(num_diags); + const int32 max_diag_len = + std::min(num_rows + std::min(upper_diag_index, 0LL), + num_cols - std::max(lower_diag_index, 0LL)); + output_shape.AddDim(max_diag_len); + + // Computes output. + xla::XlaOp input = context->Input(0); + std::vector diag_list; + xla::PaddingConfig padding_config; + if (num_diags == 1) { + context->SetOutput(0, xla::GetMatrixDiagonal(input, upper_diag_index)); + return; + } + padding_config = xla::MakeNoPaddingConfig(input_rank - 1); + for (int diag_index = upper_diag_index; diag_index >= lower_diag_index; + --diag_index) { + auto single_diag = xla::GetMatrixDiagonal(input, diag_index); + const int64 diag_length = + (diag_index >= 0) ? (num_cols - diag_index) : (num_rows + diag_index); + const int64 padding_length = max_diag_len - diag_length; + if (padding_length > 0) { + padding_config.mutable_dimensions(input_rank - 2) + ->set_edge_padding_high(padding_length); + single_diag = xla::Pad(single_diag, padding_value, padding_config); + } + diag_list.emplace_back(single_diag); + } + auto concat = + xla::ConcatInDim(context->builder(), diag_list, input_rank - 2); + context->SetOutput(0, xla::Reshape(concat, output_shape.dim_sizes())); + } +}; + +REGISTER_XLA_OP(Name("MatrixDiagPart"), MatrixDiagPartOp); +REGISTER_XLA_OP(Name("MatrixDiagPartV2") + .CompileTimeConstantInput("k") + .CompileTimeConstantInput("padding_value"), + MatrixDiagPartOp); + +class MatrixSetDiagOp : public XlaOpKernel { + public: + explicit MatrixSetDiagOp(OpKernelConstruction* context) + : XlaOpKernel(context) {} + + void Compile(XlaOpKernelContext* context) override { + const TensorShape input_shape = context->InputShape(0); + const TensorShape diag_shape = context->InputShape(1); + const int input_rank = input_shape.dims(); + const int diag_rank = diag_shape.dims(); + + // Preliminary validation of sizes. + OP_REQUIRES(context, TensorShapeUtils::IsMatrixOrHigher(input_shape), + errors::InvalidArgument( + "input must be at least 2-dim, received shape: ", + input_shape.DebugString())); + OP_REQUIRES(context, TensorShapeUtils::IsVectorOrHigher(diag_shape), + errors::InvalidArgument( + "diagonal must be at least 1-dim, received shape: ", + diag_shape.DebugString())); + + // MatrixSetDiag and MatrixSetDiagV2 both use this OpKernel. MatrixSetDiag + // only has two inputs, so we have to check the number of inputs before + // reading additional parameters in MatrixSetDiagV2. + int64 lower_diag_index = 0; + int64 upper_diag_index = 0; + if (context->num_inputs() > 2) { + std::tie(lower_diag_index, upper_diag_index) = ProcessDiagIndex(context); + } + + // Checks if diag sizes are consistent with input. 
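// Worked example for the diagonal-band bounds used by MatrixDiagPartV2 above
// and MatrixSetDiagV2 below (values chosen here only for illustration): with
// num_rows = 4, num_cols = 3 and k = (-1, 1),
//   max_diag_len = min(4 + min(1, 0), 3 - max(-1, 0)) = min(4, 3) = 3,
// so each of the num_diags = 3 diagonals is handled with length 3: k = 1 has
// true length 3 - 1 = 2 and is padded by one element at the end, while k = 0
// (length 3) and k = -1 (length 4 - 1 = 3) need no padding. The diagonals are
// ordered from k = 1 down to k = -1, giving a [..., 3, 3] result for
// MatrixDiagPart and a diagonal input of that same shape for MatrixSetDiag.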
+ const int64 num_rows = input_shape.dim_size(input_rank - 2); + const int64 num_cols = input_shape.dim_size(input_rank - 1); + ValidateDiagIndexWithOutputMatrixSize(context, lower_diag_index, + upper_diag_index, num_rows, num_cols); + const Eigen::Index num_diags = upper_diag_index - lower_diag_index + 1; + OP_REQUIRES( + context, + lower_diag_index == upper_diag_index || + (diag_shape.dim_size(input_rank - 2) == num_diags), + errors::InvalidArgument("The number of diagonals provided in `diag` " + "is not consistent with `lower_diag_index` and " + "`upper_diag_index`")); + + TensorShape expected_diag_shape = input_shape; + expected_diag_shape.RemoveLastDims(2); + if (num_diags > 1) expected_diag_shape.AddDim(num_diags); + const int32 max_diag_len = + std::min(num_rows + std::min(upper_diag_index, 0LL), + num_cols - std::max(lower_diag_index, 0LL)); + expected_diag_shape.AddDim(max_diag_len); + OP_REQUIRES( + context, expected_diag_shape == diag_shape, + errors::InvalidArgument( + "Either first dimensions of diagonal don't match input.shape[:-2], " + "or diagonal.shape[:-1] is not equal to the longests diagonal in " + "range [lower_diag_index:upper_diag_index].\nInput shape: ", + input_shape.DebugString(), + "\nDiagonal shape: ", diag_shape.DebugString(), + "\nExpected diagonal shape: ", expected_diag_shape.DebugString())); + + // Actual processing. + xla::XlaOp input = context->Input(0); + xla::XlaOp diag = context->Input(1); + context->SetOutput( + 0, SetMatrixDiag(input, diag, input_shape, diag_rank, num_diags, + lower_diag_index, upper_diag_index, max_diag_len, + num_rows, num_cols)); + } + + private: + TF_DISALLOW_COPY_AND_ASSIGN(MatrixSetDiagOp); +}; + +REGISTER_XLA_OP(Name("MatrixSetDiag"), MatrixSetDiagOp); +REGISTER_XLA_OP(Name("MatrixSetDiagV2").CompileTimeConstantInput("k"), + MatrixSetDiagOp); + +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/matrix_set_diag_op.cc b/tensorflow/compiler/tf2xla/kernels/matrix_set_diag_op.cc deleted file mode 100644 index ee9764c0c35..00000000000 --- a/tensorflow/compiler/tf2xla/kernels/matrix_set_diag_op.cc +++ /dev/null @@ -1,98 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/tf2xla/xla_helpers.h" -#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" -#include "tensorflow/compiler/tf2xla/xla_op_registry.h" -#include "tensorflow/compiler/xla/client/xla_builder.h" -#include "tensorflow/compiler/xla/primitive_util.h" - -namespace tensorflow { - -class MatrixSetDiagOp : public XlaOpKernel { - public: - explicit MatrixSetDiagOp(OpKernelConstruction* context) - : XlaOpKernel(context) {} - - void Compile(XlaOpKernelContext* context) override { - const TensorShape input_shape = context->InputShape(0); - const TensorShape diag_shape = context->InputShape(1); - - const int rank = input_shape.dims(); - - // Preliminary validation of sizes. 
- OP_REQUIRES(context, TensorShapeUtils::IsMatrixOrHigher(input_shape), - errors::InvalidArgument( - "input must be at least 2-dim, received shape: ", - input_shape.DebugString())); - - // Check to make sure the last dimension of diag is equal to the smaller of - // the last two dimensions of input. - const int64 m = input_shape.dim_size(rank - 2); - const int64 n = input_shape.dim_size(rank - 1); - const int64 min_dim = std::min(m, n); - - TensorShape batch_shape = input_shape; - batch_shape.RemoveLastDims(2); - - TensorShape expected_diag_shape = batch_shape; - expected_diag_shape.AddDim(min_dim); - OP_REQUIRES(context, expected_diag_shape == diag_shape, - errors::InvalidArgument( - "must have diagonal.shape == input.shape[:-2] + " - "min(input.shape[-2:]), but received input shape: ", - input_shape.DebugString(), - " and diagonal shape: ", diag_shape.DebugString())); - - xla::XlaBuilder* builder = context->builder(); - xla::XlaOp input = context->Input(0); - xla::XlaOp diag = context->Input(1); - - auto zero = XlaHelpers::Zero(builder, context->input_type(0)); - - // Create an indicator tensor that is true only on the diagonal. - xla::XlaOp iota_m = xla::Iota(builder, xla::S32, m); - xla::XlaOp iota_n = xla::Iota(builder, xla::S32, n); - auto indicator = xla::Eq(iota_m, xla::Broadcast(iota_n, {m}), - /*broadcast_dimensions=*/{0}); - indicator = xla::Broadcast(indicator, batch_shape.dim_sizes()); - - // Broadcast diag up to the input shape. Use an implicit broadcast (Add/Or) - // because we need to broadcast on the right. - std::vector diag_broadcast_dims(rank - 1); - std::iota(diag_broadcast_dims.begin(), diag_broadcast_dims.end(), 0); - if (min_dim != m) { - diag_broadcast_dims.back() = rank - 1; - } - if (context->input_xla_type(0) == xla::PRED) { - diag = xla::Or(diag, xla::Broadcast(zero, input_shape.dim_sizes()), - /*broadcast_dimensions=*/diag_broadcast_dims); - - } else { - diag = xla::Add(diag, xla::Broadcast(zero, input_shape.dim_sizes()), - /*broadcast_dimensions=*/diag_broadcast_dims); - } - - auto output = xla::Select(indicator, diag, input); - context->SetOutput(0, output); - } - - private: - TF_DISALLOW_COPY_AND_ASSIGN(MatrixSetDiagOp); -}; - -REGISTER_XLA_OP(Name("MatrixSetDiag"), MatrixSetDiagOp); - -} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/quantize_and_dequantize_op.cc b/tensorflow/compiler/tf2xla/kernels/quantize_and_dequantize_op.cc index 063b97cd593..905f83fef9a 100644 --- a/tensorflow/compiler/tf2xla/kernels/quantize_and_dequantize_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/quantize_and_dequantize_op.cc @@ -47,6 +47,11 @@ class QuantizeAndDequantizeOp : public XlaOpKernel { OP_REQUIRES_OK(ctx, ctx->GetAttr("signed_input", &signed_input_)); OP_REQUIRES_OK(ctx, ctx->GetAttr("range_given", &range_given_)); OP_REQUIRES_OK(ctx, ctx->GetAttr("narrow_range", &narrow_range_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("axis", &axis_)); + // TODO(b/140109958): Implement for axis != -1. 
+ OP_REQUIRES(ctx, axis_ == -1, + errors::Unimplemented("QuantizeAndDequantizeOp with axis >= 0 " + "not yet implemented for XLA")); round_mode_ = ROUND_HALF_TO_EVEN; } @@ -156,6 +161,7 @@ class QuantizeAndDequantizeOp : public XlaOpKernel { protected: int64 num_bits_ = -1; + int axis_; bool signed_input_; bool range_given_; bool narrow_range_; diff --git a/tensorflow/compiler/tf2xla/kernels/roll_op.cc b/tensorflow/compiler/tf2xla/kernels/roll_op.cc index a6cc5960c90..99f4a5f46d7 100644 --- a/tensorflow/compiler/tf2xla/kernels/roll_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/roll_op.cc @@ -47,11 +47,8 @@ class RollOp : public XlaOpKernel { xla::PrimitiveType shift_type = ctx->input_xla_type(1); int64 num_axes = axis_shape.dims() == 0 ? 1 : axis_shape.dim_size(0); for (int64 i = 0; i != num_axes; ++i) { - auto cur_axis_status = axis_shape.dims() == 0 - ? axis.GetIntegralAsS64({}) - : axis.GetIntegralAsS64({i}); - OP_REQUIRES_OK(ctx, cur_axis_status.status()); - int64 cur_axis = cur_axis_status.ValueOrDie(); + int64 cur_axis = axis_shape.dims() == 0 ? *axis.GetIntegralAsS64({}) + : *axis.GetIntegralAsS64({i}); xla::XlaOp offset = shift_shape.dims() == 0 diff --git a/tensorflow/compiler/tf2xla/kernels/shape_op.cc b/tensorflow/compiler/tf2xla/kernels/shape_op.cc index 265e7e784a9..88af12dacee 100644 --- a/tensorflow/compiler/tf2xla/kernels/shape_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/shape_op.cc @@ -40,9 +40,23 @@ class ShapeOp : public XlaOpKernel { void Compile(XlaOpKernelContext* ctx) override { const TensorShape input_shape = ctx->InputShape(0); - Tensor shape_constant(out_dtype_, TensorShape({input_shape.dims()})); - OP_REQUIRES_OK(ctx, TensorShapeToConstant(input_shape, &shape_constant)); - ctx->SetConstantOutput(0, shape_constant); + std::vector operands; + const int rank = input_shape.dims(); + if (rank != 0) { + for (int64 i = 0; i < rank; ++i) { + operands.push_back(xla::Broadcast( + xla::ConvertElementType(xla::GetDimensionSize(ctx->Input(0), i), + ctx->output_xla_type(0)), + {1})); + } + + ctx->SetOutput(0, xla::ConcatInDim(ctx->builder(), operands, 0)); + } else { + // Rank 0 won't have dynamic size dimension, use constant output. + Tensor shape_constant(out_dtype_, TensorShape({input_shape.dims()})); + OP_REQUIRES_OK(ctx, TensorShapeToConstant(input_shape, &shape_constant)); + ctx->SetConstantOutput(0, shape_constant); + } } private: diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc b/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc index ac3d2c22d65..4af3d4233dd 100644 --- a/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include +#include "tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h" #include "tensorflow/compiler/tf2xla/kernels/tensor_list_utils.h" #include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/tf2xla/type_util.h" @@ -307,6 +308,59 @@ class TensorListGetItemOp : public XlaOpKernel { REGISTER_XLA_OP(Name("TensorListGetItem"), TensorListGetItemOp); +class TensorListGatherOp : public XlaOpKernel { + public: + explicit TensorListGatherOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("element_dtype", &dtype_)); + } + + void Compile(XlaOpKernelContext* ctx) override { + // Check that the TensorList is initialized. 
+ bool is_initialized; + OP_REQUIRES_OK(ctx, + (IsTensorListInitialized(ctx->Input(0), &is_initialized))); + OP_REQUIRES(ctx, is_initialized, + errors::InvalidArgument("TensorList is not initialized")); + + // Only non-nested TensorList is supported for now. + bool is_nested; + OP_REQUIRES_OK(ctx, IsNestedTensorList(ctx->Input(0), &is_nested)); + OP_REQUIRES(ctx, !is_nested, + errors::Unimplemented("Only non-nested TensorList is supported " + "for TensorListGather.")); + + DataType indices_type = ctx->input_type(1); + + const TensorShape indices_shape = ctx->InputShape(1); + OP_REQUIRES(ctx, indices_shape.dims() == 1, + errors::InvalidArgument("indices must be rank 1")); + + xla::XlaOp list = ctx->Input(0); + xla::XlaOp indices = ctx->Input(1); + + xla::XlaOp buffer; + OP_REQUIRES_OK(ctx, GetTensorListBuffer(list, &buffer)); + xla::Shape buffer_xla_shape; + OP_REQUIRES_OK(ctx, GetTensorListBufferShape(list, &buffer_xla_shape)); + TensorShape buffer_shape; + OP_REQUIRES_OK(ctx, XLAShapeToTensorShape(buffer_xla_shape, &buffer_shape)); + + xla::XlaOp result; + OP_REQUIRES_OK( + ctx, XlaGather(buffer, buffer_shape, indices, indices_shape, /*axis=*/0, + /*indices_are_nd=*/false, dtype_, indices_type, + ctx->builder(), &result)); + ctx->SetOutput(0, result); + } + + private: + DataType dtype_; + + TF_DISALLOW_COPY_AND_ASSIGN(TensorListGatherOp); +}; + +REGISTER_XLA_OP(Name("TensorListGather"), TensorListGatherOp); + class TensorListStackOp : public XlaOpKernel { public: explicit TensorListStackOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} diff --git a/tensorflow/compiler/tf2xla/kernels/training_ops.cc b/tensorflow/compiler/tf2xla/kernels/training_ops.cc index 247db8d5d17..191ce9dee2b 100644 --- a/tensorflow/compiler/tf2xla/kernels/training_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/training_ops.cc @@ -270,6 +270,53 @@ class ResourceApplyAdagrad : public XlaOpKernel { REGISTER_XLA_OP(Name("ResourceApplyAdagrad").TypeConstraint("T", kFloatTypes), ResourceApplyAdagrad); +class ResourceApplyAdagradV2 : public XlaOpKernel { + public: + explicit ResourceApplyAdagradV2(OpKernelConstruction* ctx) + : XlaOpKernel(ctx) {} + + void Compile(XlaOpKernelContext* ctx) override { + DataType type = ctx->input_type(2); + + TensorShape var_shape, accum_shape; + xla::XlaOp var, accum; + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, type, &var_shape, &var)); + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(1, type, &accum_shape, &accum)); + + OP_REQUIRES(ctx, var_shape.IsSameSize(accum_shape), + errors::InvalidArgument( + "var and accum do not have the same shape", + var_shape.DebugString(), " ", accum_shape.DebugString())); + + TensorShape lr_shape = ctx->InputShape(2); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr_shape), + errors::InvalidArgument("lr is not a scalar: ", + lr_shape.DebugString())); + + TensorShape epsilon_shape = ctx->InputShape(3); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(epsilon_shape), + errors::InvalidArgument("epsilon is not a scalar: ", + epsilon_shape.DebugString())); + + TensorShape grad_shape = ctx->InputShape(4); + OP_REQUIRES(ctx, var_shape.IsSameSize(grad_shape), + errors::InvalidArgument( + "var and grad do not have the same shape", + var_shape.DebugString(), " ", grad_shape.DebugString())); + + xla::XlaOp lr = ctx->Input(2); + xla::XlaOp epsilon = ctx->Input(3); + xla::XlaOp grad = ctx->Input(4); + + accum = accum + xla::Square(grad); + var = var - grad * lr / (xla::Sqrt(accum) + epsilon); + OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, type, var)); + OP_REQUIRES_OK(ctx, 
ctx->AssignVariable(1, type, accum)); + } +}; +REGISTER_XLA_OP(Name("ResourceApplyAdagradV2").TypeConstraint("T", kFloatTypes), + ResourceApplyAdagradV2); + class ResourceApplyProximalAdagrad : public XlaOpKernel { public: explicit ResourceApplyProximalAdagrad(OpKernelConstruction* ctx) diff --git a/tensorflow/compiler/tf2xla/lib/util.cc b/tensorflow/compiler/tf2xla/lib/util.cc index d348d2b41dd..1991e332be8 100644 --- a/tensorflow/compiler/tf2xla/lib/util.cc +++ b/tensorflow/compiler/tf2xla/lib/util.cc @@ -69,6 +69,9 @@ xla::XlaOp IntegerLiteral(xla::XlaBuilder* builder, xla::PrimitiveType type, case xla::U8: literal = xla::LiteralUtil::CreateR0(value); break; + case xla::U16: + literal = xla::LiteralUtil::CreateR0(value); + break; case xla::U32: literal = xla::LiteralUtil::CreateR0(value); break; @@ -78,6 +81,9 @@ xla::XlaOp IntegerLiteral(xla::XlaBuilder* builder, xla::PrimitiveType type, case xla::S8: literal = xla::LiteralUtil::CreateR0(value); break; + case xla::S16: + literal = xla::LiteralUtil::CreateR0(value); + break; case xla::S32: literal = xla::LiteralUtil::CreateR0(value); break; @@ -98,9 +104,6 @@ xla::XlaOp IntegerLiteral(xla::XlaBuilder* builder, xla::PrimitiveType type, break; case xla::PRED: LOG(FATAL) << "pred element type is not integral"; - case xla::S16: - case xla::U16: - LOG(FATAL) << "u16/s16 literals not yet implemented"; case xla::BF16: literal = xla::LiteralUtil::CreateR0(static_cast(value)); diff --git a/tensorflow/compiler/tf2xla/python/BUILD b/tensorflow/compiler/tf2xla/python/BUILD index 3cc551e08aa..eaba5d3c420 100644 --- a/tensorflow/compiler/tf2xla/python/BUILD +++ b/tensorflow/compiler/tf2xla/python/BUILD @@ -1,5 +1,5 @@ load( - "//tensorflow/core:platform/default/build_config.bzl", + "//tensorflow/core/platform:default/build_config.bzl", "tf_py_clif_cc", ) load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library") diff --git a/tensorflow/compiler/tf2xla/rearrange_function_argument.cc b/tensorflow/compiler/tf2xla/rearrange_function_argument.cc index b376fe94743..b6f8928f31e 100644 --- a/tensorflow/compiler/tf2xla/rearrange_function_argument.cc +++ b/tensorflow/compiler/tf2xla/rearrange_function_argument.cc @@ -527,7 +527,7 @@ Status RearrangeFunctionArguments( // Rewrite If/While nodes. 
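Before the control-flow rewrites continue below, a note on the ResourceApplyAdagradV2 kernel added earlier in this change: it lowers the standard Adagrad-with-epsilon update, accum += grad^2 followed by var -= lr * grad / (sqrt(accum) + epsilon). A scalar, XLA-free sketch of that arithmetic (values are illustrative only):

#include <cmath>
#include <iostream>

int main() {
  // Hypothetical scalar state; the kernel applies the same update elementwise.
  double var = 1.0, accum = 0.1;
  const double lr = 0.01, epsilon = 1e-7, grad = 0.5;
  accum += grad * grad;                               // accum = accum + Square(grad)
  var -= grad * lr / (std::sqrt(accum) + epsilon);    // var = var - grad * lr / (Sqrt(accum) + epsilon)
  std::cout << "var=" << var << " accum=" << accum << "\n";
}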
for (Node* n : g->nodes()) { - if (n->type_string() == "While") { + if (n->IsWhileNode()) { bool node_rewritten; TF_RETURN_IF_ERROR(MaybeRewriteWhileNode(get_function_body_fn, g, n, fld, &node_rewritten)); diff --git a/tensorflow/compiler/tf2xla/resource_operation_table.cc b/tensorflow/compiler/tf2xla/resource_operation_table.cc index 1243e31a047..2db431c0413 100644 --- a/tensorflow/compiler/tf2xla/resource_operation_table.cc +++ b/tensorflow/compiler/tf2xla/resource_operation_table.cc @@ -57,6 +57,7 @@ CreateResourceOpInfoMap() { add("ResourceApplyAdaMax" , kReadWrite, kVariable); add("ResourceApplyAdadelta" , kReadWrite, kVariable); add("ResourceApplyAdagrad" , kReadWrite, kVariable); + add("ResourceApplyAdagradV2" , kReadWrite, kVariable), add("ResourceApplyAdagradDA" , kReadWrite, kVariable); add("ResourceApplyAdam" , kReadWrite, kVariable); add("ResourceApplyAddSign" , kReadWrite, kVariable); diff --git a/tensorflow/compiler/tf2xla/sharding_util.cc b/tensorflow/compiler/tf2xla/sharding_util.cc index 8aae498be10..4d5bf0835e1 100644 --- a/tensorflow/compiler/tf2xla/sharding_util.cc +++ b/tensorflow/compiler/tf2xla/sharding_util.cc @@ -53,7 +53,7 @@ xla::StatusOr> ParseShardingFromDevice( const string& device_name, int num_cores_per_replica, absl::optional explicit_sharding) { if (device_name.empty()) { - return absl::optional(); + return explicit_sharding; } DeviceNameUtils::ParsedName parsed_device; if (!DeviceNameUtils::ParseFullName(device_name, &parsed_device)) { diff --git a/tensorflow/compiler/tf2xla/side_effect_util.cc b/tensorflow/compiler/tf2xla/side_effect_util.cc index eebeec87b60..86d900363b8 100644 --- a/tensorflow/compiler/tf2xla/side_effect_util.cc +++ b/tensorflow/compiler/tf2xla/side_effect_util.cc @@ -28,6 +28,9 @@ const char kXlaHasHostTransferAttrName[] = "_xla_has_host_transfer"; const char kXlaReplicaIdAttrName[] = "_xla_replica_id"; +const char kXlaIsPlaceholderForTailOcAttrName[] = + "_xla_is_placeholder_for_tail_oc"; + Status SetDeviceOrdinalAttributeForNode(Node* node, int device_ordinal) { if (!HasNodeAttr(node->def(), kXlaHasHostTransferAttrName)) { return errors::InvalidArgument("Node ", node->DebugString(), @@ -50,7 +53,7 @@ Status SetDeviceOrdinalAttributeForNode(Node* node, int device_ordinal) { node->ClearAttr(attr_name); node->AddAttr(attr_name, branch_func); } - } else if (node->type_string() == "While") { + } else if (node->IsWhileNode()) { AttrValue device_ordinal_value; device_ordinal_value.set_i(device_ordinal); for (const string& attr_name : std::vector{"cond", "body"}) { diff --git a/tensorflow/compiler/tf2xla/side_effect_util.h b/tensorflow/compiler/tf2xla/side_effect_util.h index be26ba5769c..31326044738 100644 --- a/tensorflow/compiler/tf2xla/side_effect_util.h +++ b/tensorflow/compiler/tf2xla/side_effect_util.h @@ -41,6 +41,9 @@ extern const char kXlaHasHostTransferAttrName[]; // This attribute is the replica id for an outside compilation node node. extern const char kXlaReplicaIdAttrName[]; +// This node is a Placeholder node added for tail outside compilation. +extern const char kXlaIsPlaceholderForTailOcAttrName[]; + // Sets device ordinal attribute for nodes with attribute // `kXlaHasHostTransferAttrName`. 
Status SetDeviceOrdinalAttributeForNode(Node* node, int device_ordinal); diff --git a/tensorflow/compiler/tf2xla/tf2xla.cc b/tensorflow/compiler/tf2xla/tf2xla.cc index 3e4188f3c6d..3c2b256800c 100644 --- a/tensorflow/compiler/tf2xla/tf2xla.cc +++ b/tensorflow/compiler/tf2xla/tf2xla.cc @@ -384,8 +384,8 @@ Status InitGraph(const GraphDef& graph_def, const tf2xla::Config& config, TF_RETURN_IF_ERROR(AddDefaultAttrsToGraphDef( &second_copy_def, *g->op_registry(), /*node_offset=*/0)); - TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(GraphConstructorOptions(), - second_copy_def, g.get())); + TF_RETURN_IF_ERROR(ConvertGraphDefToGraph( + GraphConstructorOptions(), std::move(second_copy_def), g.get())); TF_RETURN_IF_ERROR(RewriteAndPruneGraph(g.get(), config, feed_remapping)); // Functionalize control flow. diff --git a/tensorflow/compiler/tf2xla/tf2xla_util.cc b/tensorflow/compiler/tf2xla/tf2xla_util.cc index 3e8b9eb79d8..e82546def46 100644 --- a/tensorflow/compiler/tf2xla/tf2xla_util.cc +++ b/tensorflow/compiler/tf2xla/tf2xla_util.cc @@ -765,7 +765,7 @@ Status PropagateConstIntoFunctionalNodes( for (Node* n : g->op_nodes()) { if (n->IsIfNode()) { TF_RETURN_IF_ERROR(PropagateConstIntoIfNode(g, n, lookup_fld, fld)); - } else if (n->type_string() == "While") { + } else if (n->IsWhileNode()) { TF_RETURN_IF_ERROR(PropagateConstIntoWhileNode(g, n, lookup_fld, fld)); } } @@ -796,7 +796,7 @@ Status RewriteTensorListWithConstElement(Graph* g, // Find the forward While op. std::vector fwd_while_edges; for (const Edge* e : n->out_edges()) { - if (!e->IsControlEdge() && e->dst()->type_string() == "While") { + if (!e->IsControlEdge() && e->dst()->IsWhileNode()) { fwd_while_edges.push_back(e); } } @@ -810,8 +810,7 @@ Status RewriteTensorListWithConstElement(Graph* g, int fwd_while_dst_input = fwd_while_edges[0]->dst_input(); std::vector bwd_while_edges; for (const Edge* e : fwd_while->out_edges()) { - if (e->src_output() == fwd_while_dst_input && - e->dst()->type_string() == "While") { + if (e->src_output() == fwd_while_dst_input && e->dst()->IsWhileNode()) { bwd_while_edges.push_back(e); } } diff --git a/tensorflow/compiler/tf2xla/xla_compilation_device.cc b/tensorflow/compiler/tf2xla/xla_compilation_device.cc index c14519c3ade..06423019f23 100644 --- a/tensorflow/compiler/tf2xla/xla_compilation_device.cc +++ b/tensorflow/compiler/tf2xla/xla_compilation_device.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include +#include "tensorflow/compiler/tf2xla/frontend_attributes_util.h" #include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/tf2xla/sharding_util.h" #include "tensorflow/compiler/tf2xla/xla_context.h" @@ -98,6 +99,20 @@ void XlaCompilationDevice::Compute(OpKernel* op_kernel, absl::optional op_sharding = sharding_parse_result.ValueOrDie(); + auto frontend_attributes_result = + GetFrontendAttributesFromAttrSlice(AttrSlice(op_kernel->def())); + OP_REQUIRES_OK(context, frontend_attributes_result.status()); + absl::optional attributes = + frontend_attributes_result.ValueOrDie(); + + xla::FrontendAttributes merged_attributes = b->frontend_attributes(); + if (attributes.has_value()) { + merged_attributes.mutable_map()->insert(attributes.value().map().begin(), + attributes.value().map().end()); + } + xla::XlaScopedFrontendAttributesAssignment assign_frontend_attributes( + b, std::move(merged_attributes)); + // If no sharding metadata is found, XLA is free to use whatever device it // wants. In practice this usually has the effect of placing things on device // 0. 
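The frontend-attribute handling added above merges the per-op attributes into whatever the builder already carries; because the merge goes through map-style insert, a key the builder has already set is kept rather than overwritten by the op-level value. A small std::map model of that merge (the key names are made up for illustration):

#include <iostream>
#include <map>
#include <string>

int main() {
  std::map<std::string, std::string> builder_attrs = {{"_scheduling_group", "a"}};
  std::map<std::string, std::string> op_attrs = {{"_scheduling_group", "b"},
                                                 {"_pipeline_stage", "1"}};
  // insert() only adds keys that are not already present, so the builder's value wins.
  builder_attrs.insert(op_attrs.begin(), op_attrs.end());
  for (const auto& kv : builder_attrs) {
    std::cout << kv.first << "=" << kv.second << "\n";
  }
  // Prints: _pipeline_stage=1 then _scheduling_group=a.
}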
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc index 2ee8c7e5cfb..cfb118281e4 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.cc +++ b/tensorflow/compiler/tf2xla/xla_compiler.cc @@ -21,6 +21,7 @@ limitations under the License. #include "absl/memory/memory.h" #include "absl/types/variant.h" #include "tensorflow/compiler/jit/flags.h" +#include "tensorflow/compiler/jit/shape_inference.h" #include "tensorflow/compiler/tf2xla/graph_compiler.h" #include "tensorflow/compiler/tf2xla/rearrange_function_argument.h" #include "tensorflow/compiler/tf2xla/shape_util.h" @@ -76,41 +77,38 @@ Status CheckSignature(const DataTypeVector& types, return Status::OK(); } -// Uses the _Arg and _Retval nodes in the graph to determine a core assignment -// for each argument and return value. -xla::StatusOr, std::map>> -ComputeArgAndRetvalCores(const Graph& graph) { - auto get_sharding_for_node = [](const Node* n) -> xla::StatusOr { +// Uses the _Arg and _Retval nodes in the graph to determine an OpSharding for +// each argument and return value. +xla::StatusOr< + std::pair, std::map>> +ComputeArgAndRetvalShardings(const Graph& graph) { + auto get_sharding_for_node = + [](const Node* n) -> xla::StatusOr> { TF_ASSIGN_OR_RETURN( auto sharding, ParseShardingFromDevice(*n, std::numeric_limits::max())); - if (sharding.has_value()) { - TF_RET_CHECK(sharding.value().type() == xla::OpSharding::MAXIMAL); - return sharding.value().tile_assignment_devices(0); - } else { - return -1; - } + return sharding; }; - std::map arg_cores; - std::map retval_cores; + std::map arg_shardings; + std::map retval_shardings; for (const Node* n : graph.nodes()) { if (n->IsArg()) { - TF_ASSIGN_OR_RETURN(int core, get_sharding_for_node(n)); - if (core < 0) continue; + TF_ASSIGN_OR_RETURN(auto sharding, get_sharding_for_node(n)); + if (!sharding.has_value()) continue; int index; TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "index", &index)); TF_RET_CHECK(index >= 0) << "Negative _Arg index"; - arg_cores[index] = core; + arg_shardings[index] = std::move(*sharding); } else if (n->IsRetval()) { - TF_ASSIGN_OR_RETURN(int core, get_sharding_for_node(n)); - if (core < 0) continue; + TF_ASSIGN_OR_RETURN(auto sharding, get_sharding_for_node(n)); + if (!sharding.has_value()) continue; int index; TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "index", &index)); TF_RET_CHECK(index >= 0) << "Negative _Retval index"; - retval_cores[index] = core; + retval_shardings[index] = std::move(*sharding); } } - return std::make_pair(std::move(arg_cores), std::move(retval_cores)); + return std::make_pair(std::move(arg_shardings), std::move(retval_shardings)); } Status ExecuteGraph(XlaContext* xla_context, std::unique_ptr graph, @@ -144,8 +142,8 @@ Status ExecuteGraph(XlaContext* xla_context, std::unique_ptr graph, // - `args` is the list of input arguments // - `retvals` is the list of retvals produced by _Retval operators, in index // order. -// - `args_core` and `retval_cores` are mapping from arg/return indices to core -// assignments. +// - `arg_shardings` and `retval_shardings` are mapping from arg/return indices +// to sharding. // - If `return_updated_values_for_all_resources` is true, all resources will be // included in `resource_updates`, regardless of whether their value changed. // - Sets `*num_nonconst_outputs` to the number of outputs of the `computation`. 
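ComputeArgAndRetvalShardings above generalizes the old per-core maps (int values, with -1 meaning "unspecified") to maps of full sharding protos keyed by argument or return-value index; absence from the map now plays the role of the old -1 sentinel. A minimal sketch of that lookup pattern with a stand-in Sharding type (the real code uses xla::OpSharding and absl::optional):

#include <iostream>
#include <map>
#include <string>

// Stand-in for xla::OpSharding; the real maps are std::map<int, xla::OpSharding>.
struct Sharding { std::string description; };

int main() {
  std::map<int, Sharding> arg_shardings = {{0, {"maximal, device 2"}}};
  for (int i = 0; i < 2; ++i) {
    auto it = arg_shardings.find(i);
    if (it == arg_shardings.end()) {
      // The old code signalled this with core == -1; now absence simply
      // means "no explicit sharding".
      std::cout << "arg " << i << ": no explicit sharding\n";
    } else {
      std::cout << "arg " << i << ": " << it->second.description << "\n";
    }
  }
}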
@@ -158,7 +156,8 @@ Status ExecuteGraph(XlaContext* xla_context, std::unique_ptr graph, Status BuildComputation( const std::vector& args, const std::vector& retvals, - const std::map& arg_cores, const std::map& retval_cores, + const std::map& arg_shardings, + const std::map& retval_shardings, const std::vector>& resources, std::unique_ptr token_output, const XlaCompiler::ShapeRepresentationFn& shape_representation_fn, @@ -212,19 +211,20 @@ Status BuildComputation( output.is_constant = false; TF_ASSIGN_OR_RETURN(output.shape, retval.GetShape()); xla::XlaOp value = retval.handle(); - auto it = retval_cores.find(i); + auto it = retval_shardings.find(i); xla::XlaScopedShardingAssignment assign_sharding( - builder, it == retval_cores.end() + builder, it == retval_shardings.end() ? absl::optional() - : xla::sharding_builder::AssignDevice(it->second)); + : it->second); if (shape_representation_fn) { // If there is a shape representation function, reshape the output // tensor to the shape given by the representation shape function. TF_ASSIGN_OR_RETURN(xla::Shape shape, shape_representation_fn( - output.shape, output.type)); + output.shape, output.type, + /*use_fast_memory=*/false)); value = xla::Reshape(value, xla::AsInt64Slice(shape.dimensions())); retval_index_and_layout.emplace_back(elems.size(), shape.layout()); - } else if (it != retval_cores.end()) { + } else if (it != retval_shardings.end()) { // Apply the sharding to the output, if there is a core assignment. value = identity_op(value); } @@ -265,8 +265,7 @@ Status BuildComputation( for (const XlaResource* resource : arg_resources) { DCHECK_LT(resource->arg_num(), args.size()); const XlaCompiler::Argument& arg = args[resource->arg_num()]; - auto it = arg_cores.find(resource->arg_num()); - const int core = it == arg_cores.end() ? -1 : it->second; + auto it = arg_shardings.find(resource->arg_num()); bool modified = !resource->value().IsIdenticalTo(resource->initial_value()); // TensorArray gradients were modified if their values changed or there are // any newly created gradients. @@ -289,8 +288,8 @@ Status BuildComputation( // Request that the value be returned on a specific core. xla::XlaScopedShardingAssignment assign_sharding( - builder, core == -1 ? absl::optional() - : xla::sharding_builder::AssignDevice(core)); + builder, it == arg_shardings.end() ? absl::optional() + : it->second); xla::XlaOp handle; TF_RETURN_IF_ERROR(resource->Pack(&handle, builder)); @@ -303,7 +302,8 @@ Status BuildComputation( if (shape_representation_fn) { TF_ASSIGN_OR_RETURN( xla::Shape xla_shape, - shape_representation_fn(resource->shape(), resource->type())); + shape_representation_fn(resource->shape(), resource->type(), + /*use_fast_memory=*/false)); representation_shape = xla_shape; } if (resource->representation_shape().has_value()) { @@ -479,8 +479,8 @@ XlaCompiler::XlaCompiler(XlaCompiler::Options options) // The default shape representation function is the identity. 
if (!options_.shape_representation_fn) { options_.shape_representation_fn = - [](const TensorShape& shape, - DataType dtype) -> xla::StatusOr { + [](const TensorShape& shape, DataType dtype, + bool use_fast_memory) -> xla::StatusOr { xla::Shape xla_shape; TF_RETURN_IF_ERROR(TensorShapeToXLAShape(dtype, shape, &xla_shape)); return xla_shape; @@ -532,6 +532,11 @@ Status XlaCompiler::FindFunctionBody(const NameAttrList& function, std::unique_ptr XlaCompiler::GetGraph(const FunctionBody* fbody) { std::unique_ptr graph(new Graph(options_.flib_def)); CopyGraph(*fbody->graph, graph.get()); + + // Performs a first function inlining pass before shape inference, since + // otherwise shape inference can't see inside functions and a comprehensive + // shape_map, including function ops, is needed to constant-propagate Shape + // Ops below. auto flags = GetBuildXlaOpsPassFlags(); OptimizerOptions opts; opts.set_opt_level(OptimizerOptions::L0); @@ -570,6 +575,28 @@ std::unique_ptr XlaCompiler::GetGraph(const FunctionBody* fbody) { optimizer.Optimize(flib_runtime_, flib_runtime_->env(), /*device=*/nullptr, &graph, graph_optimizer_options); + // Run shape inference on the graph and optimize the graph again. + GraphShapeInfo shape_info; + InferShapes(graph.get(), /*arg_shapes=*/{}, + flib_runtime_->GetFunctionLibraryDefinition(), &shape_info) + .IgnoreError(); + auto node_name_index = graph->BuildNodeNameIndex(); + std::unordered_map> shape_map; + for (const auto& node_shape_info : shape_info) { + const string& node_name = node_shape_info.first; + const std::vector& output_shapes = node_shape_info.second; + const auto& node_iter = node_name_index.find(node_name); + if (node_iter != node_name_index.end()) { + auto& partial_shapes = shape_map[node_name]; + for (const auto& inferred_shape : output_shapes) { + partial_shapes.push_back(inferred_shape.shape); + } + } + } + graph_optimizer_options.shape_map = &shape_map; + optimizer.Optimize(flib_runtime_, flib_runtime_->env(), + /*device=*/nullptr, &graph, graph_optimizer_options); + return graph; } @@ -596,6 +623,33 @@ Status XlaCompiler::CompileFunction( CheckSignature(fbody->arg_types, args), "Signature check failure while compiling: ", fn_name_attrs.name()); + // Set shapes for _Arg nodes. They are useful for constant folding (e.g. an + // Xla op requires a compile-time constant input, and that input is shape of + // an _Arg node. + for (int i = 0; i < args.size(); i++) { + // Skip resource variables and tensor lists. + DataType dtype; + TF_RETURN_IF_ERROR(GetNodeAttr(fbody->arg_nodes[i]->def(), "T", &dtype)); + if (dtype == DT_RESOURCE || dtype == DT_VARIANT) { + continue; + } + + if (absl::holds_alternative(args[i].shape)) { + xla::Shape xla_shape = absl::get(args[i].shape); + TensorShape tensor_shape; + if (XLAShapeToTensorShape(xla_shape, &tensor_shape).ok()) { + fbody->arg_nodes[i]->ClearAttr("_output_shapes"); + fbody->arg_nodes[i]->AddAttr("_output_shapes", + std::vector{tensor_shape}); + } + } else { + TensorShape tensor_shape = absl::get(args[i].shape); + fbody->arg_nodes[i]->ClearAttr("_output_shapes"); + fbody->arg_nodes[i]->AddAttr("_output_shapes", + std::vector{tensor_shape}); + } + } + std::unique_ptr graph = GetGraph(fbody); // Clear the "_kernel" attribute if it is set to "host". 
This is used to @@ -604,7 +658,7 @@ Status XlaCompiler::CompileFunction( const char* const kKernelAttr = "_kernel"; for (Node* n : graph->nodes()) { string value; - if (GetNodeAttrSimple(n->attrs(), kKernelAttr, &value) && value == "host") { + if (TryGetNodeAttr(n->attrs(), kKernelAttr, &value) && value == "host") { n->ClearAttr(kKernelAttr); } } @@ -659,8 +713,9 @@ Status XlaCompiler::XLAShapeForArgument(const XlaCompiler::Argument& arg, TF_RETURN_IF_ERROR( XLAShapeToTensorShape(absl::get(arg.shape), &shape)); } - TF_ASSIGN_OR_RETURN(*xla_shape, - options_.shape_representation_fn(shape, arg.type)); + TF_ASSIGN_OR_RETURN(*xla_shape, options_.shape_representation_fn( + shape, arg.type, + /*use_fast_memory=*/false)); } else { if (absl::holds_alternative(arg.shape)) { *xla_shape = absl::get(arg.shape); @@ -684,7 +739,8 @@ Status XlaCompiler::XLAShapeForArgument(const XlaCompiler::Argument& arg, TF_RET_CHECK(absl::holds_alternative(arg.shape)); TF_ASSIGN_OR_RETURN(*xla_shape, options_.shape_representation_fn( - absl::get(arg.shape), arg.type)); + absl::get(arg.shape), arg.type, + /*use_fast_memory=*/false)); return Status::OK(); } @@ -742,7 +798,7 @@ Status XlaCompiler::XLAShapeForArgument(const XlaCompiler::Argument& arg, Status XlaCompiler::BuildArguments( const Graph& graph, const std::vector& args, bool use_tuple_arg, xla::XlaBuilder* builder, XlaContext* context, - const std::map& arg_cores, + const std::map& arg_shardings, std::vector* arg_expressions, std::vector* input_to_args, std::vector* input_shapes, bool is_entry_computation) { @@ -833,10 +889,10 @@ Status XlaCompiler::BuildArguments( xla::OpSharding tuple_sharding; tuple_sharding.set_type(xla::OpSharding::TUPLE); for (int64 parameter : *input_to_args) { - auto it = arg_cores.find(parameter); - const int core = it == arg_cores.end() ? 0 : it->second; + auto it = arg_shardings.find(parameter); *tuple_sharding.add_tuple_shardings() = - xla::sharding_builder::AssignDevice(core); + it == arg_shardings.end() ? xla::sharding_builder::AssignDevice(0) + : it->second; } std::vector is_same_across_replicas; for (int i = 0; i < input_to_args->size(); ++i) { @@ -867,20 +923,18 @@ Status XlaCompiler::BuildArguments( } for (std::vector::size_type i = 0; i < input_to_args->size(); ++i) { - auto it = arg_cores.find(i); - const int core = it == arg_cores.end() ? -1 : it->second; + auto it = arg_shardings.find(i); xla::XlaScopedShardingAssignment assign_sharding( - builder, core == -1 ? absl::optional() - : xla::sharding_builder::AssignDevice(core)); + builder, it == arg_shardings.end() ? absl::optional() + : it->second); arg_handles[i] = xla::GetTupleElement(tuple, i); } } else { for (std::vector::size_type i = 0; i < input_to_args->size(); ++i) { - auto it = arg_cores.find(i); - const int core = it == arg_cores.end() ? -1 : it->second; + auto it = arg_shardings.find(i); xla::XlaScopedShardingAssignment assign_sharding( - builder, core == -1 ? absl::optional() - : xla::sharding_builder::AssignDevice(core)); + builder, it == arg_shardings.end() ? absl::optional() + : it->second); if (is_entry_computation) { // Add an entry to is_same_across_replicas for every leaf buffer. 
std::vector is_same_across_replicas( @@ -1155,16 +1209,16 @@ Status XlaCompiler::CompileGraph( real_args.push_back(token_arg); } - std::map arg_cores; - std::map retval_cores; - TF_ASSIGN_OR_RETURN(std::tie(arg_cores, retval_cores), - ComputeArgAndRetvalCores(*graph)); + std::map arg_shardings; + std::map retval_shardings; + TF_ASSIGN_OR_RETURN(std::tie(arg_shardings, retval_shardings), + ComputeArgAndRetvalShardings(*graph)); std::vector arg_expressions; TF_RETURN_IF_ERROR(BuildArguments( - *graph, real_args, options.use_tuple_arg, &builder, context, arg_cores, - &arg_expressions, &result->input_mapping, &result->xla_input_shapes, - options.is_entry_computation)); + *graph, real_args, options.use_tuple_arg, &builder, context, + arg_shardings, &arg_expressions, &result->input_mapping, + &result->xla_input_shapes, options.is_entry_computation)); context->set_args(std::move(arg_expressions)); // Propagate any aliases given to us by the user. @@ -1233,7 +1287,7 @@ Status XlaCompiler::CompileGraph( ConvertConstantsToExpressions(&builder, absl::Span(retvals)); } TF_RETURN_IF_ERROR(BuildComputation( - real_args, retvals, arg_cores, retval_cores, context->resources(), + real_args, retvals, arg_shardings, retval_shardings, context->resources(), std::move(token_output), options.is_entry_computation ? options_.shape_representation_fn : ShapeRepresentationFn{}, diff --git a/tensorflow/compiler/tf2xla/xla_compiler.h b/tensorflow/compiler/tf2xla/xla_compiler.h index 1cc5d8d4728..98c487c9973 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.h +++ b/tensorflow/compiler/tf2xla/xla_compiler.h @@ -286,7 +286,8 @@ class XlaCompiler { std::shared_ptr computation; }; - typedef std::function(const TensorShape&, DataType)> + typedef std::function(const TensorShape&, DataType, + bool)> ShapeRepresentationFn; struct Options { // Name of the compilation device to use. It must be set by the caller. @@ -446,7 +447,7 @@ class XlaCompiler { const std::vector& args, bool use_tuple_arg, xla::XlaBuilder* builder, XlaContext* context, - const std::map& arg_cores, + const std::map& arg_shardings, std::vector* arg_expressions, std::vector* input_to_args, std::vector* input_shapes, diff --git a/tensorflow/compiler/tf2xla/xla_compiler_test.cc b/tensorflow/compiler/tf2xla/xla_compiler_test.cc index 34b785754b9..4413625dc3c 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler_test.cc +++ b/tensorflow/compiler/tf2xla/xla_compiler_test.cc @@ -304,7 +304,8 @@ TEST_F(XlaCompilerTest, HonorShapeRepresentationFnForUnwrittenResource) { auto options = DefaultOptions(); options.shape_representation_fn = - [](const TensorShape& shape, DataType dt) -> xla::StatusOr { + [](const TensorShape& shape, DataType dt, + bool use_fast_memory) -> xla::StatusOr { xla::Shape xla_shape; TF_RETURN_IF_ERROR(TensorShapeToXLAShape(dt, shape, &xla_shape)); *xla_shape.mutable_layout() = xla::LayoutUtil::MakeLayout({0, 1}); @@ -357,7 +358,8 @@ TEST_F(XlaCompilerTest, HonorShapeRepresentationFnForRetVal) { auto options = DefaultOptions(); options.shape_representation_fn = - [](const TensorShape& shape, DataType dt) -> xla::StatusOr { + [](const TensorShape& shape, DataType dt, + bool use_fast_memory) -> xla::StatusOr { xla::Shape xla_shape; TF_RETURN_IF_ERROR(TensorShapeToXLAShape(dt, shape, &xla_shape)); *xla_shape.mutable_layout() = xla::LayoutUtil::MakeLayout({0, 1}); @@ -1080,7 +1082,8 @@ TEST_F(XlaCompilerTest, ResultLayoutSingle) { auto options = DefaultOptions(); // Sets the representation function to return a non-default layout. 
options.shape_representation_fn = - [](const TensorShape& shape, DataType type) -> xla::StatusOr { + [](const TensorShape& shape, DataType type, + bool use_fast_memory) -> xla::StatusOr { xla::Shape xla_shape; TF_RETURN_IF_ERROR(TensorShapeToXLAShape(type, shape, &xla_shape)); *xla_shape.mutable_layout() = xla::LayoutUtil::MakeLayout({0, 1}); @@ -1118,7 +1121,8 @@ TEST_F(XlaCompilerTest, ResultLayoutMultiple) { auto options = DefaultOptions(); // Sets the representation function to return a non-default layout. options.shape_representation_fn = - [](const TensorShape& shape, DataType type) -> xla::StatusOr { + [](const TensorShape& shape, DataType type, + bool use_fast_memory) -> xla::StatusOr { xla::Shape xla_shape; TF_RETURN_IF_ERROR(TensorShapeToXLAShape(type, shape, &xla_shape)); *xla_shape.mutable_layout() = xla::LayoutUtil::MakeLayout({0, 1}); @@ -1252,7 +1256,8 @@ TEST_F(XlaCompilerTest, VariableRepresentationShapeFunction) { // Compiles the graph. XlaCompiler::Options options = DefaultOptions(); options.shape_representation_fn = - [](const TensorShape& shape, DataType type) -> xla::StatusOr { + [](const TensorShape& shape, DataType type, + bool use_fast_memory) -> xla::StatusOr { xla::PrimitiveType ptype; TF_RETURN_IF_ERROR(DataTypeToPrimitiveType(type, &ptype)); return xla::ShapeUtil::MakeShape(ptype, {shape.num_elements()}); @@ -1322,7 +1327,8 @@ TEST_F(XlaCompilerTest, ArgRetvalShapeRepresentationFunction) { // Compiles the graph. XlaCompiler::Options options = DefaultOptions(); options.shape_representation_fn = - [](const TensorShape& shape, DataType type) -> xla::StatusOr { + [](const TensorShape& shape, DataType type, + bool use_fast_memory) -> xla::StatusOr { xla::PrimitiveType ptype; TF_RETURN_IF_ERROR(DataTypeToPrimitiveType(type, &ptype)); return xla::ShapeUtil::MakeShape(ptype, {shape.num_elements()}); diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.cc b/tensorflow/compiler/tf2xla/xla_op_kernel.cc index 6996e39ba16..c95cd4e5475 100644 --- a/tensorflow/compiler/tf2xla/xla_op_kernel.cc +++ b/tensorflow/compiler/tf2xla/xla_op_kernel.cc @@ -415,7 +415,8 @@ Status ReadVariableInputTensor(const Tensor& tensor, DataType type, TF_ASSIGN_OR_RETURN(xla::Shape representation_shape, ctx->compiler()->options().shape_representation_fn( - variable->shape(), variable->type())); + variable->shape(), variable->type(), + /*use_fast_memory=*/false)); xla::Shape xla_shape; TF_RETURN_IF_ERROR( TensorShapeToXLAShape(variable->type(), variable->shape(), &xla_shape)); @@ -550,9 +551,10 @@ Status AssignVariableTensor(const Tensor& tensor, DataType type, TF_RETURN_IF_ERROR(variable->SetTypeAndShape(type, shape)); - TF_ASSIGN_OR_RETURN( - xla::Shape representation_shape, - ctx->compiler()->options().shape_representation_fn(shape, type)); + TF_ASSIGN_OR_RETURN(xla::Shape representation_shape, + ctx->compiler()->options().shape_representation_fn( + shape, type, + /*use_fast_memory=*/false)); xla::Shape xla_shape; TF_RETURN_IF_ERROR(TensorShapeToXLAShape(type, shape, &xla_shape)); if (!xla::ShapeUtil::Compatible(xla_shape, representation_shape)) { diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.h b/tensorflow/compiler/tf2xla/xla_op_registry.h index b11e43a74d0..fa51753aa45 100644 --- a/tensorflow/compiler/tf2xla/xla_op_registry.h +++ b/tensorflow/compiler/tf2xla/xla_op_registry.h @@ -47,19 +47,20 @@ extern const char* const DEVICE_XLA_GPU; constexpr std::array kFloatTypes = { {DT_HALF, DT_FLOAT, DT_DOUBLE, DT_BFLOAT16}}; -constexpr std::array kNumericTypes = { - {DT_UINT8, DT_UINT32, 
DT_UINT64, DT_INT8, DT_INT32, DT_INT64, DT_HALF, - DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128, DT_BFLOAT16}}; +constexpr std::array kNumericTypes = { + {DT_UINT8, DT_UINT16, DT_UINT32, DT_UINT64, DT_INT8, DT_INT16, DT_INT32, + DT_INT64, DT_HALF, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128, + DT_BFLOAT16}}; -constexpr std::array kCpuAllTypes = { - {DT_UINT8, DT_QUINT8, DT_UINT32, DT_UINT64, DT_INT8, DT_QINT8, DT_INT32, - DT_QINT32, DT_INT64, DT_HALF, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, - DT_COMPLEX128, DT_BOOL, DT_BFLOAT16}}; +constexpr std::array kCpuAllTypes = { + {DT_UINT8, DT_QUINT8, DT_UINT16, DT_UINT32, DT_UINT64, DT_INT8, DT_QINT8, + DT_INT16, DT_INT32, DT_QINT32, DT_INT64, DT_HALF, DT_FLOAT, DT_DOUBLE, + DT_COMPLEX64, DT_COMPLEX128, DT_BOOL, DT_BFLOAT16}}; -constexpr std::array kGpuAllTypes = { - {DT_UINT8, DT_QUINT8, DT_UINT32, DT_UINT64, DT_INT8, DT_QINT8, DT_INT32, - DT_QINT32, DT_INT64, DT_HALF, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, - DT_COMPLEX128, DT_BOOL, DT_BFLOAT16}}; +constexpr std::array kGpuAllTypes = { + {DT_UINT8, DT_QUINT8, DT_UINT16, DT_UINT32, DT_UINT64, DT_INT8, DT_QINT8, + DT_INT16, DT_INT32, DT_QINT32, DT_INT64, DT_HALF, DT_FLOAT, DT_DOUBLE, + DT_COMPLEX64, DT_COMPLEX128, DT_BOOL, DT_BFLOAT16}}; // Class that manages registrations of operators and devices for the XLA JIT. // Not thread-safe. diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD index eeb598b165b..9066fb7e1e3 100644 --- a/tensorflow/compiler/xla/BUILD +++ b/tensorflow/compiler/xla/BUILD @@ -1,7 +1,7 @@ load("//tensorflow:tensorflow.bzl", "tf_cc_test", "cc_header_only_library") load("//tensorflow/compiler/xla:xla.bzl", "xla_proto_library") load( - "//tensorflow/core:platform/default/build_config.bzl", + "//tensorflow/core/platform:default/build_config.bzl", "tf_proto_library_py", ) @@ -12,6 +12,7 @@ package( package_group( name = "friends", + includes = ["//tensorflow:internal"], packages = [ "//tensorflow/compiler/...", "//tensorflow/contrib/tpu/...", @@ -62,6 +63,7 @@ cc_library( hdrs = ["bit_cast.h"], visibility = [":friends"], deps = [ + ":types", "//tensorflow/core:lib", "//third_party/eigen3", "@com_google_absl//absl/base", diff --git a/tensorflow/compiler/xla/README.md b/tensorflow/compiler/xla/README.md index f9c93707f7a..029a2e0081f 100644 --- a/tensorflow/compiler/xla/README.md +++ b/tensorflow/compiler/xla/README.md @@ -3,4 +3,5 @@
XLA (Accelerated Linear Algebra) is a domain-specific compiler for linear -algebra that optimizes TensorFlow computations. See the [documentation](./g3doc/overview.md). +algebra that optimizes TensorFlow computations. See the +[documentation](./g3doc/index.md). diff --git a/tensorflow/compiler/xla/bit_cast.h b/tensorflow/compiler/xla/bit_cast.h index c9edd7417eb..90e9a5c25dd 100644 --- a/tensorflow/compiler/xla/bit_cast.h +++ b/tensorflow/compiler/xla/bit_cast.h @@ -28,6 +28,7 @@ limitations under the License. #include "absl/base/casts.h" #include "third_party/eigen3/Eigen/Core" +#include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/lib/bfloat16/bfloat16.h" #include "tensorflow/core/platform/types.h" diff --git a/tensorflow/compiler/xla/client/lib/BUILD b/tensorflow/compiler/xla/client/lib/BUILD index acf59c47f3c..b46d04dc328 100644 --- a/tensorflow/compiler/xla/client/lib/BUILD +++ b/tensorflow/compiler/xla/client/lib/BUILD @@ -296,6 +296,8 @@ cc_library( srcs = ["slicing.cc"], hdrs = ["slicing.h"], deps = [ + ":arithmetic", + ":constants", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla/client:xla_builder", diff --git a/tensorflow/compiler/xla/client/lib/constants.h b/tensorflow/compiler/xla/client/lib/constants.h index 03ebe4e0098..203b67082bd 100644 --- a/tensorflow/compiler/xla/client/lib/constants.h +++ b/tensorflow/compiler/xla/client/lib/constants.h @@ -62,12 +62,16 @@ XlaOp ConstantR0WithType(XlaBuilder* builder, PrimitiveType type, T value) { return ConstantR0(builder, static_cast(value)); case U8: return ConstantR0(builder, static_cast(value)); + case U16: + return ConstantR0(builder, static_cast(value)); case U32: return ConstantR0(builder, static_cast(value)); case U64: return ConstantR0(builder, static_cast(value)); case S8: return ConstantR0(builder, static_cast(value)); + case S16: + return ConstantR0(builder, static_cast(value)); case S32: return ConstantR0(builder, static_cast(value)); case S64: diff --git a/tensorflow/compiler/xla/client/lib/math.cc b/tensorflow/compiler/xla/client/lib/math.cc index 3d15101ea66..ad525e69289 100644 --- a/tensorflow/compiler/xla/client/lib/math.cc +++ b/tensorflow/compiler/xla/client/lib/math.cc @@ -13,12 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include "tensorflow/compiler/xla/client/lib/math.h" + // This macro is required to make MSVC defines math constants in math.h #define _USE_MATH_DEFINES #include -#include "tensorflow/compiler/xla/client/lib/math.h" - #include "tensorflow/compiler/xla/client/lib/arithmetic.h" #include "tensorflow/compiler/xla/client/lib/constants.h" #include "tensorflow/compiler/xla/primitive_util.h" @@ -26,6 +26,21 @@ limitations under the License. #include "tensorflow/compiler/xla/status_macros.h" namespace xla { +namespace { + +// Evaluate the polynomial given `x` and coefficients in decreasing order. 
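The templated EvaluatePolynomial helper that follows is Horner's rule, with coefficients supplied from highest degree down to the constant term. An XLA-free illustration of the same evaluation order:

#include <iostream>
#include <vector>

// Horner's rule: coefficients ordered from highest degree to constant term.
double EvaluatePolynomial(double x, const std::vector<double>& coefficients) {
  double poly = 0.0;
  for (double c : coefficients) {
    poly = poly * x + c;
  }
  return poly;
}

int main() {
  // 2*x^2 + 3*x + 4 at x = 2 evaluates to 18.
  std::cout << EvaluatePolynomial(2.0, {2.0, 3.0, 4.0}) << "\n";
}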
+template +XlaOp EvaluatePolynomial(XlaOp x, absl::Span coefficients) { + static_assert(std::is_floating_point::value, + "Template-argument 'FP' must be a floating-point type"); + XlaOp poly = ScalarLike(x, 0.0); + for (FP c : coefficients) { + poly = poly * x + ScalarLike(x, c); + } + return poly; +} + +} // namespace // Returns operation(operand), except if `operand` is one of the types in // upcast_types, in which case first converts it to F32, and then converts the @@ -134,88 +149,132 @@ XlaOp Square(XlaOp operand) { return operand * operand; } XlaOp Reciprocal(XlaOp operand) { return ScalarLike(operand, 1.0) / operand; } -// Evaluate the polynomial given coefficients and `x`. -// N.B. Coefficients should be supplied in decreasing order. -XlaOp EvaluatePolynomial(XlaOp x, absl::Span coefficients) { - XlaOp poly = ScalarLike(x, 0.0); - for (float c : coefficients) { - poly = poly * x + ScalarLike(x, c); - } - return poly; -} - // Computes an approximation of the error function complement (1 - erf(x)). // // Precondition: abs(x) >= 1. Otherwise, use ErfImpl. // -// This follows Cephes's f32 implementation of erfc, and so it may have errors -// for double precision. -// -// See also these alternate implementations of erf and erfc: -// -// https://stackoverflow.com/questions/35148198 -// https://stackoverflow.com/questions/35966695 -// -static XlaOp ErfcImpl(XlaOp x) { +// This follows Cephes's f32 implementation of erfc. +static XlaOp ErfcImpl32(XlaOp x) { // Coefficients for erfc(f32), from Cephes. - // - // erfc(x) = exp(-x^2) P(1/x), 1 < x < 2 - static std::array kErfcPCoefficient{ + const double kMaxlog = 88.72283905206835; + // erfc(x) = exp(-x^2) P(1/x^2), 1 < x < 2 + static const std::array kErfcPCoefficient{ +2.326819970068386E-2, -1.387039388740657E-1, +3.687424674597105E-1, -5.824733027278666E-1, +6.210004621745983E-1, -4.944515323274145E-1, +3.404879937665872E-1, -2.741127028184656E-1, +5.638259427386472E-1, }; - // erfc(x) = exp(-x^2) 1/x P(1/x^2), 2 < x < 14 - static std::array kErfcRCoefficient{ + // erfc(x) = exp(-x^2) R(1/x^2), 2 <= x < kMaxlog + static const std::array kErfcRCoefficient{ -1.047766399936249E+1, +1.297719955372516E+1, -7.495518717768503E+0, +2.921019019210786E+0, -1.015265279202700E+0, +4.218463358204948E-1, -2.820767439740514E-1, +5.641895067754075E-1, }; - XlaOp abs_x = Abs(x); XlaOp z = Exp(-x * x); XlaOp q = ScalarLike(x, 1) / abs_x; XlaOp y = q * q; XlaOp p = Select(Lt(abs_x, ScalarLike(x, 2.0)), - EvaluatePolynomial(y, kErfcPCoefficient), - EvaluatePolynomial(y, kErfcRCoefficient)); + EvaluatePolynomial(y, kErfcPCoefficient), + EvaluatePolynomial(y, kErfcRCoefficient)); y = z * q * p; - return Select(Lt(x, ScalarLike(x, 0)), ScalarLike(x, 2.0) - y, y); + XlaOp y_clamp = Select(Lt(z, ScalarLike(x, -kMaxlog)), ScalarLike(x, 0), y); + return Select(Lt(x, ScalarLike(x, 0)), ScalarLike(x, 2.0) - y_clamp, y_clamp); } // Compute a polynomial approximation of the error function. // // Precondition: abs(x) <= 1. Otherwise, use ErfcImpl. // -// This follows Cephes's f32 implementation of erf, so it may have errors for -// double precision. -static XlaOp ErfImpl(XlaOp x) { +// This follows Cephes's f32 implementation of erf. +static XlaOp ErfImpl32(XlaOp x) { // Coefficients for by erf(f32), from Cephes. 
// // erf(x) = x P(x^2), 0 < x < 1 - static std::array kErfTCoefficient{ + static const std::array kErfTCoefficient{ +7.853861353153693E-5, -8.010193625184903E-4, +5.188327685732524E-3, -2.685381193529856E-2, +1.128358514861418E-1, -3.761262582423300E-1, +1.128379165726710E+0, }; + return x * EvaluatePolynomial(x * x, kErfTCoefficient); +} - return x * EvaluatePolynomial(x * x, kErfTCoefficient); +static XlaOp ErfcImpl64(XlaOp x) { + // Coefficients for erfc(f64), from Cephes. + const double kMaxlog = 7.09782712893383996843E2; + // erfc(x) = exp(-x^2) P(|x|) / Q(|x|), 1 < x < 8 + static const std::array kErfcPCoefficient{ + 2.46196981473530512524E-10, 5.64189564831068821977E-1, + 7.46321056442269912687E0, 4.86371970985681366614E1, + 1.96520832956077098242E2, 5.26445194995477358631E2, + 9.34528527171957607540E2, 1.02755188689515710272E3, + 5.57535335369399327526E2}; + static const std::array kErfcQCoefficient{ + 1.00000000000000000000E0, 1.32281951154744992508E1, + 8.67072140885989742329E1, 3.54937778887819891062E2, + 9.75708501743205489753E2, 1.82390916687909736289E3, + 2.24633760818710981792E3, 1.65666309194161350182E3, + 5.57535340817727675546E2}; + + // erfc(x) = exp(-x^2) R(|x|) / S(|x|), 8 <= x < kMaxlog + static const std::array kErfcRCoefficient{ + 5.64189583547755073984E-1, 1.27536670759978104416E0, + 5.01905042251180477414E0, 6.16021097993053585195E0, + 7.40974269950448939160E0, 2.97886665372100240670E0}; + static const std::array kErfcSCoefficient{ + 1.00000000000000000000E0, 2.26052863220117276590E0, + 9.39603524938001434673E0, 1.20489539808096656605E1, + 1.70814450747565897222E1, 9.60896809063285878198E0, + 3.36907645100081516050E0}; + + XlaOp z = -x * x; + XlaOp abs_x = Abs(x); + XlaOp y = + Select(Lt(abs_x, ScalarLike(x, 8.0)), + Exp(z) * EvaluatePolynomial(abs_x, kErfcPCoefficient) / + EvaluatePolynomial(abs_x, kErfcQCoefficient), + Exp(z) * EvaluatePolynomial(abs_x, kErfcRCoefficient) / + EvaluatePolynomial(abs_x, kErfcSCoefficient)); + XlaOp y_clamp = Select(Lt(z, ScalarLike(x, -kMaxlog)), ScalarLike(x, 0), y); + return Select(Lt(x, ScalarLike(x, 0)), ScalarLike(x, 2.0) - y_clamp, y_clamp); +} + +// Compute a polynomial approximation of the error function. +// +// Precondition: abs(x) <= 1. Otherwise, use ErfcImpl. +static XlaOp ErfImpl64(XlaOp x) { + // Coefficients for by erf(f64), from Cephes. + // + // erf(x) = x T(x^2) / U(x^2), 0 < x < 1 + static std::array kErfTCoefficient{ + 9.60497373987051638749E0, 9.00260197203842689217E1, + 2.23200534594684319226E3, 7.00332514112805075473E3, + 5.55923013010394962768E4}; + static std::array kErfUCoefficient{ + 1.00000000000000000000E0, 3.35617141647503099647E1, + 5.21357949780152679795E2, 4.59432382970980127987E3, + 2.26290000613890934246E4, 4.92673942608635921086E4}; + XlaOp z = x * x; + return x * EvaluatePolynomial(z, kErfTCoefficient) / + EvaluatePolynomial(z, kErfUCoefficient); } XlaOp Erfc(XlaOp x) { auto& b = *x.builder(); return b.ReportErrorOrReturn([&]() -> StatusOr { TF_RETURN_IF_ERROR(EnsureOperandIsRealFp("Erfc", x)); - + TF_ASSIGN_OR_RETURN(auto shape, b.GetShape(x)); // erfc(x) = // erfc_impl(x) if x > 1 // 1 - erf_impl(x) otherwise - // + if (shape.element_type() == F64) { + return Select(Gt(Abs(x), ScalarLike(x, 1)), ErfcImpl64(x), + ScalarLike(x, 1) - ErfImpl64(x)); + } // Erf(c)Impl don't have enough precision when run with bf16 intermediates // (not surprising!), so upcast to f32 in this case. 
- return DoWithUpcastToF32(x, {BF16}, [](XlaOp x) { - return Select(Gt(Abs(x), ScalarLike(x, 1)), ErfcImpl(x), - ScalarLike(x, 1) - ErfImpl(x)); + return DoWithUpcastToF32(x, {BF16, F16}, [](XlaOp x) { + return Select(Gt(Abs(x), ScalarLike(x, 1)), ErfcImpl32(x), + ScalarLike(x, 1) - ErfImpl32(x)); }); }); } @@ -224,15 +283,19 @@ XlaOp Erf(XlaOp x) { auto& b = *x.builder(); return b.ReportErrorOrReturn([&]() -> StatusOr { TF_RETURN_IF_ERROR(EnsureOperandIsRealFp("Erf", x)); + TF_ASSIGN_OR_RETURN(auto shape, b.GetShape(x)); // erf(x) = // erf_impl(x) if x < 1 // 1 - erfc_impl(x) otherwise - // + if (shape.element_type() == F64) { + return Select(Lt(Abs(x), ScalarLike(x, 1)), ErfImpl64(x), + ScalarLike(x, 1) - ErfcImpl64(x)); + } // Erf(c)Impl don't have enough precision when run with bf16 intermediates // (not surprising!), so upcast to f32 in this case. - return DoWithUpcastToF32(x, {BF16}, [](XlaOp x) { - return Select(Lt(Abs(x), ScalarLike(x, 1)), ErfImpl(x), - ScalarLike(x, 1) - ErfcImpl(x)); + return DoWithUpcastToF32(x, {BF16, F16}, [](XlaOp x) { + return Select(Lt(Abs(x), ScalarLike(x, 1)), ErfImpl32(x), + ScalarLike(x, 1) - ErfcImpl32(x)); }); }); } diff --git a/tensorflow/compiler/xla/client/lib/math.h b/tensorflow/compiler/xla/client/lib/math.h index 89a58aa3970..57e50e56fa7 100644 --- a/tensorflow/compiler/xla/client/lib/math.h +++ b/tensorflow/compiler/xla/client/lib/math.h @@ -43,10 +43,6 @@ XlaOp Square(XlaOp operand); // Computes the reciprocal of 'operand'. XlaOp Reciprocal(XlaOp operand); -// Evaluates a polynomial given coefficients and 'x'. -// N.B. Coefficients should be supplied in decreasing order. -XlaOp EvaluatePolynomial(XlaOp x, absl::Span coefficients); - // Computes an approximation of the error function complement (1 - erf(x)). XlaOp Erfc(XlaOp x); diff --git a/tensorflow/compiler/xla/client/lib/slicing.cc b/tensorflow/compiler/xla/client/lib/slicing.cc index d4bc560b03f..f10342a8bf8 100644 --- a/tensorflow/compiler/xla/client/lib/slicing.cc +++ b/tensorflow/compiler/xla/client/lib/slicing.cc @@ -17,6 +17,8 @@ limitations under the License. 
#include +#include "tensorflow/compiler/xla/client/lib/arithmetic.h" +#include "tensorflow/compiler/xla/client/lib/constants.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/util.h" @@ -138,18 +140,54 @@ XlaOp DynamicUpdateSliceInMinorDims(XlaOp x, XlaOp update, }); } -XlaOp TorchGather(XlaOp input, XlaOp index, int64 dim) { +XlaOp TorchGather(XlaOp input, XlaOp index, int64 dim, bool sparse) { XlaBuilder* builder = input.builder(); return builder->ReportErrorOrReturn([&]() -> StatusOr { TF_ASSIGN_OR_RETURN(Shape index_shape, builder->GetShape(index)); - ShapeUtil::AppendMajorDimension(1, &index_shape); - std::vector to_concat; TF_ASSIGN_OR_RETURN(Shape input_shape, builder->GetShape(input)); if (ShapeUtil::ElementHasBitWidth(index_shape, 64) && input_shape.dimensions(dim) < std::numeric_limits::max()) { index = ConvertElementType(index, U32); index_shape.set_element_type(U32); } + if (index_shape.rank() == 1) { + return TorchIndexSelect(input, index, 0); + } + if (!sparse) { + std::vector index_broacast_dims; + std::vector input_broacast_dims; + std::vector sizes; + for (int64 i = 0; i < index_shape.rank(); ++i) { + if (i < dim) { + input_broacast_dims.push_back(i); + index_broacast_dims.push_back(i); + } else if (i == dim) { + sizes.push_back(input_shape.dimensions(i)); + input_broacast_dims.push_back(i); + index_broacast_dims.push_back(i + 1); + } else { + input_broacast_dims.push_back(i + 1); + index_broacast_dims.push_back(i + 1); + } + sizes.push_back(index_shape.dimensions(i)); + } + auto mask = Eq( + BroadcastInDim(index, sizes, index_broacast_dims), + Iota(builder, ShapeUtil::MakeShape(index_shape.element_type(), sizes), + dim)); + auto masked_input = Select( + mask, BroadcastInDim(input, sizes, input_broacast_dims), + Zeros(builder, + ShapeUtil::MakeShape(input_shape.element_type(), sizes))); + return Reduce(masked_input, Zero(builder, input_shape.element_type()), + CreateScalarIdentityWithZeroComputation( + input_shape.element_type(), builder), + {dim}); + } + + ShapeUtil::AppendMajorDimension(1, &index_shape); + std::vector to_concat; + to_concat.reserve(input_shape.rank()); for (int64 i = 0; i < input_shape.rank(); ++i) { if (i == dim) { diff --git a/tensorflow/compiler/xla/client/lib/slicing.h b/tensorflow/compiler/xla/client/lib/slicing.h index 89ec1fe510e..9a59a048b9f 100644 --- a/tensorflow/compiler/xla/client/lib/slicing.h +++ b/tensorflow/compiler/xla/client/lib/slicing.h @@ -55,7 +55,7 @@ XlaOp DynamicUpdateSliceInMinorDims(XlaOp x, XlaOp update, // [X0,X1,X2,..XN] and dim = i `index` must be an n-dimensional tensor with size // [X0,X1,...Y,Xi+1,...,X[N] where y >= 1 and `out` will have the same sizes as // `index`. -XlaOp TorchGather(XlaOp input, XlaOp index, int64 dim); +XlaOp TorchGather(XlaOp input, XlaOp index, int64 dim, bool sparse = true); // Returns a new tensor which indexes the input tensor along dimension dim using // the entries in index. 
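The new dense (sparse = false) path of TorchGather above avoids an explicit gather: it broadcasts the index tensor, compares it against an iota along `dim` to build a one-hot mask, masks the broadcast input, and reduces along `dim` so only the selected entry survives. A small plain-C++ model of the resulting semantics along dim = 1 (data chosen to mirror the unit test added below):

#include <iostream>
#include <vector>

int main() {
  // out[i][j] = input[i][index[i][j]] for dim == 1.
  std::vector<std::vector<int>> input = {{1, 2}, {3, 4}};
  std::vector<std::vector<int>> index = {{0, 0}, {1, 0}};
  for (size_t i = 0; i < index.size(); ++i) {
    for (size_t j = 0; j < index[i].size(); ++j) {
      int out = 0;
      // One-hot mask over the gathered dimension, then reduce-sum.
      for (size_t k = 0; k < input[i].size(); ++k) {
        const bool mask = (static_cast<size_t>(index[i][j]) == k);
        out += mask ? input[i][k] : 0;
      }
      std::cout << out << (j + 1 == index[i].size() ? '\n' : ' ');
    }
  }
  // Prints: 1 1 / 4 3
}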
diff --git a/tensorflow/compiler/xla/client/lib/slicing_test.cc b/tensorflow/compiler/xla/client/lib/slicing_test.cc index 04d3f96b6a5..107cbae0a73 100644 --- a/tensorflow/compiler/xla/client/lib/slicing_test.cc +++ b/tensorflow/compiler/xla/client/lib/slicing_test.cc @@ -102,7 +102,7 @@ XLA_TEST_F(SlicingTest, SimpleSliceUpdate) { {a_data.get(), b_data.get(), x_data.get(), y_data.get()}); } -XLA_TEST_F(SlicingTest, TorchGather) { +XLA_TEST_F(SlicingTest, TorchGatherSparse) { xla::XlaBuilder builder(TestName()); xla::XlaOp input, index; @@ -116,6 +116,20 @@ XLA_TEST_F(SlicingTest, TorchGather) { {input_data.get(), index_data.get()}); } +XLA_TEST_F(SlicingTest, TorchGatherDense) { + xla::XlaBuilder builder(TestName()); + + xla::XlaOp input, index; + auto input_data = + CreateR2Parameter({{1, 2}, {3, 4}}, 0, "input", &builder, &input); + auto index_data = + CreateR2Parameter({{0, 0}, {1, 0}}, 1, "index", &builder, &index); + TorchGather(input, index, 1, false); + + ComputeAndCompareR2(&builder, {{1, 1}, {4, 3}}, + {input_data.get(), index_data.get()}); +} + XLA_TEST_F(SlicingTest, TorchIndexSelectOn0) { xla::XlaBuilder builder(TestName()); diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc index 1bd9d7b7228..153cb9f5212 100644 --- a/tensorflow/compiler/xla/client/local_client.cc +++ b/tensorflow/compiler/xla/client/local_client.cc @@ -176,12 +176,13 @@ StatusOr LocalExecutable::Run( ExecutableRunOptions run_options) { TF_ASSIGN_OR_RETURN(auto options_and_stream, RunHelper(arguments, run_options)); - - if (executable_->dumping_snapshot()) { - return ExecuteAndDump(&options_and_stream.first, arguments); - } - return executable_->ExecuteOnStreamWrapper( - &options_and_stream.first, run_options.execution_profile(), arguments); + ExecutableRunOptions options = options_and_stream.first.run_options(); + options.set_device_ordinal(-1); + auto result = RunAsync(arguments, options); + Status block_status = options.stream()->BlockHostUntilDone(); + TF_RETURN_IF_ERROR(result.status()); + TF_RETURN_IF_ERROR(block_status); + return result; } StatusOr LocalExecutable::RunAsync( @@ -189,50 +190,49 @@ StatusOr LocalExecutable::RunAsync( ExecutableRunOptions run_options) { TF_ASSIGN_OR_RETURN(auto options_and_stream, RunHelper(arguments, run_options)); - return executable_->ExecuteAsyncOnStream(&options_and_stream.first, - arguments); -} + se::Stream* stream = run_options.stream(); -StatusOr LocalExecutable::ExecuteAndDump( - const ServiceExecutableRunOptions* run_options, - const absl::Span arguments) { - executable_->hlo_snapshot()->set_execution_platform( - backend_->platform()->Name()); - TF_RETURN_IF_ERROR(RecordArguments(arguments, executable_->hlo_snapshot())); - TF_ASSIGN_OR_RETURN( - ScopedShapedBuffer result, - executable_->ExecuteOnStream(run_options, arguments, - /*hlo_execution_profile=*/nullptr)); - TF_RETURN_IF_ERROR(RecordResult(&result, executable_->hlo_snapshot())); - DumpHloSnapshotIfEnabled(executable_->module(), *executable_->hlo_snapshot()); - return std::move(result); -} - -Status LocalExecutable::RecordArguments( - const absl::Span arguments, - HloSnapshot* hlo_snapshot) { - hlo_snapshot->clear_arguments(); - for (const ShapedBuffer* argument : arguments) { - TF_ASSIGN_OR_RETURN(Literal literal, LiteralFromShapedBuffer(*argument)); - *hlo_snapshot->add_arguments() = literal.ToProto(); + std::shared_ptr snapshot; + if (executable_->dumping_snapshot()) { + snapshot = std::make_shared(); + 
snapshot->set_execution_platform(backend_->platform()->Name()); + *snapshot->mutable_hlo() = *executable_->hlo_proto(); + for (const ShapedBuffer* arg : arguments) { + auto literal = std::make_shared(arg->on_host_shape()); + backend_->transfer_manager()->TransferLiteralFromDevice( + stream, *arg, literal.get(), [snapshot, literal](Status status) { + if (!status.ok()) { + LOG(ERROR) << "TransferLiteralFromDevice for HLO snapshot inputs " + "failed: " + << status; + return; + } + *snapshot->add_arguments() = literal->ToProto(); + }); + } } - return Status::OK(); -} -Status LocalExecutable::RecordResult(const ShapedBuffer* result, - HloSnapshot* hlo_snapshot) { - hlo_snapshot->clear_result(); - TF_ASSIGN_OR_RETURN(Literal literal, LiteralFromShapedBuffer(*result)); - *hlo_snapshot->mutable_result() = literal.ToProto(); - return Status::OK(); -} + TF_ASSIGN_OR_RETURN(ScopedShapedBuffer outputs, + executable_->ExecuteAsyncOnStreamWrapper( + &options_and_stream.first, arguments)); -StatusOr LocalExecutable::LiteralFromShapedBuffer( - const ShapedBuffer& shaped_buffer) { - TF_ASSIGN_OR_RETURN(auto stream, - backend_->BorrowStream(shaped_buffer.device_ordinal())); - return backend_->transfer_manager()->TransferLiteralFromDevice(stream.get(), - shaped_buffer); + // Transfer the outputs and save the snapshot to disk. + if (snapshot) { + auto literal = std::make_shared(outputs.on_host_shape()); + backend_->transfer_manager()->TransferLiteralFromDevice( + stream, outputs, literal.get(), [snapshot, literal](Status status) { + if (status.ok()) { + *snapshot->mutable_result() = literal->ToProto(); + } else { + LOG(ERROR) + << "TransferLiteralFromDevice for HLO snapshot outputs failed: " + << status; + } + DumpHloSnapshotIfEnabled(*snapshot, GetDebugOptionsFromFlags()); + }); + } + + return std::move(outputs); } se::Platform* LocalClient::platform() const { diff --git a/tensorflow/compiler/xla/client/local_client.h b/tensorflow/compiler/xla/client/local_client.h index 1e7c97d6f06..b697fb031fd 100644 --- a/tensorflow/compiler/xla/client/local_client.h +++ b/tensorflow/compiler/xla/client/local_client.h @@ -72,23 +72,6 @@ class LocalExecutable { const absl::Span arguments, const ExecutableRunOptions& run_options, const Backend& backend); - // Records the computation in a SessionModule proto with the arguments used to - // invoke it, and the result. Enabled by flag: --xla_dump_hlo_snapshots. - // - // The given ServiceExecutableRunOptions override any values from the - // XLA_FLAGS environment variable. - StatusOr ExecuteAndDump( - const ServiceExecutableRunOptions* run_options, - const absl::Span arguments); - - // Records the arguments used to invoke the computation in a SessionModule - // proto. - Status RecordArguments(const absl::Span arguments, - HloSnapshot* hlo_snapshot); - - // Records the result of the computation in a SessionModule proto. - Status RecordResult(const ShapedBuffer* result, HloSnapshot* hlo_snapshot); - // Returns a literal containing the contents of the given ShapedBuffer. 
StatusOr LiteralFromShapedBuffer(const ShapedBuffer& shaped_buffer); diff --git a/tensorflow/compiler/xla/client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_builder.cc index 318d5f3be35..dccdec22fb9 100644 --- a/tensorflow/compiler/xla/client/xla_builder.cc +++ b/tensorflow/compiler/xla/client/xla_builder.cc @@ -289,6 +289,15 @@ Status XlaBuilder::SetDynamicBinding(int64 dynamic_size_param_num, return Status::OK(); } +Status XlaBuilder::SetInstructionFrontendAttribute(const XlaOp op, + std::string attribute, + std::string value) { + TF_ASSIGN_OR_RETURN(auto instr_proto, LookUpMutableInstruction(op)); + auto* frontend_attributes = instr_proto->mutable_frontend_attributes(); + (*frontend_attributes->mutable_map())[attribute] = std::move(value); + return Status::OK(); +} + XlaComputation XlaBuilder::BuildAndNoteError() { DCHECK(parent_builder_ != nullptr); auto build_status = Build(); @@ -702,6 +711,12 @@ XlaOp XlaBuilder::BroadcastInDim( // not necessarily the same as the dimension sizes of the output shape. auto output_shape = ShapeUtil::MakeShape(operand_shape.element_type(), out_dim_size); + if (operand_shape.rank() != broadcast_dimensions.size()) { + return InvalidArgument( + "Size of broadcast_dimensions has to match operand's rank; operand " + "rank: %lld, size of broadcast_dimensions %u.", + operand_shape.rank(), broadcast_dimensions.size()); + } for (int i = 0; i < broadcast_dimensions.size(); i++) { if (broadcast_dimensions[i] < 0 || broadcast_dimensions[i] > out_dim_size.size()) { @@ -1028,6 +1043,11 @@ XlaOp XlaBuilder::GetTupleElement(const XlaOp& tuple_data, int64 index) { "Operand to GetTupleElement() is not a tuple; got %s", ShapeUtil::HumanString(tuple_shape)); } + if (index < 0 || index >= ShapeUtil::TupleElementCount(tuple_shape)) { + return InvalidArgument( + "GetTupleElement() index (%d) out of range for tuple shape %s", index, + ShapeUtil::HumanString(tuple_shape)); + } *instr.mutable_shape() = ShapeUtil::GetTupleElementShape(tuple_shape, index).ToProto(); @@ -1204,8 +1224,9 @@ XlaOp XlaBuilder::ConvGeneralDilated( rhs_shape.dimensions(dimension_numbers.kernel_spatial_dimensions(i)); } TF_ASSIGN_OR_RETURN(*instr.mutable_window(), - MakeWindow(window_dimensions, window_strides, padding, - lhs_dilation, rhs_dilation)); + ShapeInference::InferWindowFromDimensions( + window_dimensions, window_strides, padding, + lhs_dilation, rhs_dilation)); TF_ASSIGN_OR_RETURN( Shape shape, ShapeInference::InferConvolveShape( @@ -1226,60 +1247,6 @@ XlaOp XlaBuilder::ConvGeneralDilated( }); } -StatusOr XlaBuilder::MakeWindow( - absl::Span window_dimensions, - absl::Span window_strides, - absl::Span> padding, - absl::Span lhs_dilation, - absl::Span rhs_dilation) const { - const auto verify_size = [&](const size_t x, const char* x_name) { - if (x == 0 || x == window_dimensions.size()) { - return Status::OK(); - } else { - return InvalidArgument( - "%s", absl::StrCat( - "Window has different number of window dimensions than of ", - x_name, - "\nNumber of window dimensions: ", window_dimensions.size(), - "\nNumber of ", x_name, ": ", x, "\n")); - } - }; - TF_RETURN_IF_ERROR(verify_size(window_strides.size(), "window strides")); - TF_RETURN_IF_ERROR(verify_size(padding.size(), "padding entries")); - TF_RETURN_IF_ERROR(verify_size(lhs_dilation.size(), "lhs dilation factors")); - TF_RETURN_IF_ERROR(verify_size(rhs_dilation.size(), "rhs dilation factors")); - - Window window; - for (size_t i = 0; i < window_dimensions.size(); i++) { - auto dim = window.add_dimensions(); - 
dim->set_size(window_dimensions[i]); - if (!window_strides.empty()) { - dim->set_stride(window_strides[i]); - } else { - dim->set_stride(1); - } - if (!padding.empty()) { - dim->set_padding_low(padding[i].first); - dim->set_padding_high(padding[i].second); - } else { - dim->set_padding_low(0); - dim->set_padding_high(0); - } - if (!lhs_dilation.empty()) { - dim->set_base_dilation(lhs_dilation[i]); - } else { - dim->set_base_dilation(1); - } - if (!rhs_dilation.empty()) { - dim->set_window_dilation(rhs_dilation[i]); - } else { - dim->set_window_dilation(1); - } - dim->set_window_reversal(false); - } - return window; -} - XlaOp XlaBuilder::Fft(const XlaOp& operand, const FftType fft_type, const absl::Span fft_length) { return ReportErrorOrReturn([&]() -> StatusOr { @@ -1739,9 +1706,11 @@ XlaOp XlaBuilder::While(const XlaComputation& condition, XlaOp XlaBuilder::Gather(const XlaOp& input, const XlaOp& start_indices, const GatherDimensionNumbers& dimension_numbers, - absl::Span slice_sizes) { + absl::Span slice_sizes, + bool indices_are_sorted) { return ReportErrorOrReturn([&]() -> StatusOr { HloInstructionProto instr; + instr.set_indices_are_sorted(indices_are_sorted); TF_ASSIGN_OR_RETURN(const Shape& input_shape, GetShape(input)); TF_ASSIGN_OR_RETURN(const Shape& start_indices_shape, @@ -1764,9 +1733,11 @@ XlaOp XlaBuilder::Gather(const XlaOp& input, const XlaOp& start_indices, XlaOp XlaBuilder::Scatter(const XlaOp& input, const XlaOp& scatter_indices, const XlaOp& updates, const XlaComputation& update_computation, - const ScatterDimensionNumbers& dimension_numbers) { + const ScatterDimensionNumbers& dimension_numbers, + bool indices_are_sorted) { return ReportErrorOrReturn([&]() -> StatusOr { HloInstructionProto instr; + instr.set_indices_are_sorted(indices_are_sorted); TF_ASSIGN_OR_RETURN(const Shape& input_shape, GetShape(input)); TF_ASSIGN_OR_RETURN(const Shape& scatter_indices_shape, @@ -1952,9 +1923,10 @@ XlaOp XlaBuilder::ReduceWindowWithGeneralPadding( TF_ASSIGN_OR_RETURN(const ProgramShape& to_apply_shape, computation.GetProgramShape()); TF_ASSIGN_OR_RETURN(*instr.mutable_window(), - MakeWindow(window_dimensions, window_strides, padding, - /*lhs_dilation=*/base_dilations, - /*rhs_dilation=*/window_dilations)); + ShapeInference::InferWindowFromDimensions( + window_dimensions, window_strides, padding, + /*lhs_dilation=*/base_dilations, + /*rhs_dilation=*/window_dilations)); TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferReduceWindowShape( operand_shape, init_shape, instr.window(), to_apply_shape)); @@ -2199,8 +2171,9 @@ XlaOp XlaBuilder::SelectAndScatterWithGeneralPadding( TF_ASSIGN_OR_RETURN(const ProgramShape& scatter_shape, scatter.GetProgramShape()); TF_ASSIGN_OR_RETURN(*instr.mutable_window(), - MakeWindow(window_dimensions, window_strides, padding, - /*lhs_dilation=*/{}, /*rhs_dilation=*/{})); + ShapeInference::InferWindowFromDimensions( + window_dimensions, window_strides, padding, + /*lhs_dilation=*/{}, /*rhs_dilation=*/{})); TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferSelectAndScatterShape( operand_shape, select_shape, instr.window(), @@ -2662,6 +2635,7 @@ StatusOr XlaBuilder::AddInstruction(HloInstructionProto&& instr, if (sharding_) { *instr.mutable_sharding() = *sharding_; } + *instr.mutable_frontend_attributes() = frontend_attributes_; handle_to_index_[handle] = instructions_.size(); instructions_.push_back(std::move(instr)); @@ -2719,32 +2693,67 @@ void XlaBuilder::AddCalledComputation(const XlaComputation& computation, } } -StatusOr 
XlaBuilder::LookUpInstruction( - const XlaOp& op) const { - TF_RETURN_IF_ERROR(first_error_); +namespace { - if (op.builder_ == nullptr) { +template +StatusOr LookUpInstructionByHandleInternal( + const absl::flat_hash_map& handle_to_index, + const std::vector& instructions, int64 handle) { + auto it = handle_to_index.find(handle); + if (it == handle_to_index.end()) { + return InvalidArgument("No XlaOp with handle %d", handle); + } + return const_cast(&instructions.at(it->second)); +} + +template +StatusOr LookUpInstructionInternal( + const absl::flat_hash_map& handle_to_index, + const std::vector& instructions, + OpBuilderType op_builder, BuilderType builder, OpType op_handle) { + if (op_builder == nullptr) { return InvalidArgument( "invalid XlaOp with handle %d; the builder of this op is freed", - op.handle()); + op_handle); } - if (op.builder_ != this) { + if (op_builder != builder) { return InvalidArgument( "XlaOp with handle %d is built by builder '%s', but is trying to use " "it in builder '%s'", - op.handle(), op.builder_->name(), this->name()); + op_handle, op_builder->name(), builder->name()); } - return LookUpInstructionByHandle(op.handle()); + return LookUpInstructionByHandleInternal( + handle_to_index, instructions, op_handle); +} + +} // namespace + +StatusOr XlaBuilder::LookUpInstruction( + const XlaOp op) const { + TF_RETURN_IF_ERROR(first_error_); + return LookUpInstructionInternal( + handle_to_index_, instructions_, op.builder_, this, op.handle()); } StatusOr XlaBuilder::LookUpInstructionByHandle( int64 handle) const { - auto it = handle_to_index_.find(handle); - if (it == handle_to_index_.end()) { - return InvalidArgument("No XlaOp with handle %d", handle); - } - return &instructions_[it->second]; + return LookUpInstructionByHandleInternal( + handle_to_index_, instructions_, handle); +} + +StatusOr XlaBuilder::LookUpMutableInstruction( + const XlaOp op) { + TF_RETURN_IF_ERROR(first_error_); + return LookUpInstructionInternal( + handle_to_index_, instructions_, op.builder_, this, op.handle()); +} + +StatusOr XlaBuilder::LookUpMutableInstructionByHandle( + int64 handle) { + return LookUpInstructionByHandleInternal( + handle_to_index_, instructions_, handle); } // Enqueues a "retrieve parameter value" instruction for a parameter that was @@ -3361,16 +3370,18 @@ XlaOp ReducePrecision(const XlaOp operand, const int exponent_bits, XlaOp Gather(const XlaOp input, const XlaOp start_indices, const GatherDimensionNumbers& dimension_numbers, - absl::Span slice_sizes) { + absl::Span slice_sizes, bool indices_are_sorted) { return input.builder()->Gather(input, start_indices, dimension_numbers, - slice_sizes); + slice_sizes, indices_are_sorted); } XlaOp Scatter(const XlaOp input, const XlaOp scatter_indices, const XlaOp updates, const XlaComputation& update_computation, - const ScatterDimensionNumbers& dimension_numbers) { + const ScatterDimensionNumbers& dimension_numbers, + bool indices_are_sorted) { return input.builder()->Scatter(input, scatter_indices, updates, - update_computation, dimension_numbers); + update_computation, dimension_numbers, + indices_are_sorted); } void Send(const XlaOp operand, const ChannelHandle& handle) { diff --git a/tensorflow/compiler/xla/client/xla_builder.h b/tensorflow/compiler/xla/client/xla_builder.h index 89e8be7de1e..5c28e8b5150 100644 --- a/tensorflow/compiler/xla/client/xla_builder.h +++ b/tensorflow/compiler/xla/client/xla_builder.h @@ -147,8 +147,8 @@ class XlaBuilder { // Sets OpMetadata that will be added to all instructions until cleared. 
// // OpMetadata is often applied to a series of XLA HLO instructions. As a - // result, OpMetadata is set on the Computation Builder. All subsequent - // instructions generated via this Computation Builder will have the same + // result, OpMetadata is set on the computation builder. All subsequent + // instructions generated via this computation builder will have the same // OpMetadata attached until a call to ClearOpMetadata. void SetOpMetadata(OpMetadata metadata) { metadata_ = std::move(metadata); } @@ -158,6 +158,35 @@ class XlaBuilder { // Sets an OpSharding that will be attached to all instructions until cleared. void SetSharding(const OpSharding& sharding) { sharding_ = sharding; } + // Sets the FrontendAttributes that will be added to all instructions until + // cleared. + // + // FrontendAttributes are often applied to a series of XLA HLO instructions. + // As a result they are set on the computation builder and all the + // instructions generated via the computation builder will have the same + // frontend attributes attached to them. + void SetFrontendAttributes(const FrontendAttributes& frontend_attributes) { + frontend_attributes_ = frontend_attributes; + } + + // Swap the passed FrontendAttributes with the ones currently set. + // + // Return the old attributes. + FrontendAttributes SwapFrontendAttributes( + const FrontendAttributes& frontend_attributes) { + FrontendAttributes old_attributes = std::move(frontend_attributes_); + frontend_attributes_ = frontend_attributes; + return old_attributes; + } + + // Returns the FrontendAttributes that will be attached to all instructions. + const FrontendAttributes& frontend_attributes() const { + return frontend_attributes_; + } + + // Clears all the frontend attributes. + void ClearFrontendAttributes() { frontend_attributes_.Clear(); } + // Clears the sharding. Ops will be sharded according to the default placement // policy. void ClearSharding() { sharding_ = absl::nullopt; } @@ -314,6 +343,16 @@ class XlaBuilder { ShapeIndex param_index; }; + // Looks up the HloInstruction and sets the frontend attribute "attribute" to + // "value". + // + // If the attribute already existed then its value is updated. + // + // Note: the attribute is only added to the HloInstruction, not to the + // builder. + Status SetInstructionFrontendAttribute(XlaOp op, string attribute, + string value); + private: // Build helper which takes the id of the root operation.. 
StatusOr Build(int64 root_id, bool remove_dynamic_dimensions); @@ -547,11 +586,13 @@ class XlaBuilder { XlaOp Gather(const XlaOp& input, const XlaOp& start_indices, const GatherDimensionNumbers& dimension_numbers, - absl::Span slice_sizes); + absl::Span slice_sizes, + bool indices_are_sorted = false); XlaOp Scatter(const XlaOp& input, const XlaOp& scatter_indices, const XlaOp& updates, const XlaComputation& update_computation, - const ScatterDimensionNumbers& dimension_numbers); + const ScatterDimensionNumbers& dimension_numbers, + bool indices_are_sorted = false); void Send(const XlaOp& operand, const ChannelHandle& handle); XlaOp SendWithToken(const XlaOp& operand, const XlaOp& token, @@ -593,9 +634,11 @@ class XlaBuilder { void AddCalledComputation(const XlaComputation& computation, HloInstructionProto* instr); - StatusOr LookUpInstruction(const XlaOp& op) const; + StatusOr LookUpInstruction(XlaOp op) const; StatusOr LookUpInstructionByHandle( int64 handle) const; + StatusOr LookUpMutableInstruction(XlaOp op); + StatusOr LookUpMutableInstructionByHandle(int64 handle); // Internal helper method that does the building for an arbitrary unary op. XlaOp UnaryOp(HloOpcode unop, const XlaOp& operand); @@ -649,14 +692,6 @@ class XlaBuilder { const Shape& lhs_shape, const Shape& rhs_shape, const ConvolutionDimensionNumbers& dimension_numbers) const; - // Helper function for creating a Window proto from user-supplied data. - // Returns error if the user-supplied data was invalid. - StatusOr MakeWindow(absl::Span window_dimensions, - absl::Span window_strides, - absl::Span> padding, - absl::Span lhs_dilation, - absl::Span rhs_dilation) const; - int64 GetNextId() { return ++next_id_; } // Populates the module with the input/output alias information stored within @@ -713,6 +748,8 @@ class XlaBuilder { XlaBuilder* parent_builder_{nullptr}; + FrontendAttributes frontend_attributes_; + friend XlaOp Parameter(XlaBuilder* builder, int64 parameter_number, const Shape& shape, const string& name, const std::vector& replicated_at_leaf_buffers); @@ -968,10 +1005,12 @@ class XlaBuilder { const int mantissa_bits); friend XlaOp Gather(XlaOp input, XlaOp start_indices, const GatherDimensionNumbers& dimension_numbers, - absl::Span slice_sizes); + absl::Span slice_sizes, + bool indices_are_sorted); friend XlaOp Scatter(XlaOp input, XlaOp scatter_indices, XlaOp updates, const XlaComputation& update_computation, - const ScatterDimensionNumbers& dimension_numbers); + const ScatterDimensionNumbers& dimension_numbers, + bool indices_are_sorted); friend void Send(XlaOp operand, const ChannelHandle& handle); friend XlaOp Recv(XlaBuilder* builder, const Shape& shape, const ChannelHandle& handle); @@ -1038,6 +1077,27 @@ class XlaScopedShardingAssignment { absl::optional prev_sharding_; }; +// RAII-style object: save the current builder's frontend attributes, and merge +// them with the new ones on construction. +// Restore the original attributes on destruction. +class XlaScopedFrontendAttributesAssignment { + public: + XlaScopedFrontendAttributesAssignment(xla::XlaBuilder* builder, + FrontendAttributes attributes) + : builder_(builder) { + saved_ = builder_->SwapFrontendAttributes(attributes); + } + + ~XlaScopedFrontendAttributesAssignment() { + builder_->SetFrontendAttributes(saved_); + } + + private: + xla::XlaBuilder* const builder_; + FrontendAttributes saved_; + + TF_DISALLOW_COPY_AND_ASSIGN(XlaScopedFrontendAttributesAssignment); +}; // Free functions for building XlaOps. 
The intention is that these will // become the public API for building XlaOps rather than calling methods on // XlaBuilder directly. @@ -1802,12 +1862,14 @@ XlaOp ReducePrecision(XlaOp operand, const int exponent_bits, // Enqueues a Gather node onto the computation. XlaOp Gather(XlaOp input, XlaOp start_indices, const GatherDimensionNumbers& dimension_numbers, - absl::Span slice_sizes); + absl::Span slice_sizes, + bool indices_are_sorted = false); // Enqueues a Scatter node onto the computation. XlaOp Scatter(XlaOp input, XlaOp scatter_indices, XlaOp updates, const XlaComputation& update_computation, - const ScatterDimensionNumbers& dimension_numbers); + const ScatterDimensionNumbers& dimension_numbers, + bool indices_are_sorted = false); // Enqueues a Send node onto the computation for device-to-device // communication. This operation sends the given operand to diff --git a/tensorflow/compiler/xla/client/xla_builder_test.cc b/tensorflow/compiler/xla/client/xla_builder_test.cc index 12656a89943..701729b94f3 100644 --- a/tensorflow/compiler/xla/client/xla_builder_test.cc +++ b/tensorflow/compiler/xla/client/xla_builder_test.cc @@ -978,5 +978,151 @@ TEST_F(XlaBuilderTest, CheckInputOutputAlias) { EXPECT_EQ(*alias_p1, ShapeIndex({0})); } +void ExpectAttributesMatch(const FrontendAttributes& attr, + const FrontendAttributes& ref) { + EXPECT_EQ(ref.map_size(), attr.map_size()); + for (auto reference : ref.map()) { + auto other = attr.map().find(reference.first); + EXPECT_NE(other, attr.map().end()); + EXPECT_EQ(other->second, reference.second); + } +} + +void ExpectInstructionsAttributesMatch( + const HloModule& module, const std::vector& expected) { + ASSERT_EQ(module.computation_count(), 1); + auto expected_it = expected.begin(); + for (auto inst : module.entry_computation()->instructions()) { + ASSERT_NE(expected_it, expected.end()); + ExpectAttributesMatch(inst->frontend_attributes(), *expected_it); + expected_it++; + } + EXPECT_EQ(expected_it, expected.end()); +} + +TEST_F(XlaBuilderTest, SimpleSetFrontendAttributes) { + XlaBuilder b(TestName()); + FrontendAttributes attributes; + + ConstantR0(&b, 0); // No attribute set + + (*attributes.mutable_map())["attr_a"] = "a"; + b.SetFrontendAttributes(attributes); + ConstantR0(&b, 0); // One attribute: { "attr_a": "a" } + + b.ClearFrontendAttributes(); + ConstantR0(&b, 0); // No attribute set + + TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); + + std::vector expected{FrontendAttributes(), attributes, + FrontendAttributes()}; + ExpectInstructionsAttributesMatch(*module, expected); +} + +TEST_F(XlaBuilderTest, ComplexSetFrontendAttributes) { + XlaBuilder b(TestName()); + + ConstantR0(&b, 0); // No attribute set. 
+ std::vector expected{FrontendAttributes()}; + + { + FrontendAttributes attributes; + (*attributes.mutable_map())["attr_a"] = "a"; + b.SetFrontendAttributes(attributes); + ConstantR0(&b, 0); // One attribute: { "attr_a": "a" } + expected.push_back(attributes); + } + + { + FrontendAttributes attributes; + (*attributes.mutable_map())["attr_b"] = "b"; + b.SetFrontendAttributes(attributes); + ConstantR0(&b, 0); // One attribute: { "attr_b": "b" } + expected.push_back(attributes); + } + + { + FrontendAttributes attributes; + (*attributes.mutable_map())["attr_b"] = "b"; + (*attributes.mutable_map())["attr_c"] = "c"; + b.SetFrontendAttributes(attributes); + ConstantR0(&b, 0); // Two attributes: { "attr_b": "b", "attr_c": "c" } + expected.push_back(attributes); + } + + b.ClearFrontendAttributes(); + ConstantR0(&b, 0); // No attribute set + expected.push_back(FrontendAttributes()); + + TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); + ExpectInstructionsAttributesMatch(*module, expected); +} + +TEST_F(XlaBuilderTest, AddFrontendAttribute) { + XlaBuilder b(TestName()); + + ConstantR0(&b, 0); + std::vector expected{FrontendAttributes()}; + + // One attribute: { "attr_a": "a" } + { + FrontendAttributes attributes; + (*attributes.mutable_map())["attr_a"] = "a"; + b.SetFrontendAttributes(attributes); + ConstantR0(&b, 0); + expected.push_back(attributes); + } + + // Two attributes: {"attra": "a", "attr_c": "c"} + { + auto op = ConstantR0(&b, 0); + EXPECT_IS_OK(b.SetInstructionFrontendAttribute(op, "attr_c", "c")); + + FrontendAttributes attributes; + (*attributes.mutable_map())["attr_a"] = "a"; + (*attributes.mutable_map())["attr_c"] = "c"; + expected.push_back(attributes); + } + + // Override value of existing "attr_a" + // One attribute: { "attr_a", "a2"} + { + auto op = ConstantR0(&b, 0); + EXPECT_IS_OK(b.SetInstructionFrontendAttribute(op, "attr_a", "a2")); + FrontendAttributes attributes; + (*attributes.mutable_map())["attr_a"] = "a2"; + expected.push_back(attributes); + } + + // Check "attr_a" is back to its original value + // One attribute: { "attr_a", "a"} + { + auto op = ConstantR0(&b, 0); + (void)op; + FrontendAttributes attributes; + (*attributes.mutable_map())["attr_a"] = "a"; + expected.push_back(attributes); + } + + b.ClearFrontendAttributes(); + ConstantR0(&b, 0); // No attribute set + expected.push_back(FrontendAttributes()); + + // One attribute: { "attr_d", "d"} + { + auto op = ConstantR0(&b, 0); + EXPECT_IS_OK(b.SetInstructionFrontendAttribute(op, "attr_d", "d")); + FrontendAttributes attributes; + (*attributes.mutable_map())["attr_d"] = "d"; + expected.push_back(attributes); + } + + ConstantR0(&b, 0); // No attribute set + expected.push_back(FrontendAttributes()); + + TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); + ExpectInstructionsAttributesMatch(*module, expected); +} } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/debug_options_flags.cc b/tensorflow/compiler/xla/debug_options_flags.cc index 45f9cbe4ce8..13173e0dbc8 100644 --- a/tensorflow/compiler/xla/debug_options_flags.cc +++ b/tensorflow/compiler/xla/debug_options_flags.cc @@ -149,6 +149,12 @@ static void AllocateFlags() { return true; }; + // Custom "sub-parser" lambda for xla_gpu_ptx_file. + auto setter_for_xla_gpu_ptx_file = [](string value) { + flag_values->add_xla_gpu_ptx_file(value); + return true; + }; + // Custom "sub-parser" lambda for xla_backend_extra_options. 
auto setter_for_xla_backend_extra_options = [](string comma_separated_values) { @@ -244,6 +250,13 @@ static void AllocateFlags() { "When xla_cpu_enable_fast_math is true then this controls whether " "we forbid to use multiplication by the reciprocal instead of " "division. Ignored when xla_cpu_enable_fast_math is false."), + tensorflow::Flag( + "xla_cpu_fast_math_honor_functions", + bool_setter_for(&DebugOptions::set_xla_cpu_fast_math_honor_functions), + flag_values->xla_cpu_fast_math_honor_functions(), + "When xla_cpu_enable_fast_math is true then this controls whether " + "we forbid to approximate calculations for functions. Ignored when " + "xla_cpu_enable_fast_math is false."), tensorflow::Flag( "xla_gpu_enable_fast_min_max", bool_setter_for(&DebugOptions::set_xla_gpu_enable_fast_min_max), @@ -342,6 +355,13 @@ static void AllocateFlags() { int32_setter_for(&DebugOptions::set_xla_gpu_max_kernel_unroll_factor), flag_values->xla_gpu_max_kernel_unroll_factor(), "Specify the maximum kernel unroll factor for the GPU backend."), + tensorflow::Flag("xla_gpu_ptx_file", setter_for_xla_gpu_ptx_file, "", + "If non-empty, speficies a file containing ptx to use. " + "The filename prefix must have the same pattern as PTX " + "dumped by XLA. This allows to match one specific " + "module. General workflow. Get the generated module " + "ptx from XLA. Modify it. Then pass it back via this " + "option."), tensorflow::Flag( "xla_test_all_output_layouts", bool_setter_for(&DebugOptions::set_xla_test_all_output_layouts), @@ -508,6 +528,12 @@ static void AllocateFlags() { bool_setter_for(&DebugOptions::set_xla_gpu_force_conv_nchw), flag_values->xla_gpu_force_conv_nchw(), "For cuDNN convolutions, always NCHW layouts."), + tensorflow::Flag("xla_gpu_algorithm_blacklist_path", + string_setter_for( + &DebugOptions::set_xla_gpu_algorithm_blacklist_path), + flag_values->xla_gpu_algorithm_blacklist_path(), + "An AlgorithmBlacklist text proto file as a blacklist " + "of convolutions to avoid to use."), }); ParseFlagsFromEnvAndDieIfUnknown("XLA_FLAGS", *flag_objects); } diff --git a/tensorflow/compiler/xla/g3doc/_book.yaml b/tensorflow/compiler/xla/g3doc/_book.yaml index dafc3345555..7d225e1240c 100644 --- a/tensorflow/compiler/xla/g3doc/_book.yaml +++ b/tensorflow/compiler/xla/g3doc/_book.yaml @@ -11,16 +11,16 @@ upper_tabs: lower_tabs: # Subsite tabs other: - - name: Guide & Tutorials + - name: Overview contents: - - title: XLA overview - path: /xla/overview + - title: Overview + path: /xla + - title: XLA architecture + path: /xla/architecture - title: Broadcasting semantics path: /xla/broadcasting - title: Developing a new backend for XLA path: /xla/developing_new_backend - - title: Using JIT compilation - path: /xla/jit - title: Operation semantics path: /xla/operation_semantics - title: Shapes and layout @@ -32,6 +32,8 @@ upper_tabs: - title: Writing custom calls path: /xla/custom_call - heading: Tutorials + - title: XLA autoclustering + path: /xla/tutorials/autoclustering_xla - title: XLA compile API path: /xla/tutorials/xla_compile status: experimental diff --git a/tensorflow/compiler/xla/g3doc/_index.yaml b/tensorflow/compiler/xla/g3doc/_index.yaml deleted file mode 100644 index 858de427119..00000000000 --- a/tensorflow/compiler/xla/g3doc/_index.yaml +++ /dev/null @@ -1,35 +0,0 @@ -book_path: /xla/_book.yaml -project_path: /xla/_project.yaml -description: -landing_page: - custom_css_path: /site-assets/css/style.css - rows: - - heading: XLA is a compiler that optimizes TensorFlow computations. 
- items: - - classname: devsite-landing-row-50 - description: > - XLA (Accelerated Linear Algebra) is a domain-specific compiler for linear - algebra that optimizes TensorFlow computations. The results are - improvements in speed, memory usage, and portability on server and mobile - platforms. The XLA framework is experimental and in active development. - For details, read the XLA guide. - - - classname: devsite-landing-row-cards - items: - - heading: XLA - TensorFlow, compiled - image_path: /resources/images/tf-logo-card-16x9.png - path: https://developers.googleblog.com/2017/03/xla-tensorflow-compiled.html - buttons: - - label: Read on Google Developers blog - path: https://developers.googleblog.com/2017/03/xla-tensorflow-compiled.html - - heading: XLA at the Dev Summit - youtube_id: kAOanJczHA0 - buttons: - - label: Watch the video - path: https://www.youtube.com/watch?v=kAOanJczHA0 - - heading: XLA on GitHub - image_path: /resources/images/github-card-16x9.png - path: https://github.com/tensorflow/tensorflow/tree/master/tensorflow/compiler/xla - buttons: - - label: View on GitHub - path: https://github.com/tensorflow/tensorflow/tree/master/tensorflow/compiler/xla diff --git a/tensorflow/compiler/xla/g3doc/overview.md b/tensorflow/compiler/xla/g3doc/architecture.md similarity index 75% rename from tensorflow/compiler/xla/g3doc/overview.md rename to tensorflow/compiler/xla/g3doc/architecture.md index d3428b72761..f9be646c441 100644 --- a/tensorflow/compiler/xla/g3doc/overview.md +++ b/tensorflow/compiler/xla/g3doc/architecture.md @@ -1,25 +1,9 @@ -# XLA Overview +# XLA Architecture
-> Note: XLA is still under development. Some use cases will not -> see improvements in speed or decreased memory usage. - -XLA (Accelerated Linear Algebra) is a domain-specific compiler for linear -algebra that optimizes TensorFlow computations. The results are improvements in -speed, memory usage, and portability on server and mobile platforms. Initially, -most users will not see large benefits from XLA, but are welcome to experiment -by using XLA via [just-in-time (JIT) compilation](./jit.md) or -[ahead-of-time (AOT) compilation](./tfcompile.md). Developers targeting new -hardware accelerators are especially encouraged to try out XLA. - -The XLA framework is experimental and in active development. In particular, -while it is unlikely that the semantics of existing operations will change, it -is expected that more operations will be added to cover important use cases. The -team welcomes feedback from the community about missing functionality and -community contributions via GitHub. ## Why did we build XLA? @@ -91,8 +75,3 @@ code from this LLVM IR. The GPU backend currently supports NVIDIA GPUs via the LLVM NVPTX backend; the CPU backend supports multiple CPU ISAs. - -## Supported Platforms - -XLA currently supports [JIT compilation](./jit.md) on x86-64 and NVIDIA GPUs; and -[AOT compilation](./tfcompile.md) for x86-64 and ARM. diff --git a/tensorflow/compiler/xla/g3doc/custom_call.md b/tensorflow/compiler/xla/g3doc/custom_call.md index acc2c9a92f5..7837f0aefaf 100644 --- a/tensorflow/compiler/xla/g3doc/custom_call.md +++ b/tensorflow/compiler/xla/g3doc/custom_call.md @@ -128,8 +128,8 @@ using xla::ShapeUtil; Shape p0_shape = ShapeUtil::MakeTuple({ ShapeUtil::MakeShape(F32, {32}), ShapeUtil::MakeTuple({ - ShapeUtil::MakeTuple(F32, {64}), - ShapeUtil::MakeTuple(F32, {128}), + ShapeUtil::MakeShape(F32, {64}), + ShapeUtil::MakeShape(F32, {128}), }), ShapeUtil::MakeShape(F32, {256}), }); @@ -197,133 +197,18 @@ subbuffers of `output_tuple` are accessible by dereferencing `out`. ### Tuples in GPU custom-calls In GPU code, we have a function `do_custom_call(..., void** buffers, ...)`. In -this case `buffers` is a host array of *nine* device pointers, one for each -nested buffer. To generate the flat list, we iterate over the parameters and -output, and then do preorder traversal of their shapes. Concretely: +this case `buffers` is a host array of *six* device pointers, one for each leaf +buffer in the input/output. To generate the flat list, we iterate over the +parameters and output, and for each we do a preorder traversal of its shape. +Concretely: ```c++ // Layout of `buffers` parameter to GPU custom call function for custom-call // above. -buffers[0] == param0 -buffers[1] == subbuf0 or null -buffers[2] == subtuple or null -buffers[3] == subbuf1 or null -buffers[4] == subbuf2 or null -buffers[5] == subbuf3 or null -buffers[6] == output_tuple -buffers[7] == output_subbuf0 -buffers[8] == output_subbuf1 +buffers[0] == subbuf0 +buffers[1] == subbuf1 +buffers[2] == subbuf2 +buffers[3] == subbuf3 +buffers[4] == output_subbuf0 +buffers[5] == output_subbuf1 ``` - -The `or null` part is significant. A sub-buffer of an input tuple will be -non-null in the `buffers` list if XLA is able to statically analyze the program -and figure out the address of the sub-buffer. This is usually the case, but may -not be in programs with control flow and/or `select` ops over tuples. 
- -A correct custom-call implementation that accepts a tuple as input must always -handle null input sub-buffers, by dereferencing the root tuple. - -The rule is reversed for output buffers. The output sub-buffers will always be -populated, but it's up to the custom call to populate the root tuple at the end. - -See the following code. Note that we leave out CUDA error handling for clarity, -but you'll be thankful if you do it, because otherwise it can be hard to tell -when a stream encounters an error. - -```c++ -void do_custom_call(CUstream stream, void** buffers, const char* opaque, - size_t opaque_len) { - bool needs_sync = false; - const float* subbuf0 = reinterpret_cast(buffers[1]); - if (subbuf0 == nullptr) { - needs_sync = true; - cudaMemcpyAsync(&subbuf0, buffers[0], sizeof(void*), - cudaMemcpyDeviceToHost, stream); - } - const void** subtuple = reinterpret_cast(buffers[2]); - if (subtuple == nullptr) { - needs_sync = true; - cudaMemcpyAsync(&subtuple, buffers[2], ...); - } - - // ... similarly for other params ... - - // Wait for copies enqueued above to complete. - if (needs_sync) { - cudaStreamSynchronize(stream); - } - needs_sync = false; - - // Now that we have `subtuple`, we can get subbuf1 and subbuf2. - float* subbuf1 = buffers[3]; - if (subbuf1 == nullptr) { - needs_sync = true; - cudaMemcpyAsync(&subbuf1, subtuple, ...); - } - float* subbuf2 = buffers[4]; - if (subbuf2 == nullptr) { - needs_sync = true; - cudaMemcpyAsync(&subbuf2, subtuple + 1, ...); - } - - // Wait for copies enqueued above to complete. - if (needs_sync) { - cudaStreamSynchronize(stream); - } - - // ... actually run the kernel ... - - // Fill the output tuple. - void* outputs[2] = {buffers[7], buffers[8]}; - cudaMemcpyAsync(buffers[6], outputs, sizeof(outputs), cudaMemcpyHostToDevice, - stream); - - // Necessary to force the cudaMemcpyAsync above to complete before `outputs` - // goes out of scope. A sync is only necessary in the tuple output case, and - // see below for a way to avoid this. - cudaStreamSynchronize(stream); -} -``` - -The `cudaStreamSynchronize` at the end of the function is unfortunate, as it's -not required in the non-tuple-output case, and it can be expensive. One way to -get around this would be to make `outputs` into a global variable and ensure -that the previous cudaMemcpyAsync completed before overwriting the global and -enqueueing another one. This is sketched below. - -``` -void do_custom_call(CUstream stream, void** buffers, const char* opaque, - size_t opaque_len) { - - // ... Beginning of function is the same as above ... - - // ... actually run the kernel ... - - static std::atomic first_time{true}; - static CUevent event; - static void* outputs[2]; - if (first_time.fetch_and(false)) { - // First time running this function. Initialize `event`. - cuEventCreate(&event, CU_EVENT_DISABLE_TIMING); - } else { - // Not first time running this function. Wait for previous event to - // complete before touching `outputs`. - cuEventSynchronize(event); - } - - // Fill the output tuple. - outputs[0] = buffers[7]; - outputs[1] = buffers[8]; - cudaMemcpyAsync(buffers[6], outputs, sizeof(outputs), cudaMemcpyHostToDevice, - stream); - - // Unblock `event` after the memcpy completes. - cuEventRecord(event, stream); -} -``` - -This simple implementation would limit parallelism if you want to run this op on -multiple GPUs concurrently (or on one GPU with multiple streams); in that case -you might need multiple events and globals. 
We have seen one implementation of -this algorithm which keeps a pool of globals and events and periodically polls -them (perhaps on each call to the op) to garbage collect. diff --git a/tensorflow/compiler/xla/g3doc/images/jit_cpu_xla_graph.png b/tensorflow/compiler/xla/g3doc/images/jit_cpu_xla_graph.png deleted file mode 100644 index 4e2dc091fee..00000000000 Binary files a/tensorflow/compiler/xla/g3doc/images/jit_cpu_xla_graph.png and /dev/null differ diff --git a/tensorflow/compiler/xla/g3doc/images/jit_gpu_xla_graph.png b/tensorflow/compiler/xla/g3doc/images/jit_gpu_xla_graph.png deleted file mode 100644 index 39d7c90c4fc..00000000000 Binary files a/tensorflow/compiler/xla/g3doc/images/jit_gpu_xla_graph.png and /dev/null differ diff --git a/tensorflow/compiler/xla/g3doc/images/jit_timeline_cpu.png b/tensorflow/compiler/xla/g3doc/images/jit_timeline_cpu.png deleted file mode 100644 index a38f636983b..00000000000 Binary files a/tensorflow/compiler/xla/g3doc/images/jit_timeline_cpu.png and /dev/null differ diff --git a/tensorflow/compiler/xla/g3doc/images/jit_timeline_cpu_xla.png b/tensorflow/compiler/xla/g3doc/images/jit_timeline_cpu_xla.png deleted file mode 100644 index 285c3a96d5a..00000000000 Binary files a/tensorflow/compiler/xla/g3doc/images/jit_timeline_cpu_xla.png and /dev/null differ diff --git a/tensorflow/compiler/xla/g3doc/images/jit_timeline_gpu.png b/tensorflow/compiler/xla/g3doc/images/jit_timeline_gpu.png deleted file mode 100644 index 488fc2c2f10..00000000000 Binary files a/tensorflow/compiler/xla/g3doc/images/jit_timeline_gpu.png and /dev/null differ diff --git a/tensorflow/compiler/xla/g3doc/images/jit_timeline_gpu_xla.png b/tensorflow/compiler/xla/g3doc/images/jit_timeline_gpu_xla.png deleted file mode 100644 index d0df38cf181..00000000000 Binary files a/tensorflow/compiler/xla/g3doc/images/jit_timeline_gpu_xla.png and /dev/null differ diff --git a/tensorflow/compiler/xla/g3doc/images/tf_xla_performance.png b/tensorflow/compiler/xla/g3doc/images/tf_xla_performance.png new file mode 100644 index 00000000000..70087f5747c Binary files /dev/null and b/tensorflow/compiler/xla/g3doc/images/tf_xla_performance.png differ diff --git a/tensorflow/compiler/xla/g3doc/index.md b/tensorflow/compiler/xla/g3doc/index.md new file mode 100644 index 00000000000..c3b708d6907 --- /dev/null +++ b/tensorflow/compiler/xla/g3doc/index.md @@ -0,0 +1,168 @@ +# XLA: Optimizing Compiler for TensorFlow + +XLA (Accelerated Linear Algebra) is a domain-specific compiler for linear +algebra that accelerates TensorFlow models with potentially no source code +changes. + +The results are improvements in speed and memory usage: most internal benchmarks +run ~1.15x faster after XLA is enabled. The dataset below is evaluated on a +single NVidia V100 GPU: + +
+![XLA performance comparison on internal benchmarks (single NVIDIA V100 GPU)](./images/tf_xla_performance.png)
+
+## Introduction
+
+When a TensorFlow program is run, all of the operations are executed
+individually by the TensorFlow executor. Each TensorFlow operation has a
+precompiled GPU kernel implementation that the executor dispatches to.
+
+XLA provides an alternative mode of running TF models: it compiles the
+TensorFlow graph into a sequence of computation kernels generated specifically
+for the given model. Because these kernels are unique to the model, they can
+exploit model-specific information for optimization. For example, let's look at
+an optimization XLA does in the context of a simple TensorFlow computation:
+
+```
+def model_fn(x, y, z):
+  return tf.reduce_sum(x + y * z)
+```
+
+Run without XLA, the graph launches three kernels: one for the multiplication,
+one for the addition and one for the reduction. However, XLA can optimize the
+graph so that it computes the result in a single kernel launch. It does this by
+"fusing" the addition, multiplication and reduction into a single GPU kernel.
+Moreover, this fused operation does not write out the intermediate values
+produced by `y*z` and `x+y*z` to memory; instead it "streams" the results of
+these intermediate computations directly to their users while keeping them
+entirely in GPU registers. Fusion is XLA's single most important optimization.
+Memory bandwidth is typically the scarcest resource on hardware accelerators, so
+removing memory operations is one of the best ways to improve performance.
+
+## Enable XLA for TensorFlow models
+
+### Auto-clustering
+
+The simplest way to start using XLA in TensorFlow models is to enable
+_auto-clustering_, which automatically finds _clusters_ (connected subgraphs)
+within the TensorFlow graph which can be compiled and executed using XLA.
+Auto-clustering on GPU can be enabled by either modifying the `TF_XLA_FLAGS`
+environment variable:
+
+```
+$ TF_XLA_FLAGS=--tf_xla_auto_jit=2 path/to/your/tf/program
+```
+
+Or by setting a configuration value within the program:
+
+```
+import tensorflow as tf
+
+tf.config.optimizer.set_jit(True)
+
+# ... the rest of your program ...
+```
+
+Note: The JIT level is cached for a session, and can only be set in the very
+beginning of the program. In order to change it midway through, the session
+needs to be cleared: `tf.keras.backend.clear_session()`
+
+Auto-clustering is currently optimized for GPU workloads, but it can also be
+enabled on CPU by additionally using the flag `--tf_xla_cpu_global_jit`:
+
+```
+$ TF_XLA_FLAGS="--tf_xla_auto_jit=2 --tf_xla_cpu_global_jit" path/to/your/program
+```
+
+For a detailed usage example, see the
+[auto-clustering tutorial colab](./tutorials/autoclustering_xla.ipynb).
+
+### Use `xla.compile`
+
+The `xla.compile` API offers more fine-grained control for choosing which
+functions should be compiled with XLA. However, it requires restructuring source
+code, as not all TensorFlow operations can be represented in XLA. That is, when
+using `xla.compile` you pass it the functions which should be compiled using
+XLA; a failure to compile results in an exception.
+
+See the [`xla.compile` tutorial colab](./tutorials/xla_compile.ipynb) for usage
+examples.
+
+### AOT (Ahead-of-time) compilation for CPU with `tfcompile`
+
+You can also use a standalone [`tfcompile`](./tfcompile) tool,
+which converts a TensorFlow graph into executable code (for CPU only).
+
+## Inspect compiled programs
+
+XLA provides introspection facilities which let you inspect the generated
+programs.
To dump the generated programs, use the environment variable +`XLA_FLAGS`: + +``` +$ XLA_FLAGS="--dump_hlo_as_text --xla_dump_to=/tmp/generated" +TF_XLA_FLAGS="--tf_xla_auto_jit=2" my/tensorflow/program +``` + +After the dumping is performed, you can find the following files in +`/tmp/generated`: + +- `module_XXXX.*_optimizations.txt` Generated + [XLA programs](./operation_semantics.md), one per each compiled cluster. + Attaching those when submitting XLA bug reports is extremely helpful! + +- `module_XXXX.ir-*.ll` Generated files in + [LLVM](https://llvm.org/docs/LangRef.html) intermediate representation, with + [NVPTX](https://llvm.org/docs/NVPTXUsage.html) intrinsics. + +- `module_XXXX.ptx` Generated + [PTX](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html) + files. + +You can also dump the graph visualizing the embedding of XLA clusters inside of +the TensorFlow graph with: + +``` +$ TF_DUMP_GRAPH_PREFIX=/tmp/generated TF_XLA_FLAGS="--tf_xla_clustering_debug" +``` + +## Supported platforms + +Auto-clustering is supported on NVIDIA GPUs, and ahead-of-time compilation is +supported on x86-64 CPUs. Auto-clustering support on multi-GPU environments and +on a CPU is experimental. + +## Generating great bug reports + +A bug report is much easier to reproduce if it includes dumps for the generated +XLA programs and the used auto-clustering embedding. +To generate them for a TensorFlow program running with auto-clustering, launch: + +``` +$ TF_DUMP_GRAPH_PREFIX=/tmp/generated TF_XLA_FLAGS="--tf_xla_clustering_debug +--tf_xla_auto_jit=2" XLA_FLAGS="--dump_hlo_as_text --xla_dump_to=/tmp/generated" +my/tensorflow/program" +``` + +When filing bugs, attach the contents of the `/tmp/generated` directory +(referenced above). + +If possible, try to isolate +a bug to a single XLA program by using the +[`replay_computation`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/compiler/xla/tools/replay_computation.cc) +and iteratively running it on generated programs. + +## Further reading + +- [XLA Architecture](./architecture.md): Overview of the XLA architecture +- [XLA - TensorFlow, Compiled](https://developers.googleblog.com/2017/03/xla-tensorflow-compiled.html): + Read on Google Developers Blog +- Check out the + [XLA source](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/compiler/xla) + on Github! + + diff --git a/tensorflow/compiler/xla/g3doc/jit.md b/tensorflow/compiler/xla/g3doc/jit.md deleted file mode 100644 index d7ce5ee1ba6..00000000000 --- a/tensorflow/compiler/xla/g3doc/jit.md +++ /dev/null @@ -1,163 +0,0 @@ -# Using JIT Compilation - -> Note: TensorFlow must be compiled from source to include XLA. - -## Why use just-in-time (JIT) compilation? - -The TensorFlow/XLA JIT compiler compiles and runs parts of TensorFlow graphs via -XLA. The benefit of this over the standard TensorFlow implementation is that XLA -can fuse multiple operators (kernel fusion) into a small number of compiled -kernels. Fusing operators can reduce memory bandwidth requirements and improve -performance compared to executing operators one-at-a-time, as the TensorFlow -executor does. - -## Running TensorFlow graphs via XLA - -There are two ways to run TensorFlow computations via XLA, either by -JIT-compiling operators placed on a CPU or GPU device, or by placing operators -on the `XLA_CPU` or `XLA_GPU` TensorFlow devices. Placing operators directly on -a TensorFlow XLA device forces the operator to run on that device and is mainly -used for testing. 
- -> Note: The XLA CPU backend supports intra-op parallelism (i.e. it can shard a -> single operation across multiple cores) but it does not support inter-op -> parallelism (i.e. it cannot execute independent operations concurrently across -> multiple cores). The XLA GPU backend is competitive with the standard -> TensorFlow implementation, sometimes faster, sometimes slower. - -### Turning on JIT compilation - -JIT compilation can be turned on at the session level or manually for select -operations. Both of these approaches are zero-copy --- data does not need to be -copied when passing data between a compiled XLA kernel and a TensorFlow operator -placed on the same device. - -#### Session - -Turning on JIT compilation at the session level will result in all possible -operators being greedily compiled into XLA computations. Each XLA computation -will be compiled into one or more kernels for the underlying device. - -Subject to a few constraints, if there are two adjacent operators in the graph -that both have XLA implementations, then they will be compiled into a single XLA -computation. - -JIT compilation is turned on at the session level by setting the -`global_jit_level` config to `tf.OptimizerOptions.ON_1` and passing the config -during session initialization. - -```python -# Config to turn on JIT compilation -config = tf.ConfigProto() -config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1 - -sess = tf.Session(config=config) -``` - -> Note: Turning on JIT at the session level will not result in operations being -> compiled for the CPU. JIT compilation for CPU operations must be done via -> the manual method documented below. - -#### Manual with experimental_jit_scope() - -JIT compilation can also be turned on manually for one or more operators. This -is done by tagging the operators to compile with the attribute -`_XlaCompile=true`. The simplest way to do this is via the -`tf.contrib.compiler.jit.experimental_jit_scope()` scope defined in -[`tensorflow/contrib/compiler/jit.py`](https://www.tensorflow.org/code/tensorflow/contrib/compiler/jit.py). -Example usage: - -```python - jit_scope = tf.contrib.compiler.jit.experimental_jit_scope - - x = tf.placeholder(np.float32) - with jit_scope(): - y = tf.add(x, x) # The "add" will be compiled with XLA. -``` - -The `_XlaCompile` attribute is currently supported on a best-effort basis. If an -operator cannot be compiled, TensorFlow will silently fall back to the normal -implementation. - -#### Manual with xla.compile() - -Unlike experimental_jit_scope() which silently falls back to normal Tensorflow -on uncompilable operator, xla.compile() returns an explicit error. This is -useful if you want more predictable behaviors from XLA compilation. - -Please see -[xla.compile() tutorial Colab](./tutorials/xla_compile.ipynb) -for how to use it. - -### Placing operators on XLA devices - -Another way to run computations via XLA is to place an operator on a specific -XLA device. This method is normally only used for testing. Valid targets are -`XLA_CPU` or `XLA_GPU`. - -```python -with tf.device("/job:localhost/replica:0/task:0/device:XLA_GPU:0"): - output = tf.add(input1, input2) -``` - -Unlike JIT compilation on the standard CPU and GPU devices, these devices make a -copy of data when it is transferred on and off the device. The extra copy makes -it expensive to mix XLA and TensorFlow operators in the same graph. - -## Tutorial - -This tutorial covers training a simple version of MNIST softmax with JIT turned -on. 
Currently JIT at the session level, which is what is used for the tutorial, -only supports GPU. - -Before starting the tutorial verify that the LD_LIBRARY environment variable or -ldconfig contains `$CUDA_ROOT/extras/CUPTI/lib64`, which contains libraries for -the CUDA Profiling Tools Interface -[(CUPTI)](http://docs.nvidia.com/cuda/cupti/index.html). TensorFlow uses CUPTI -to pull tracing information from the GPU. - -### Step #1: Prepare sample script - -Download or move -[mnist_softmax_xla.py](https://www.tensorflow.org/code/tensorflow/examples/tutorials/mnist/mnist_softmax_xla.py) -into a folder outside of the TensorFlow source tree. - -### Step #2: Run without XLA - -Execute the python script to train the model without XLA. - -```shell -python mnist_softmax_xla.py --xla='' -``` - -Using the Chrome Trace Event Profiler (browse to chrome://tracing), -open the timeline file created when the script finishes: `timeline.ctf.json`. -The rendered timeline should look similar to the picture below with multiple -green boxes labeled `MatMul`, possibly across multiple CPUs. -
- -### Step #3 Run with XLA - -Execute the python script to train the model with XLA and turn on a debugging -feature of XLA via an environmental variable that outputs the XLA graph. - -```shell -XLA_FLAGS="--xla_hlo_profile --xla_dump_to=/tmp/foo --xla_dump_hlo_as_text" -python mnist_softmax_xla.py -``` - -Open the timeline file created (`timeline.ctf.json`). The rendered timeline -should look similar to the picture below with one long bar labeled `XlaLaunch`. -
- -To understand what is happening in `XlaLaunch`, look at the console output. Each -XLA cluster that's launched will have a corresponding profile (from -`--xla_hlo_profile`) showing how long each HLO took to run. - -`/tmp/foo` will contain the HLO before and after optimizations for each HLO -module that's run. You can read this as-is, or you can visualize it using -`tensorflow/compiler/xla/tools:interactive_graphviz`. diff --git a/tensorflow/compiler/xla/g3doc/operation_semantics.md b/tensorflow/compiler/xla/g3doc/operation_semantics.md index d6c99580c39..1f2790e98bb 100644 --- a/tensorflow/compiler/xla/g3doc/operation_semantics.md +++ b/tensorflow/compiler/xla/g3doc/operation_semantics.md @@ -1379,13 +1379,16 @@ For a more intuitive description, see the "Informal Description" section below. : : : map indices in : : : : `start_indices` to legal : : : : indices into operand. : +| `indices_are_sorted` | `bool` | Whether the indices are | +: : : guaranteed to be sorted by : +: : : the caller. : For convenience, we label dimensions in the output array not in `offset_dims` as `batch_dims`. The output is an array of rank `batch_dims.size` + `offset_dims.size`. -The `operand.rank` must equal the sume of `offset_dims.size` and +The `operand.rank` must equal the sum of `offset_dims.size` and `collapsed_slice_dims`. Also, `slice_sizes.size` has to be equal to `operand.rank`. @@ -1443,6 +1446,10 @@ and range [`0`, `operand.rank`) \ `collapsed_slice_dims`. So if, e.g., `offset.size` is `4`, `operand.rank` is `6` and `collapsed_slice_dims` is {`0`, `2`} then `remapped_offset_dims` is {`0`→`1`, `1`→`3`, `2`→`4`, `3`→`5`}. +If `indices_are_sorted` is set to true then XLA can assume that `start_indices` +are sorted (in ascending `start_index_map` order) by the user. If they are not +then the semantics is implementation defined. + ### Informal Description and Examples Informally, every index `Out` in the output array corresponds to an element `E` @@ -1980,8 +1987,12 @@ window_strides, padding)` | `window_dilations` | `ArraySlice` | array of integers for window | : : : dilation values : | `padding` | `Padding` | padding type for window | -: : : (Padding\:\:kSame or : -: : : Padding\:\:kValid) : +: : : (Padding\:\:kSame, which pads so : +: : : as to have the same output shape : +: : : as input if the stride is 1, or : +: : : Padding\:\:kValid, which uses no : +: : : no padding and "stops" the : +: : : window once it no longer fits) : Below code and figure shows an example of using `ReduceWindow`. Input is a matrix of size [4x6] and both window_dimensions and window_stride_dimensions are @@ -2027,6 +2038,17 @@ padding. +For a non-trivial padding example, consider computing reduce-window minimum +(initial value is `MAX_FLOAT`) with dimension `3` and stride `2` over the input +array `[10000, 1000, 100, 10, 1]`. Padding `kValid` computes minimums over two +_valid_ windows: `[10000, 1000, 100]` and `[100, 10, 1]`, resulting in the +output `[100, 1]`. Padding `kSame` first pads the array so that the shape after +the reduce-window would be the _same_ as input for stride one by adding initial +elements on both sides, getting `[MAX_VALUE, 10000, 1000, 100, 10, 1, +MAX_VALUE]`. Running reduce-window over the padded array operates on three +windows `[MAX_VALUE, 10000, 1000]`, `[1000, 100, 10]`, `[10, 1, MAX_VALUE]`, and +yields `[1000, 10, 1]`. + The evaluation order of the reduction function is arbitrary and may be non-deterministic. Therefore, the reduction function should not be overly sensitive to reassociation. 
See the discussion about associativity in the @@ -2213,6 +2235,7 @@ Arguments | Type | Semantics `update_window_dims` | `ArraySlice` | The set of dimensions in `updates` shape that are _window dimensions_. `inserted_window_dims` | `ArraySlice` | The set of _window dimensions_ that must be inserted into `updates` shape. `scatter_dims_to_operand_dims` | `ArraySlice` | A dimensions map from the scatter indices to the operand index space. This array is interpreted as mapping `i` to `scatter_dims_to_operand_dims[i]` . It has to be one-to-one and total. +`indices_are_sorted` | `bool` | Whether the indices are guaranteed to be sorted by the caller. If `index_vector_dim` is equal to `scatter_indices.rank` we implicitly consider `scatter_indices` to have a trailing `1` dimension. @@ -2299,6 +2322,10 @@ always be the current value from the `output` array and the second parameter will always be the value from the `updates` array. This is important specifically for cases when the `update_computation` is _not commutative_. +If `indices_are_sorted` is set to true then XLA can assume that `start_indices` +are sorted (in ascending `start_index_map` order) by the user. If they are not +then the semantics is implementation defined. + Informally, the scatter op can be viewed as an _inverse_ of the gather op, i.e. the scatter op updates the elements in the input that are extracted by the corresponding gather op. @@ -2517,6 +2544,11 @@ arguments to the slice operation. : : : respective `start_indices` value for : : : : the dimension and less than or equal : : : : to the size of the dimension. : +| `strides` | `ArraySlice` | List of N integers that decides the | +: : : input stride of the slice. The slice : +: : : picks every `strides[d]` element in : +: : : dimension `d`. : + 1-dimensional example: diff --git a/tensorflow/compiler/xla/g3doc/tfcompile.md b/tensorflow/compiler/xla/g3doc/tfcompile.md index 5ee09fd302b..c80e2745341 100644 --- a/tensorflow/compiler/xla/g3doc/tfcompile.md +++ b/tensorflow/compiler/xla/g3doc/tfcompile.md @@ -16,9 +16,7 @@ kernels that are actually used in the computation. The compiler is built on top of the XLA framework. The code bridging TensorFlow to the XLA framework resides under -[tensorflow/compiler](https://www.tensorflow.org/code/tensorflow/compiler/), -which also includes support for [just-in-time (JIT) compilation](jit.md) of -TensorFlow graphs. +[tensorflow/compiler](https://www.tensorflow.org/code/tensorflow/compiler/). ## What does tfcompile do? diff --git a/tensorflow/compiler/xla/g3doc/tutorials/autoclustering_xla.ipynb b/tensorflow/compiler/xla/g3doc/tutorials/autoclustering_xla.ipynb new file mode 100644 index 00000000000..78f1bca1478 --- /dev/null +++ b/tensorflow/compiler/xla/g3doc/tutorials/autoclustering_xla.ipynb @@ -0,0 +1,222 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "CIFT with XLA.ipynb", + "version": "0.3.2", + "provenance": [], + "collapsed_sections": [], + "toc_visible": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "accelerator": "GPU" + }, + "cells": [ + { + "metadata": { + "colab_type": "text", + "id": "b7noD9NjFRL-" + }, + "cell_type": "markdown", + "source": [ + "\n", + " \n", + " \n", + " \n", + "
\n", + " View on TensorFlow.org\n", + " \n", + " Run in Google Colab\n", + " \n", + " View source on GitHub\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "mz65veHXsmnS" + }, + "source": [ + "# Classifying CIFAR-10 with XLA\n", + "\n", + "In this colab we train a TensorFlow model to classify the [CIFAR-10](https://en.wikipedia.org/wiki/CIFAR-10) dataset, and we compile it using XLA.\n", + "\n", + "We start by loading and normalizing the dataset using the Keras API:" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "7vm2QsMisCxI" + }, + "outputs": [], + "source": [ + "import tensorflow as tf\n", + "\n", + "# Check that GPU is available: cf. https://colab.research.google.com/notebooks/gpu.ipynb\n", + "assert(tf.test.is_gpu_available())\n", + "\n", + "tf.keras.backend.clear_session()\n", + "tf.config.optimizer.set_jit(False) # Start with XLA disabled.\n", + "\n", + "def load_data():\n", + " (x_train, y_train), (x_test, y_test) = tf.keras.datasets.cifar10.load_data()\n", + " x_train = x_train.astype('float32') / 256\n", + " x_test = x_test.astype('float32') / 256\n", + "\n", + " # Convert class vectors to binary class matrices.\n", + " y_train = tf.keras.utils.to_categorical(y_train, num_classes=10)\n", + " y_test = tf.keras.utils.to_categorical(y_test, num_classes=10)\n", + " return ((x_train, y_train), (x_test, y_test))\n", + "\n", + "(x_train, y_train), (x_test, y_test) = load_data()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "MgNM2tbgtScx" + }, + "source": [ + "We define the model, adapted from the Keras [CIFAR-10 example](https://keras.io/examples/cifar10_cnn/):" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "3ZRQSwoRsKM_" + }, + "outputs": [], + "source": [ + "def generate_model():\n", + " return tf.keras.models.Sequential([\n", + " tf.keras.layers.Conv2D(32, (3, 3), padding='same', input_shape=x_train.shape[1:]),\n", + " tf.keras.layers.Activation('relu'),\n", + " tf.keras.layers.Conv2D(32, (3, 3)),\n", + " tf.keras.layers.Activation('relu'),\n", + " tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),\n", + " tf.keras.layers.Dropout(0.25),\n", + "\n", + " tf.keras.layers.Conv2D(64, (3, 3), padding='same'),\n", + " tf.keras.layers.Activation('relu'),\n", + " tf.keras.layers.Conv2D(64, (3, 3)),\n", + " tf.keras.layers.Activation('relu'),\n", + " tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),\n", + " tf.keras.layers.Dropout(0.25),\n", + "\n", + " tf.keras.layers.Flatten(),\n", + " tf.keras.layers.Dense(512),\n", + " tf.keras.layers.Activation('relu'),\n", + " tf.keras.layers.Dropout(0.5),\n", + " tf.keras.layers.Dense(10),\n", + " tf.keras.layers.Activation('softmax')\n", + " ])\n", + "\n", + "model = generate_model()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "-M4GtGDZtb8a" + }, + "source": [ + "We train the model using the\n", + "[RMSprop](https://www.tensorflow.org/api_docs/python/tf/train/RMSPropOptimizer)\n", + "optimizer:\n" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "UKCmrhF0tiMa" + }, + "outputs": [], + "source": [ + "def compile_model(model):\n", + " opt = tf.keras.optimizers.RMSprop(lr=0.0001, decay=1e-6)\n", + " model.compile(loss='categorical_crossentropy',\n", + " optimizer=opt,\n", + " metrics=['accuracy'])\n", + " return model\n", + "\n", + "model = compile_model(model)\n", + "\n", + "def train_model(model, x_train, y_train, 
x_test, y_test, epochs=25):\n", + " model.fit(x_train, y_train, batch_size=256, epochs=epochs, validation_data=(x_test, y_test), shuffle=True)\n", + "\n", + "def warmup(model, x_train, y_train, x_test, y_test):\n", + " # Warm up the JIT, we do not wish to measure the compilation time.\n", + " initial_weights = model.get_weights()\n", + " train_model(model, x_train, y_train, x_test, y_test, epochs=1)\n", + " model.set_weights(initial_weights)\n", + "\n", + "warmup(model, x_train, y_train, x_test, y_test)\n", + "%time train_model(model, x_train, y_train, x_test, y_test)\n", + "\n", + "scores = model.evaluate(x_test, y_test, verbose=1)\n", + "print('Test loss:', scores[0])\n", + "print('Test accuracy:', scores[1])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "SLpfQ0StRgsu" + }, + "source": [ + "Now let's train the model again, using the XLA compiler.\n", + "To enable the compiler in the middle of the application, we need to reset the Keras session." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "jxU-Tzy4SX7p" + }, + "outputs": [], + "source": [ + "tf.keras.backend.clear_session() # We need to clear the session to enable JIT in the middle of the program.\n", + "tf.config.optimizer.set_jit(True) # Enable XLA.\n", + "model = compile_model(generate_model())\n", + "(x_train, y_train), (x_test, y_test) = load_data()\n", + "\n", + "warmup(model, x_train, y_train, x_test, y_test)\n", + "%time train_model(model, x_train, y_train, x_test, y_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "iWHz6P1se92F" + }, + "source": [ + "On a machine with a Titan V GPU and an Intel Xeon E5-2690 CPU the speed up is ~1.17x." + ] + } + ], + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/tensorflow/compiler/xla/g3doc/tutorials/xla_compile.ipynb b/tensorflow/compiler/xla/g3doc/tutorials/xla_compile.ipynb index 2a83092805b..38abda8974f 100644 --- a/tensorflow/compiler/xla/g3doc/tutorials/xla_compile.ipynb +++ b/tensorflow/compiler/xla/g3doc/tutorials/xla_compile.ipynb @@ -370,4 +370,4 @@ "outputs": [] } ] -} \ No newline at end of file +} diff --git a/tensorflow/compiler/xla/layout.h b/tensorflow/compiler/xla/layout.h index f216bd63d77..4f309cd9f70 100644 --- a/tensorflow/compiler/xla/layout.h +++ b/tensorflow/compiler/xla/layout.h @@ -28,7 +28,7 @@ limitations under the License. namespace xla { // Describes a tile used in tiling-based layout. Refer to -// g3doc/third_party/tensorflow/compiler/xla/g3doc/layout_with_tiling.md for +// g3doc/third_party/tensorflow/compiler/xla/g3doc/tiled_layout.md for // details. 
class Tile { public: @@ -136,6 +136,7 @@ class Layout { Equal& MinorToMajorOnly() { ignore_tiles_ = true; ignore_element_size_ = true; + ignore_memory_space_ = true; return *this; } diff --git a/tensorflow/compiler/xla/literal.cc b/tensorflow/compiler/xla/literal.cc index 63d9a1e9067..03b47ba7089 100644 --- a/tensorflow/compiler/xla/literal.cc +++ b/tensorflow/compiler/xla/literal.cc @@ -891,7 +891,7 @@ string LiteralBase::GetSparseElementAsString( } } -StatusOr LiteralBase::GetIntegralAsS64( +absl::optional LiteralBase::GetIntegralAsS64( absl::Span multi_index) const { CHECK(LayoutUtil::IsDenseArray(shape())); switch (shape().element_type()) { @@ -908,12 +908,11 @@ StatusOr LiteralBase::GetIntegralAsS64( case U64: return Get(multi_index); default: - return FailedPrecondition("Array element type is not integral: %s", - PrimitiveType_Name(shape().element_type())); + return absl::nullopt; } } -StatusOr LiteralBase::GetAsDouble( +absl::optional LiteralBase::GetAsDouble( absl::Span multi_index) const { CHECK(LayoutUtil::IsDenseArray(shape())); switch (shape().element_type()) { @@ -926,8 +925,27 @@ StatusOr LiteralBase::GetAsDouble( case BF16: return static_cast(Get(multi_index)); default: - return FailedPrecondition("Array element type is not floating: %s", - PrimitiveType_Name(shape().element_type())); + return absl::nullopt; + } +} + +absl::optional LiteralBase::GetAsComplex128( + absl::Span multi_index) const { + switch (shape().element_type()) { + case BF16: + return {{static_cast(Get(multi_index)), 0}}; + case F16: + return {{static_cast(Get(multi_index)), 0}}; + case F32: + return {{Get(multi_index), 0}}; + case F64: + return {{Get(multi_index), 0}}; + case C64: + return {Get(multi_index)}; + case C128: + return {Get(multi_index)}; + default: + return absl::nullopt; } } diff --git a/tensorflow/compiler/xla/literal.h b/tensorflow/compiler/xla/literal.h index ffd5a883240..af15cab4a94 100644 --- a/tensorflow/compiler/xla/literal.h +++ b/tensorflow/compiler/xla/literal.h @@ -130,13 +130,47 @@ class LiteralBase { // value into text. string GetSparseElementAsString(int64 sparse_element_number, const ShapeIndex& shape_index = {}) const; + + // Return whether the value at the specified index is equal to the provided + // generic `value` (T must be an arithmetic type). + // + // Precondition: must be an array. + template + typename std::enable_if<(std::is_arithmetic::value || + std::is_same::value || + std::is_same::value), + bool>::type + IsEqualAt(absl::Span multi_index, T value) const { + if (auto as_s64 = GetIntegralAsS64(multi_index)) { + return *as_s64 == value; + } + complex128 as_complex128 = *GetAsComplex128(multi_index); + return as_complex128.imag() == 0 && as_complex128.real() == value; + } + + bool IsEqualAt(absl::Span multi_index, complex128 value) const { + if (auto as_s64 = GetIntegralAsS64(multi_index)) { + return *as_s64 == value.real() && value.imag() == 0; + } + auto as_complex128 = GetAsComplex128(multi_index); + return *as_complex128 == value; + } + // As Get(), but determines the correct type and converts the value into // int64. This literal must be an array. - StatusOr GetIntegralAsS64(absl::Span multi_index) const; + absl::optional GetIntegralAsS64( + absl::Span multi_index) const; // As Get(), but determines the correct type, and converts the value into // double. This literal must be an array. 
- StatusOr GetAsDouble(absl::Span multi_index) const; + absl::optional GetAsDouble(absl::Span multi_index) const; + + // As Get(), but determines the correct type, and converts the value into + // complex128. All floating point types can be converted into complex128. + // + // This literal must be an array. + absl::optional GetAsComplex128( + absl::Span multi_index) const; // Returns the multi-index of the element in a sparse literal at the given // sparse element number. The sparse element number is the position with in diff --git a/tensorflow/compiler/xla/literal_test.cc b/tensorflow/compiler/xla/literal_test.cc index 8d46d30b4cf..885d18db673 100644 --- a/tensorflow/compiler/xla/literal_test.cc +++ b/tensorflow/compiler/xla/literal_test.cc @@ -2021,5 +2021,46 @@ TEST_F(LiteralUtilTest, BroadcastScalarToMatrix) { LiteralUtil::CreateR2({{9, 9}, {9, 9}})); } +TEST_F(LiteralUtilTest, GetAsComplex128) { + complex128 value = {1, 0}; + Literal c1 = LiteralUtil::CreateR0(value); + EXPECT_EQ(*c1.GetAsComplex128({}), value); + Literal c2 = LiteralUtil::CreateR0(1); + EXPECT_EQ(*c2.GetAsComplex128({}), value); + complex64 float_value = {1, 0}; + Literal c4 = LiteralUtil::CreateR0(float_value); + EXPECT_EQ(*c4.GetAsComplex128({}), value); + complex128 other_value = {1, 2}; + Literal c5 = LiteralUtil::CreateR0(other_value); + EXPECT_EQ(*c5.GetAsComplex128({}), other_value); + Literal c6 = LiteralUtil::CreateR0(1); + EXPECT_FALSE(c6.GetAsComplex128({}).has_value()); +} + +TEST_F(LiteralUtilTest, IsEqualAt) { + double val_double = 10.0; + int val_integral = 10; + Literal c1 = LiteralUtil::CreateR0(10); + EXPECT_TRUE(c1.IsEqualAt({}, val_double)); + EXPECT_TRUE(c1.IsEqualAt({}, val_integral)); + Literal c2 = LiteralUtil::CreateR0(10); + EXPECT_TRUE(c2.IsEqualAt({}, val_double)); + EXPECT_TRUE(c2.IsEqualAt({}, val_integral)); + complex128 val_complex = {10, 0}; + EXPECT_TRUE(c2.IsEqualAt({}, val_complex)); + EXPECT_TRUE(c1.IsEqualAt({}, val_complex)); + Literal c3 = LiteralUtil::CreateR0(val_complex); + EXPECT_TRUE(c3.IsEqualAt({}, val_double)); + EXPECT_TRUE(c3.IsEqualAt({}, val_integral)); + EXPECT_TRUE(c3.IsEqualAt({}, val_complex)); + double val_inf = 1. 
/ 0; + EXPECT_FALSE(c3.IsEqualAt({}, val_inf)); + complex128 val_true_complex = {10, 3}; + complex64 val_smaller_complex = {10, 3}; + Literal c4 = LiteralUtil::CreateR0(val_true_complex); + EXPECT_TRUE(c4.IsEqualAt({}, val_true_complex)); + EXPECT_TRUE(c4.IsEqualAt({}, val_smaller_complex)); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/literal_util.cc b/tensorflow/compiler/xla/literal_util.cc index 95186b94511..70dc386eb14 100644 --- a/tensorflow/compiler/xla/literal_util.cc +++ b/tensorflow/compiler/xla/literal_util.cc @@ -147,12 +147,16 @@ Literal ConvertType(LiteralSlice literal) { switch (primitive_type) { case U8: return LiteralUtil::CreateR0(1); + case U16: + return LiteralUtil::CreateR0(1); case U32: return LiteralUtil::CreateR0(1); case U64: return LiteralUtil::CreateR0(1); case S8: return LiteralUtil::CreateR0(1); + case S16: + return LiteralUtil::CreateR0(1); case S32: return LiteralUtil::CreateR0(1); case S64: @@ -171,9 +175,6 @@ Literal ConvertType(LiteralSlice literal) { return LiteralUtil::CreateR0(1); case PRED: return LiteralUtil::CreateR0(true); - case S16: - case U16: - LOG(FATAL) << "u16/s16 literals not yet implemented"; case TUPLE: LOG(FATAL) << "tuple element type cannot take on value of 1"; case OPAQUE_TYPE: @@ -187,12 +188,16 @@ Literal ConvertType(LiteralSlice literal) { switch (primitive_type) { case U8: return LiteralUtil::CreateR0(std::numeric_limits::min()); + case U16: + return LiteralUtil::CreateR0(std::numeric_limits::min()); case U32: return LiteralUtil::CreateR0(std::numeric_limits::min()); case U64: return LiteralUtil::CreateR0(std::numeric_limits::min()); case S8: return LiteralUtil::CreateR0(std::numeric_limits::min()); + case S16: + return LiteralUtil::CreateR0(std::numeric_limits::min()); case S32: return LiteralUtil::CreateR0(std::numeric_limits::min()); case S64: @@ -209,9 +214,6 @@ Literal ConvertType(LiteralSlice literal) { LOG(FATAL) << "C128 element type has no minimum value"; case PRED: return LiteralUtil::CreateR0(false); - case S16: - case U16: - LOG(FATAL) << "u16/s16 literals not yet implemented"; case F16: return LiteralUtil::CreateR0( static_cast(-std::numeric_limits::infinity())); @@ -231,12 +233,16 @@ Literal ConvertType(LiteralSlice literal) { switch (primitive_type) { case U8: return LiteralUtil::CreateR0(std::numeric_limits::max()); + case U16: + return LiteralUtil::CreateR0(std::numeric_limits::max()); case U32: return LiteralUtil::CreateR0(std::numeric_limits::max()); case U64: return LiteralUtil::CreateR0(std::numeric_limits::max()); case S8: return LiteralUtil::CreateR0(std::numeric_limits::max()); + case S16: + return LiteralUtil::CreateR0(std::numeric_limits::max()); case S32: return LiteralUtil::CreateR0(std::numeric_limits::max()); case S64: @@ -249,9 +255,6 @@ Literal ConvertType(LiteralSlice literal) { std::numeric_limits::infinity()); case PRED: return LiteralUtil::CreateR0(true); - case S16: - case U16: - LOG(FATAL) << "u16/s16 literals not yet implemented"; case F16: return LiteralUtil::CreateR0( static_cast(std::numeric_limits::infinity())); diff --git a/tensorflow/compiler/xla/literal_util.h b/tensorflow/compiler/xla/literal_util.h index c50c0baf007..2f12db73330 100644 --- a/tensorflow/compiler/xla/literal_util.h +++ b/tensorflow/compiler/xla/literal_util.h @@ -226,8 +226,7 @@ class LiteralUtil { // in invocation between the above signature and this one. 
static Literal MakeTupleOwned(std::vector elements); - // This overload lets you pass a braced list of Literals to - // MakeTupleOwned: + // This overload lets you pass a list of Literals to MakeTupleOwned: // // LiteralUtil::MakeTupleOwned(LiteralUtil::CreateR1(...), ...). // diff --git a/tensorflow/compiler/xla/primitive_util.h b/tensorflow/compiler/xla/primitive_util.h index 295d3530032..034c14e8930 100644 --- a/tensorflow/compiler/xla/primitive_util.h +++ b/tensorflow/compiler/xla/primitive_util.h @@ -45,7 +45,7 @@ const int kBFloat16MantissaBits = 7; template PrimitiveType NativeToPrimitiveType() { // Make the expression depend on the template parameter NativeT so - // that this compile-time error only apperas if this function is + // that this compile-time error only appears if this function is // instantiated with some concrete type that is not specialized // below. static_assert(!std::is_same::value, diff --git a/tensorflow/compiler/xla/protobuf_util.cc b/tensorflow/compiler/xla/protobuf_util.cc index e476015f94f..b7c30531923 100644 --- a/tensorflow/compiler/xla/protobuf_util.cc +++ b/tensorflow/compiler/xla/protobuf_util.cc @@ -39,12 +39,17 @@ bool ProtobufEquals(const tensorflow::protobuf::Message& m1, } Status DumpProtoToDirectory(const tensorflow::protobuf::Message& message, - const string& directory, const string& file_name) { + const string& directory, const string& file_name, + string* full_path) { tensorflow::Env* env = tensorflow::Env::Default(); TF_RETURN_IF_ERROR(env->RecursivelyCreateDir(directory)); string safe_file_name = SanitizeFileName(file_name) + ".pb"; - const string path = tensorflow::io::JoinPath(directory, safe_file_name); - return tensorflow::WriteBinaryProto(env, path, message); + string full_path_impl; + if (!full_path) { + full_path = &full_path_impl; + } + *full_path = tensorflow::io::JoinPath(directory, safe_file_name); + return tensorflow::WriteBinaryProto(env, *full_path, message); } } // namespace protobuf_util diff --git a/tensorflow/compiler/xla/protobuf_util.h b/tensorflow/compiler/xla/protobuf_util.h index e20a7e95a63..7db020982b9 100644 --- a/tensorflow/compiler/xla/protobuf_util.h +++ b/tensorflow/compiler/xla/protobuf_util.h @@ -37,8 +37,12 @@ extern bool ProtobufEquals(const tensorflow::protobuf::Message& m1, // 'directory/file_name.pb'. The 'directory' is recursively created if it // doesn't already exist, and the 'file_name' is sanitized by replacing // illegal characters with underscore '_'. +// +// If 'full_name' is not null then it is set to the name of the file the +// protobuf was written to. Status DumpProtoToDirectory(const tensorflow::protobuf::Message& message, - const string& directory, const string& file_name); + const string& directory, const string& file_name, + string* full_path = nullptr); // Registers a function that may either expand a dirpath or forward the original // dirpath along as-is. 
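As a usage note for the new optional `full_path` out-parameter on `DumpProtoToDirectory` above, here is a hedged caller sketch; the wrapper function, directory, and file name are assumptions for illustration:

// Hypothetical caller of DumpProtoToDirectory's new `full_path` out-parameter.
// The wrapper name, directory, and file name are illustrative assumptions.
#include <string>

#include "tensorflow/compiler/xla/protobuf_util.h"
#include "tensorflow/core/platform/logging.h"

xla::Status DumpModuleProtoForDebug(
    const tensorflow::protobuf::Message& module_proto) {
  std::string full_path;
  xla::Status status = xla::protobuf_util::DumpProtoToDirectory(
      module_proto, /*directory=*/"/tmp/xla_dumps",
      /*file_name=*/"module_0001", &full_path);
  if (status.ok()) {
    // `full_path` is the sanitized path actually written, e.g.
    // "/tmp/xla_dumps/module_0001.pb".
    LOG(INFO) << "Dumped proto to " << full_path;
  }
  return status;
}

Passing nullptr (the default) preserves the old behavior; callers that previously re-derived the sanitized file name can instead read it back from `full_path`.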
diff --git a/tensorflow/compiler/xla/python/BUILD b/tensorflow/compiler/xla/python/BUILD index a6a1bd1830e..4377dabaa9d 100644 --- a/tensorflow/compiler/xla/python/BUILD +++ b/tensorflow/compiler/xla/python/BUILD @@ -1,7 +1,7 @@ -load("//tensorflow/core:platform/default/build_config.bzl", "pyx_library") -load("//tensorflow/compiler/xla:xla.bzl", "xla_python_default_plugins") -load("//tensorflow:tensorflow.bzl", "tf_pybind_extension") -load("//tensorflow:tensorflow.bzl", "tf_cc_test") +load("//tensorflow/core/platform:default/build_config.bzl", "pyx_library") +load("//tensorflow/compiler/xla:xla.bzl", "xla_py_test_deps", "xla_python_default_plugins") +load("//tensorflow:tensorflow.bzl", "pybind_extension") +load("//tensorflow:tensorflow.bzl", "py_test", "tf_cc_test") package( default_visibility = ["//tensorflow:internal"], @@ -29,15 +29,14 @@ py_test( name = "xla_client_test", srcs = ["xla_client_test.py"], main = "xla_client_test.py", - python_version = "PY2", srcs_version = "PY2AND3", - tags = ["no_oss"], + tags = ["no_oss"], # TODO(phawkins): This test passes, but requires --config=monolithic. deps = [ ":custom_call_for_test", ":xla_client", - "//tensorflow/compiler/xla:xla_data_proto_py", - "//tensorflow/python:platform_test", - ], + ":xla_extension", + "@absl_py//absl/testing:absltest", + ] + xla_py_test_deps(), ) cc_library( @@ -69,7 +68,6 @@ cc_library( "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/core:lib", - "//tensorflow/stream_executor:device_memory_allocator", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/types:optional", "@pybind11", @@ -171,9 +169,9 @@ tf_cc_test( ) cc_library( - name = "device", - srcs = ["device.cc"], - hdrs = ["device.h"], + name = "device_state", + srcs = ["device_state.cc"], + hdrs = ["device_state.h"], deps = [ ":event_pool", ":semaphore", @@ -189,24 +187,11 @@ cc_library( cc_library( name = "local_client", - srcs = [ - "local_client.cc", - "python_ref_manager.cc", - "python_ref_manager.h", - ], - hdrs = [ - "local_client.h", - ], - copts = [ - "-fexceptions", - "-fno-strict-aliasing", - "-Wno-c++98-c++11-compat", - ], - features = ["-use_header_modules"], + srcs = ["local_client.cc"], + hdrs = ["local_client.h"], deps = [ - ":device", + ":device_state", ":shared_device_buffer", - ":types", "//tensorflow/compiler/xla:executable_run_options", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:literal_util", @@ -222,22 +207,39 @@ cc_library( "//tensorflow/compiler/xla/service:computation_placer", "//tensorflow/compiler/xla/service:platform_util", "//tensorflow/compiler/xla/service:shaped_buffer", + "//tensorflow/core:allocator", "//tensorflow/core:bfc_allocator", "//tensorflow/core:gpu_mem_allocator", "//tensorflow/core:lib", "//tensorflow/core/profiler/lib:traceme", "//tensorflow/stream_executor:tf_allocator_adapter", - "@com_google_absl//absl/container:inlined_vector", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings:str_format", "@com_google_absl//absl/synchronization", "@com_google_absl//absl/time", "@com_google_absl//absl/types:span", + ], +) + +cc_library( + name = "python_ref_manager", + srcs = ["python_ref_manager.cc"], + hdrs = ["python_ref_manager.h"], + copts = [ + "-fexceptions", + "-fno-strict-aliasing", + "-Wno-c++98-c++11-compat", + ], + features = ["-use_header_modules"], + deps = [ + "@com_google_absl//absl/container:inlined_vector", + "@com_google_absl//absl/synchronization", + "@com_google_absl//absl/types:span", "@pybind11", ], ) 
-tf_pybind_extension( +pybind_extension( name = "xla_extension", srcs = [ "xla.cc", @@ -252,6 +254,7 @@ tf_pybind_extension( deps = [ ":local_client", ":shared_device_buffer", + ":python_ref_manager", ":types", ":xrt", "@com_google_absl//absl/base", diff --git a/tensorflow/compiler/xla/python/custom_call_for_test.pyx b/tensorflow/compiler/xla/python/custom_call_for_test.pyx index 530dffd1755..4f7c4c3e5a8 100644 --- a/tensorflow/compiler/xla/python/custom_call_for_test.pyx +++ b/tensorflow/compiler/xla/python/custom_call_for_test.pyx @@ -15,7 +15,7 @@ cdef void test_subtract_f32(void* out_ptr, void** data_ptr) nogil: cpu_custom_call_targets = {} cdef register_custom_call_target(fn_name, void* fn): - cdef const char* name = "xla._CPU_CUSTOM_CALL_TARGET" + cdef const char* name = "xla._CUSTOM_CALL_TARGET" cpu_custom_call_targets[fn_name] = PyCapsule_New(fn, name, NULL) register_custom_call_target(b"test_subtract_f32", (test_subtract_f32)) diff --git a/tensorflow/compiler/xla/python/device.cc b/tensorflow/compiler/xla/python/device_state.cc similarity index 80% rename from tensorflow/compiler/xla/python/device.cc rename to tensorflow/compiler/xla/python/device_state.cc index 73df698a274..6363a5a488f 100644 --- a/tensorflow/compiler/xla/python/device.cc +++ b/tensorflow/compiler/xla/python/device_state.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/xla/python/device.h" +#include "tensorflow/compiler/xla/python/device_state.h" #include #include @@ -24,8 +24,9 @@ limitations under the License. namespace xla { -Device::Device(se::StreamExecutor* executor, bool synchronous_deallocation, - bool asynchronous, bool allow_event_reuse) +DeviceState::DeviceState(se::StreamExecutor* executor, + bool synchronous_deallocation, bool asynchronous, + bool allow_event_reuse) : synchronous_deallocation_(synchronous_deallocation), event_pool_(allow_event_reuse), compute_semaphore_(/*capacity=*/asynchronous ? 32 : 1) { @@ -49,14 +50,14 @@ Device::Device(se::StreamExecutor* executor, bool synchronous_deallocation, "py_xla_callback"); } -Device::~Device() { +DeviceState::~DeviceState() { Status status = SynchronizeAllActivity(); if (!status.ok()) { LOG(ERROR) << "Error when closing device: " << status; } } -Status Device::SynchronizeAllActivity() { +Status DeviceState::SynchronizeAllActivity() { Status status; // TODO(phawkins): in theory the call to SynchronizeAllActivity below should // suffice. However on the Host platform SynchronizeAllActivity is a dummy @@ -64,6 +65,7 @@ Status Device::SynchronizeAllActivity() { // stopped, also block on the compute stream. If SynchronizeAllActivity is // fixed, we could remove the BlockHostUntilDone call. 
status.Update(compute_stream_->BlockHostUntilDone()); + status.Update(callback_stream_->BlockHostUntilDone()); bool ok = compute_stream_->parent()->SynchronizeAllActivity(); if (!ok) { status.Update(Unknown("SynchronizeAllActivity failed.")); @@ -71,10 +73,10 @@ Status Device::SynchronizeAllActivity() { return status; } -Status Device::ThenMemcpyDeviceToDevice(se::Stream* src_stream, - se::Stream* dst_stream, - se::DeviceMemoryBase src_buffer, - se::DeviceMemoryBase dst_buffer) { +Status DeviceState::ThenMemcpyDeviceToDevice(se::Stream* src_stream, + se::Stream* dst_stream, + se::DeviceMemoryBase src_buffer, + se::DeviceMemoryBase dst_buffer) { // The default implementation simply calls ThenMemcpyD2D, and assumes that // the buffer addresses identify the devices. This does not work // on all platforms; this method is virtual so it can be overridden. @@ -82,14 +84,14 @@ Status Device::ThenMemcpyDeviceToDevice(se::Stream* src_stream, return Status::OK(); } -void Device::ThenExecuteOnCallbackThread(se::Stream* stream, - std::function callback) const { +void DeviceState::ThenExecuteOnCallbackThread( + se::Stream* stream, std::function callback) const { stream->ThenDoHostCallback([this, callback]() mutable { callback_thread_->Schedule(std::move(callback)); }); } -se::Stream* Device::GetDeviceToDeviceStream() { +se::Stream* DeviceState::GetDeviceToDeviceStream() { absl::MutexLock lock(&mu_); int i = next_device_to_device_stream_; next_device_to_device_stream_ = diff --git a/tensorflow/compiler/xla/python/device.h b/tensorflow/compiler/xla/python/device_state.h similarity index 91% rename from tensorflow/compiler/xla/python/device.h rename to tensorflow/compiler/xla/python/device_state.h index f40c5df7c61..f108c517169 100644 --- a/tensorflow/compiler/xla/python/device.h +++ b/tensorflow/compiler/xla/python/device_state.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_XLA_PYTHON_DEVICE_H_ -#define TENSORFLOW_COMPILER_XLA_PYTHON_DEVICE_H_ +#ifndef TENSORFLOW_COMPILER_XLA_PYTHON_DEVICE_STATE_H_ +#define TENSORFLOW_COMPILER_XLA_PYTHON_DEVICE_STATE_H_ #include #include @@ -29,8 +29,9 @@ limitations under the License. namespace xla { // Class that encapsulates state relating to a device (e.g., a GPU) on which we -// can perform computation and transfers. -class Device { +// can perform computation and transfers. DeviceState objects only exist for +// devices local to this host. +class DeviceState { public: // If synchronous_deallocation is true, the host must not free buffers until // compute/transfers that use those buffers have completed. For example, this @@ -39,9 +40,9 @@ class Device { // // If asynchronous is false, the host will synchronize to the device after // each execution or transfer. This is intended for debugging only. 
- Device(se::StreamExecutor* executor, bool synchronous_deallocation, - bool asynchronous, bool allow_event_reuse); - virtual ~Device(); + DeviceState(se::StreamExecutor* executor, bool synchronous_deallocation, + bool asynchronous, bool allow_event_reuse); + virtual ~DeviceState(); bool synchronous_deallocation() const { return synchronous_deallocation_; } @@ -131,4 +132,4 @@ class Device { } // namespace xla -#endif // TENSORFLOW_COMPILER_XLA_PYTHON_DEVICE_H_ +#endif // TENSORFLOW_COMPILER_XLA_PYTHON_DEVICE_STATE_H_ diff --git a/tensorflow/compiler/xla/python/local_client.cc b/tensorflow/compiler/xla/python/local_client.cc index 982bf9eb21f..1d9bd1f0695 100644 --- a/tensorflow/compiler/xla/python/local_client.cc +++ b/tensorflow/compiler/xla/python/local_client.cc @@ -85,14 +85,13 @@ limitations under the License. #include "absl/strings/str_format.h" #include "absl/synchronization/mutex.h" #include "absl/time/time.h" -#include "include/pybind11/pybind11.h" +#include "absl/types/span.h" #include "tensorflow/compiler/xla/client/client_library.h" #include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/executable_run_options.h" #include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/python/shared_device_buffer.h" -#include "tensorflow/compiler/xla/python/types.h" #include "tensorflow/compiler/xla/service/platform_util.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/util.h" @@ -106,13 +105,20 @@ limitations under the License. namespace xla { -namespace py = pybind11; +std::string CpuDevice::DebugString() const { + return absl::StrCat("CPU_", id()); +} + +std::string GpuDevice::DebugString() const { + return absl::StrCat("GPU_", id()); +} static StatusOr> CreateBFCAllocator( - se::Platform* platform, LocalClient* client, double memory_fraction, - bool preallocate) { + se::Platform* platform, + absl::Span> device_states, + LocalClient* client, double memory_fraction, bool preallocate) { CHECK_GT(client->backend().device_count(), 0); - std::vector> allocators; + std::vector allocators; for (se::StreamExecutor* executor : client->backend().stream_executors()) { int device_ordinal = executor->device_ordinal(); auto sub_allocator = absl::make_unique( @@ -141,12 +147,23 @@ static StatusOr> CreateBFCAllocator( sub_allocator.release(), allocator_memory, /*allow_growth=*/!preallocate, absl::StrCat("GPU_", device_ordinal, "_bfc")); - allocators.emplace_back(std::move(gpu_bfc_allocator)); + allocators.emplace_back(std::move(gpu_bfc_allocator), + device_states.at(device_ordinal)->compute_stream()); } return absl::make_unique(platform, std::move(allocators)); } +static std::shared_ptr MakeDevice(const std::string& platform_name, + int id, int local_device_ordinal) { + if (platform_name == "cpu") { + return std::make_shared(id, local_device_ordinal); + } else { + CHECK_EQ(platform_name, "gpu"); + return std::make_shared(id, local_device_ordinal); + } +} + StatusOr> PyLocalClient::Get( const std::string& platform_name, const std::string& xla_platform_name, bool asynchronous, const AllocatorConfig& allocator_config) { @@ -162,14 +179,26 @@ StatusOr> PyLocalClient::Get( ClientLibrary::GetOrCreateLocalClient(options)); bool gpu_platform = platform_name == "gpu"; + std::vector> device_states; + std::vector> devices; + bool synchronous_deallocation = platform_name == "cpu"; + for (int i = 0; i < client->device_count(); ++i) { + se::StreamExecutor* executor = + 
client->backend().stream_executor(i).ValueOrDie(); + device_states.push_back(absl::make_unique( + executor, synchronous_deallocation, asynchronous, + /*allow_event_reuse=*/gpu_platform)); + devices.push_back(MakeDevice(platform_name, i, i)); + } + std::unique_ptr allocator; std::unique_ptr host_memory_allocator; if (gpu_platform) { if (allocator_config.kind != AllocatorConfig::Kind::kPlatform) { - TF_ASSIGN_OR_RETURN( - allocator, - CreateBFCAllocator(platform, client, allocator_config.memory_fraction, - allocator_config.preallocate)); + TF_ASSIGN_OR_RETURN(allocator, + CreateBFCAllocator(platform, device_states, client, + allocator_config.memory_fraction, + allocator_config.preallocate)); } tensorflow::SubAllocator* sub_allocator = new tensorflow::GpuHostAllocator( @@ -186,29 +215,23 @@ StatusOr> PyLocalClient::Get( return Unimplemented("BFCAllocator only available for GPU."); } - std::vector> devices; - devices.reserve(client->device_count()); - bool synchronous_deallocation = platform_name == "cpu"; - for (int i = 0; i < client->device_count(); ++i) { - se::StreamExecutor* executor = - client->backend().stream_executor(i).ValueOrDie(); - devices.push_back(absl::make_unique( - executor, synchronous_deallocation, asynchronous, - /*allow_event_reuse=*/gpu_platform)); - } return std::make_shared( - platform_name, client, std::move(devices), std::move(allocator), + platform_name, client, std::move(devices), /*host_id=*/0, + std::move(device_states), std::move(allocator), std::move(host_memory_allocator)); } PyLocalClient::PyLocalClient( std::string platform_name, LocalClient* client, - std::vector> devices, + std::vector> devices, int host_id, + std::vector> device_states, std::unique_ptr allocator, std::unique_ptr host_memory_allocator) : platform_name_(std::move(platform_name)), client_(client), devices_(std::move(devices)), + host_id_(host_id), + device_states_(std::move(device_states)), owned_allocator_(std::move(allocator)), host_memory_allocator_(std::move(host_memory_allocator)), h2d_transfer_pool_(tensorflow::Env::Default(), "py_xla_h2d_transfer", @@ -218,63 +241,48 @@ PyLocalClient::PyLocalClient( } else { allocator_ = client_->backend().memory_allocator(); } + + for (const std::shared_ptr& device : devices_) { + CHECK(id_to_device_.insert({device->id(), device}).second) + << "Duplicate device id: " << device->id(); + } } Status PyLocalClient::TransferToInfeed(const LiteralSlice& literal, int device_ordinal) { - py_ref_manager().CollectGarbage(); - py::gil_scoped_release gil_release; return client_->TransferToInfeedLocal(literal, device_ordinal); } -StatusOr PyLocalClient::TransferFromOutfeed( - const Shape& shape, int device_ordinal) { - py_ref_manager().CollectGarbage(); - Literal literal; - { - py::gil_scoped_release gil_release; - TF_ASSIGN_OR_RETURN( - literal, client_->TransferFromOutfeedLocal(shape, device_ordinal)); - } - return LiteralToPython(std::make_shared(std::move(literal))); +StatusOr PyLocalClient::TransferFromOutfeed(const Shape& shape, + int device_ordinal) { + return client_->TransferFromOutfeedLocal(shape, device_ordinal); +} + +StatusOr PyLocalClient::GetDefaultDeviceAssignment( + int num_replicas) const { + return client_->backend().computation_placer()->AssignDevices( + num_replicas, /*computation_count=*/1); } /* static */ -StatusOr> PyLocalBuffer::FromPython( - const py::object& argument, std::shared_ptr client, - int device_ordinal) { - tensorflow::profiler::TraceMe traceme("PyLocalBuffer::FromPython"); - struct H2DTransfer { - PythonBufferTree tree; - 
std::shared_ptr py_buffer_ref; - }; - auto transfer = std::make_shared(); - TF_ASSIGN_OR_RETURN(transfer->tree, GetPythonBufferTree(argument)); - - client->py_ref_manager().CollectGarbage(); - - // Take a reference to the buffer to ensure that the inputs in host memory - // remain live until the transfer is complete. - transfer->py_buffer_ref = client->py_ref_manager().ManageReferences( - absl::MakeSpan(transfer->tree.arrays)); - transfer->tree.arrays.clear(); - - // We are done manipulating Python objects; release the GIL. - py::gil_scoped_release gil_release; - VLOG(1) << "PyLocalBuffer::FromPython: shape: " - << transfer->tree.shape.ToString() +StatusOr> PyLocalBuffer::FromLiterals( + std::vector leaves_literals, const Shape& tuple_shape, + std::shared_ptr leaves_reference, + std::shared_ptr client, int device_ordinal) { + tensorflow::profiler::TraceMe traceme("PyLocalBuffer::FromLiterals"); + VLOG(1) << "PyLocalBuffer::FromLiterals: shape: " << tuple_shape.ToString() << " device ordinal: " << device_ordinal; - Device* device = &client->device(device_ordinal); + DeviceState* device = &client->device_state(device_ordinal); TransferManager* transfer_manager = client->client()->backend().transfer_manager(); se::DeviceMemoryAllocator* allocator = client->allocator(); TF_ASSIGN_OR_RETURN( - transfer->tree.shape, - transfer_manager->ChooseCompactLayoutForShape(transfer->tree.shape)); + Shape compact_shape, + transfer_manager->ChooseCompactLayoutForShape(tuple_shape)); TF_ASSIGN_OR_RETURN(ScopedShapedBuffer scoped_buffer, transfer_manager->AllocateScopedShapedBuffer( - transfer->tree.shape, allocator, device_ordinal)); + compact_shape, allocator, device_ordinal)); // Make the host to device stream wait for the newly allocated buffer to be // available on the compute stream. We schedule this wait synchronously; while @@ -293,21 +301,25 @@ StatusOr> PyLocalBuffer::FromPython( SharedDeviceBuffer::FromScopedShapedBuffer(std::move(scoped_buffer), definition_event); + // TODO(makro): Use move capture once C++ 14 features are available. + auto leaves = std::make_shared>( + std::move(leaves_literals)); auto transfer_h2d = [client, transfer_manager, device, device_ordinal, - device_buffer, transfer]() { + device_buffer, compact_shape, leaves, + leaves_reference]() { // This function uses TF_CHECK_OK and ValueOrDie() since we have no way to // report failures from a callback. However, the operations here are // unlikely to fail and not recoverable even if we were to fail: DMAs to // memory that has already been allocated, and a possible Event allocation. 
- ShapedBuffer buffer = device_buffer->AsShapedBuffer(transfer->tree.shape); + ShapedBuffer buffer = device_buffer->AsShapedBuffer(compact_shape); TF_CHECK_OK(transfer_manager->WriteTupleIndexTablesAsync( device->host_to_device_stream(), buffer)); std::vector> staging_buffers; - staging_buffers.reserve(transfer->tree.leaves.size()); - auto it = transfer->tree.leaves.begin(); + staging_buffers.reserve(leaves->size()); + auto it = leaves->begin(); for (const ShapeUtil::IndexedShape& indexed_shape : - ShapeUtil::GetLeafShapes(transfer->tree.shape)) { - CHECK(it != transfer->tree.leaves.end()); + ShapeUtil::GetLeafShapes(compact_shape)) { + CHECK(it != leaves->end()); ShapedBuffer leaf( indexed_shape.shape, transfer_manager->HostShapeToDeviceShape(indexed_shape.shape), @@ -352,19 +364,19 @@ StatusOr> PyLocalBuffer::FromPython( device->ThenRelease(device->host_to_device_stream(), device_buffer); } - device->ThenRelease(device->host_to_device_stream(), - std::make_pair(std::move(transfer->py_buffer_ref), - std::move(staging_buffers))); + device->ThenRelease( + device->host_to_device_stream(), + std::make_pair(leaves_reference, std::move(staging_buffers))); }; client->h2d_transfer_pool()->Schedule(transfer_h2d); return absl::make_unique( - transfer->tree.shape, std::move(device_buffer), std::move(client)); + compact_shape, std::move(device_buffer), std::move(client)); } /* static */ StatusOr> PyLocalBuffer::MakeTuple( const std::vector buffers, std::shared_ptr client, int device_ordinal) { - std::vector host_shapes; + std::vector host_shapes; std::vector> device_buffers; host_shapes.reserve(buffers.size()); device_buffers.reserve(buffers.size()); @@ -382,7 +394,7 @@ StatusOr> PyLocalBuffer::FromPython( se::DeviceMemoryAllocator* allocator = client->allocator(); TransferManager* transfer_manager = client->client()->backend().transfer_manager(); - Device& device = client->device(device_ordinal); + DeviceState& device = client->device_state(device_ordinal); auto definition_event = std::make_shared(); TF_ASSIGN_OR_RETURN( @@ -445,7 +457,8 @@ Status PyLocalBuffer::CopyToHostAsync() { } host_value = host_value_ = std::make_shared(); } - se::Stream* stream = client_->device(device_ordinal_).device_to_host_stream(); + se::Stream* stream = + client_->device_state(device_ordinal_).device_to_host_stream(); WaitForBufferDefinitionEventsOnStream(*device_buffer, stream); host_value->value = std::make_shared(on_host_shape_); TF_ASSIGN_OR_RETURN(ShapedBuffer shaped_buffer, AsShapedBuffer()); @@ -458,29 +471,22 @@ Status PyLocalBuffer::CopyToHostAsync() { return Status::OK(); } -StatusOr PyLocalBuffer::ToPython() { - tensorflow::profiler::TraceMe traceme("PyLocalBuffer::ToPython"); +StatusOr> PyLocalBuffer::ToLiteral() { + tensorflow::profiler::TraceMe traceme("PyLocalBuffer::ToLiteral"); std::shared_ptr device_buffer = DeviceBuffer(); if (!device_buffer) { - return InvalidArgument("ToPython() called on invalid buffer."); + return InvalidArgument("ToLiteral() called on invalid buffer."); } - client_->py_ref_manager().CollectGarbage(); - std::shared_ptr literal; + TF_RETURN_IF_ERROR(CopyToHostAsync()); + std::shared_ptr host_value; { - py::gil_scoped_release gil_release; - TF_RETURN_IF_ERROR(CopyToHostAsync()); - std::shared_ptr host_value; - { - absl::MutexLock lock(&mu_); - host_value = host_value_; - } - host_value->ready.WaitForNotification(); - TF_RETURN_IF_ERROR(host_value->status); - literal = host_value->value; + absl::MutexLock lock(&mu_); + host_value = host_value_; } - - return 
LiteralToPython(std::move(literal)); + host_value->ready.WaitForNotification(); + TF_RETURN_IF_ERROR(host_value->status); + return host_value->value; } std::shared_ptr PyLocalBuffer::DeviceBuffer() const { @@ -524,15 +530,13 @@ PyLocalBuffer::DestructureTuple() { StatusOr> PyLocalBuffer::CopyToDevice( int dst_device_ordinal) { tensorflow::profiler::TraceMe traceme("PyLocalBuffer::CopyToDevice"); - client_->py_ref_manager().CollectGarbage(); - py::gil_scoped_release gil_release; std::shared_ptr src_device_buffer = DeviceBuffer(); if (dst_device_ordinal == device_ordinal_) { return absl::make_unique(on_host_shape_, src_device_buffer, client_); } - Device& src_device = client_->device(device_ordinal_); - const Device& dst_device = client_->device(dst_device_ordinal); + DeviceState& src_device = client_->device_state(device_ordinal_); + const DeviceState& dst_device = client_->device_state(dst_device_ordinal); se::Stream* src_device_to_device_stream = src_device.GetDeviceToDeviceStream(); @@ -554,7 +558,7 @@ StatusOr> PyLocalBuffer::CopyToDevice( // Copy the leaf buffers. for (const auto& leaf : src_buffer.buffers().leaves()) { - const xla::ShapeIndex& index = leaf.first; + const ShapeIndex& index = leaf.first; const se::DeviceMemoryBase& input_buffer = leaf.second; const se::DeviceMemoryBase& output_buffer = dst_buffer.buffer(index); TF_RET_CHECK(input_buffer.size() == output_buffer.size()) @@ -603,43 +607,58 @@ Status PyLocalBuffer::BlockHostUntilReady() { return InvalidArgument("BlockHostUntilReady() called on invalid buffer."); } - client_->py_ref_manager().CollectGarbage(); - py::gil_scoped_release gil_release; - // This code waits at least until the buffer is ready, but it may wait longer // if there are other device to host transfers scheduled. If this proves to // be an issue, we could either use a separate stream for this purpose, or // poll for the buffer definition events. 
- se::Stream* stream = - client_->device(device_buffer->device_ordinal()).device_to_host_stream(); + se::Stream* stream = client_->device_state(device_buffer->device_ordinal()) + .device_to_host_stream(); WaitForBufferDefinitionEventsOnStream(*device_buffer, stream); return stream->BlockHostUntilDone(); } +static absl::optional LookupDeviceOrdinal(const PyLocalClient& client, + int device_id) { + auto it = client.id_to_device().find(device_id); + CHECK(it != client.id_to_device().end()) + << "Unknown device id: " << device_id; + int device_ordinal = it->second->local_device_ordinal(); + if (device_ordinal == -1) { + return absl::optional(); + } + return device_ordinal; +} + PyLocalExecutable::PyLocalExecutable( std::shared_ptr executable, DeviceAssignment device_assignment, std::shared_ptr client) : client_(std::move(client)), executable_(std::move(executable)), - device_assignment_(std::move(device_assignment)) {} - -std::vector PyLocalExecutable::DeviceOrdinals() const { + device_assignment_(std::move(device_assignment)) { int num_replicas = device_assignment_.replica_count(); - std::vector device_ordinals; - device_ordinals.reserve(num_replicas); - for (int i = 0; i < num_replicas; ++i) { - device_ordinals.push_back(device_assignment_(i, 0)); + for (int replica = 0; replica < num_replicas; ++replica) { + int device_id = device_assignment_(replica, 0); + absl::optional device_ordinal = + LookupDeviceOrdinal(*client_, device_id); + if (!device_ordinal) { + VLOG(3) << "Non-local device: " << device_id; + continue; + } + local_replicas_.push_back(replica); + device_ordinals_.push_back(*device_ordinal); } - return device_ordinals; + CHECK_GE(local_replicas_.size(), 1); } StatusOr> PyLocalExecutable::ExecuteHelper( absl::Span argument_handles, int replica, const RunId& run_id) { - const int device_ordinal = device_assignment_(replica, 0); + const int device_id = device_assignment_(replica, 0); + absl::optional device_ordinal = LookupDeviceOrdinal(*client_, device_id); + CHECK(device_ordinal); tensorflow::profiler::TraceMe traceme("LocalExecutable::Execute"); VLOG(3) << "Replica " << replica - << " mapped to device ordinal for execution: " << device_ordinal; + << " mapped to device ordinal for execution: " << *device_ordinal; absl::flat_hash_set events; std::vector> device_buffers; @@ -657,11 +676,11 @@ StatusOr> PyLocalExecutable::ExecuteHelper( "%d to replica %d", i, replica); } - if (device_buffer->device_ordinal() != device_ordinal) { + if (device_buffer->device_ordinal() != *device_ordinal) { return InvalidArgument( "Buffer passed to Execute() as argument %d to replica %d is on " "device %d, but replica is assigned to device %d.", - i, replica, device_buffer->device_ordinal(), device_ordinal); + i, replica, device_buffer->device_ordinal(), *device_ordinal); } TF_ASSIGN_OR_RETURN(ShapedBuffer shaped_buffer, handle->AsShapedBuffer()); argument_buffers.push_back(std::move(shaped_buffer)); @@ -672,7 +691,7 @@ StatusOr> PyLocalExecutable::ExecuteHelper( << " buffer: " << argument_buffers.back().ToString(); } - Device* device = &client_->device(device_ordinal); + DeviceState* device = &client_->device_state(*device_ordinal); // The choice of where we wait is arbitrary; the reason for the wait is pacing // to avoid problems such as memory fragmentation and running ahead too far, // not for correctness. 
Placing it before the executable launch allows the @@ -740,45 +759,49 @@ StatusOr>> PyLocalExecutable::ExecutePerReplica( absl::Span> argument_handles) { tensorflow::profiler::TraceMe traceme("LocalExecutable::ExecutePerReplica"); - const int num_devices = client_->device_count(); + int num_local_replicas = local_replicas_.size(); + const int num_local_devices = client_->local_device_count(); - if (argument_handles.size() != num_replicas()) { + if (argument_handles.size() != num_local_replicas) { return InvalidArgument( - "Attempted to execute with %d replicas when replica count is %d", - argument_handles.size(), num_devices); + "Attempted to execute with %d local replicas when local replica count " + "is %d (total replica count: %d)", + argument_handles.size(), num_local_replicas, num_replicas()); } - if (argument_handles.size() > num_devices) { + if (argument_handles.size() > num_local_devices) { return InvalidArgument( "Attempted to execute with %d replicas when device count is %d", - argument_handles.size(), num_devices); + argument_handles.size(), num_local_devices); } - VLOG(1) << "Executing replicated computation; num_replicas=" - << num_replicas(); - std::vector>> results(num_replicas()); - if (num_replicas() == 1) { + VLOG(1) << "Executing replicated computation; num_replicas=" << num_replicas() + << " num_local_replicas=" << num_local_replicas; + std::vector>> results( + num_local_replicas); + if (num_local_replicas == 1) { // Fast-path if there is only one replica — run the computation on the // current thread. - results[0] = ExecuteHelper(argument_handles[0], /*replica=*/0, RunId()); + results[0] = + ExecuteHelper(argument_handles[0], local_replicas_[0], RunId()); } else { RunId run_id; absl::Mutex mu; - int running GUARDED_BY(mu) = num_replicas(); - int failed GUARDED_BY(mu) = 0; - Status first_failure_status GUARDED_BY(mu); + int running = num_local_replicas; + int failed = 0; + Status first_failure_status; - for (int replica = 0; replica < num_replicas(); ++replica) { - const int device_ordinal = device_assignment_(replica, 0); - const Device& device = client_->device(device_ordinal); - device.execute_thread()->Schedule([&, replica] { - results[replica] = - ExecuteHelper(argument_handles[replica], replica, run_id); + for (int i = 0; i < num_local_replicas; ++i) { + const int replica = local_replicas_[i]; + const int device_ordinal = device_ordinals_[i]; + const DeviceState& device = client_->device_state(device_ordinal); + device.execute_thread()->Schedule([&, replica, i] { + results[i] = ExecuteHelper(argument_handles[i], replica, run_id); absl::MutexLock lock(&mu); --running; - if (!results[replica].ok()) { + if (!results[i].ok()) { if (failed == 0) { - first_failure_status = results[replica].status(); + first_failure_status = results[i].status(); } ++failed; } @@ -813,18 +836,19 @@ PyLocalExecutable::ExecutePerReplica( } VLOG(1) << "Replicated execution complete."; - std::vector> wrapped_results(num_replicas()); - for (int replica = 0; replica < num_replicas(); ++replica) { - auto& statusor = results[replica]; + std::vector> wrapped_results( + num_local_replicas); + for (int i = 0; i < num_local_replicas; ++i) { + auto& statusor = results[i]; if (!statusor.ok()) { return AppendStatus( statusor.status(), absl::StrFormat( "while running replica %d of a replicated computation (other " "replicas may have failed as well).", - replica)); + local_replicas_[i])); } - wrapped_results[replica] = std::move(statusor.ValueOrDie()); + wrapped_results[i] = 
std::move(statusor.ValueOrDie()); } return wrapped_results; } @@ -858,10 +882,8 @@ PyLocalExecutable::Compile(const XlaComputation& computation, device_assignment->computation_count()); } } else { - TF_ASSIGN_OR_RETURN( - device_assignment, - client->client()->backend().computation_placer()->AssignDevices( - options.num_replicas(), /*computation_count=*/1)); + TF_ASSIGN_OR_RETURN(device_assignment, client->GetDefaultDeviceAssignment( + options.num_replicas())); } if (!argument_layouts) { diff --git a/tensorflow/compiler/xla/python/local_client.h b/tensorflow/compiler/xla/python/local_client.h index 8ad4c44d53f..37b3c56b7d2 100644 --- a/tensorflow/compiler/xla/python/local_client.h +++ b/tensorflow/compiler/xla/python/local_client.h @@ -23,12 +23,10 @@ limitations under the License. #include "absl/synchronization/mutex.h" #include "absl/synchronization/notification.h" #include "absl/types/span.h" -#include "include/pybind11/pybind11.h" #include "tensorflow/compiler/xla/client/executable_build_options.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/client/xla_computation.h" -#include "tensorflow/compiler/xla/python/device.h" -#include "tensorflow/compiler/xla/python/python_ref_manager.h" +#include "tensorflow/compiler/xla/python/device_state.h" #include "tensorflow/compiler/xla/python/shared_device_buffer.h" #include "tensorflow/compiler/xla/service/computation_placer.h" #include "tensorflow/compiler/xla/service/shaped_buffer.h" @@ -40,6 +38,50 @@ limitations under the License. namespace xla { +class Device { + public: + explicit Device(int id, int local_device_ordinal, int host_id = 0) + : id_(id), + local_device_ordinal_(local_device_ordinal), + host_id_(host_id) {} + virtual ~Device() {} + + // The ID of this device. IDs are unique among devices of this type + // (e.g. CPUs, GPUs). On multi-host platforms, this will be unique across all + // hosts' devices. This is the ID that should be used in a DeviceAssignment. + int id() const { return id_; } + + // If this is a device local to this host, the local index of this device as + // according to the underlying backend. Unlike id(), this will always be in + // the range [0, num_local_devices), and can be used with the xla::LocalClient + // and xla::Backend APIs. + // + // -1 if this device is not local to this host. + int local_device_ordinal() const { return local_device_ordinal_; } + + // The ID of this device's host. This is always 0 on single-host platforms. + int host_id() const { return host_id_; } + + virtual std::string DebugString() const = 0; + + private: + const int id_; + const int local_device_ordinal_; + const int host_id_; +}; + +class CpuDevice : public Device { + public: + using Device::Device; + std::string DebugString() const override; +}; + +class GpuDevice : public Device { + public: + using Device::Device; + std::string DebugString() const override; +}; + struct AllocatorConfig { enum class Kind { kDefault, // Client picks the best option for the platform. @@ -72,19 +114,31 @@ class PyLocalClient { // `allocator` may null, in which case the platform default allocator is used. 
explicit PyLocalClient( std::string platform_name, LocalClient* client, - std::vector> devices, + std::vector> devices, int host_id, + std::vector> device_states, std::unique_ptr allocator, std::unique_ptr host_memory_allocator); virtual ~PyLocalClient() = default; Status TransferToInfeed(const LiteralSlice& literal, int device_ordinal); - StatusOr TransferFromOutfeed(const Shape& shape, - int device_ordinal); + StatusOr TransferFromOutfeed(const Shape& shape, int device_ordinal); - int device_count() const { return client_->device_count(); } - Device& device(int device_ordinal) const { - return *devices_.at(device_ordinal); + virtual StatusOr GetDefaultDeviceAssignment( + int num_replicas) const; + + int device_count() const { return devices_.size(); } + const std::vector>& devices() { return devices_; } + const std::map>& id_to_device() const { + return id_to_device_; } + int host_id() const { return host_id_; } + const std::string& platform_name() const { return platform_name_; } + + int local_device_count() const { return device_states_.size(); } + DeviceState& device_state(int device_ordinal) const { + return *device_states_.at(device_ordinal); + } + LocalClient* client() const { return client_; } se::DeviceMemoryAllocator* allocator() const { return allocator_; } tensorflow::Allocator* host_memory_allocator() const { @@ -95,19 +149,18 @@ class PyLocalClient { return &h2d_transfer_pool_; } - PythonRefManager& py_ref_manager() { return py_ref_manager_; } - protected: std::string platform_name_; LocalClient* client_; - // py_ref_manager_ must come after devices_ in the class destruction order - // (i.e., appear first in the class.) - // Destruction of devices waits for them to quiesce; callbacks on device - // streams may refer to py_ref_manager_ and we must wait for them to complete. - PythonRefManager py_ref_manager_; + // Includes all devices, including non-local devices on multi-host platforms. + std::vector> devices_; + // Maps Device::id() to the corresponding Device. + std::map> id_to_device_; + int host_id_; - std::vector> devices_; + // Device states local to this host. Indexed by local device ordinal. + std::vector> device_states_; se::DeviceMemoryAllocator* allocator_; std::unique_ptr owned_allocator_; @@ -128,9 +181,10 @@ class PyLocalClient { // Thread-safe. class PyLocalBuffer { public: - static StatusOr> FromPython( - const pybind11::object& argument, std::shared_ptr client, - int device_ordinal); + static StatusOr> FromLiterals( + std::vector leaves_literals, const Shape& tuple_shape, + std::shared_ptr leaves_reference, + std::shared_ptr client, int device_ordinal); static StatusOr> MakeTuple( const std::vector buffers, @@ -148,16 +202,17 @@ class PyLocalBuffer { const Shape& on_host_shape() const { return on_host_shape_; } int device_ordinal() const { return device_ordinal_; } + const std::string& platform_name() const { return client_->platform_name(); } // Returns the buffer's value as a tuple DAG of Python arrays. If the value // has previously been prefetched to the host, then returns the prefetched // version, otherwise copies the buffer to the host. Blocks until the // value is ready. - StatusOr ToPython(); + StatusOr> ToLiteral(); // Initiates a copy of the buffer to the host. Does not block waiting for // the transfer to complete. The value can be retrieved by a later call to - // ToPython(). + // ToLiteral(). Status CopyToHostAsync(); // Returns the associated device buffer. 
Returns a nullptr if the buffer is @@ -190,14 +245,14 @@ class PyLocalBuffer { std::shared_ptr device_buffer_ GUARDED_BY(mu_); // The cached value of the buffer on the host, produced either from a call to - // CopyToHost or from a call to ToPython. Once a value has been fetched to + // CopyToHost or from a call to ToLiteral. Once a value has been fetched to // the host, it persists Delete() is called or the PyLocalBuffer is destroyed. struct HostValue { absl::Notification ready; // status and value are valid for reading only after `ready` has been // notified. Status status; - std::shared_ptr value; + std::shared_ptr value; }; std::shared_ptr host_value_ GUARDED_BY(mu_); }; @@ -222,8 +277,12 @@ class PyLocalExecutable { return executable_->build_options().num_replicas(); } + int64 SizeOfGeneratedCodeInBytes() const { + return executable_->executable()->SizeOfGeneratedCodeInBytes(); + } + // Returns the device ordinals to which each replica is assigned. - std::vector DeviceOrdinals() const; + const std::vector& DeviceOrdinals() const { return device_ordinals_; } const DeviceAssignment& device_assignment() const { return device_assignment_; @@ -248,6 +307,13 @@ class PyLocalExecutable { std::shared_ptr const client_; std::shared_ptr executable_; const DeviceAssignment device_assignment_; + // The replica indices of device_assignment_ to be run by this client. On + // single-host platforms, this is all replicas (i.e. local_replicas_[i] = i), + // but this may not be the case on multi-host platforms. + std::vector local_replicas_; + // device_ordinals_[i] is the device ordinal to which local_replicas_[i] is + // assigned. + std::vector device_ordinals_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/python/python_ref_manager.cc b/tensorflow/compiler/xla/python/python_ref_manager.cc index 1e9cc58d090..0a980f1a749 100644 --- a/tensorflow/compiler/xla/python/python_ref_manager.cc +++ b/tensorflow/compiler/xla/python/python_ref_manager.cc @@ -49,4 +49,9 @@ void PythonRefManager::CollectGarbage() { python_garbage_.clear(); } +PythonRefManager* GlobalPyRefManager() { + static PythonRefManager* static_ref_manager = new PythonRefManager(); + return static_ref_manager; +} + } // namespace xla diff --git a/tensorflow/compiler/xla/python/python_ref_manager.h b/tensorflow/compiler/xla/python/python_ref_manager.h index 8be19336a89..054150faf25 100644 --- a/tensorflow/compiler/xla/python/python_ref_manager.h +++ b/tensorflow/compiler/xla/python/python_ref_manager.h @@ -74,6 +74,11 @@ class PythonRefManager { std::deque python_garbage_ GUARDED_BY(mu_); }; +// A global PythonRefManager. Unless `CollectGarbage()` is called before +// shutdown, this container will hold on to Python objects and thus cause a +// leak. This behavior is similar to `tensorflow::ClearDecRefCache()`. +PythonRefManager* GlobalPyRefManager(); + } // namespace xla #endif // TENSORFLOW_COMPILER_XLA_PYTHON_PYTHON_REF_MANAGER_H_ diff --git a/tensorflow/compiler/xla/python/types.h b/tensorflow/compiler/xla/python/types.h index bc0ee2b19b4..1873249b07c 100644 --- a/tensorflow/compiler/xla/python/types.h +++ b/tensorflow/compiler/xla/python/types.h @@ -104,7 +104,7 @@ struct type_caster> { using value_conv = make_caster; PYBIND11_TYPE_CASTER(absl::Span, - _("Span[") + value_conv::name() + _("]")); + _("Span[") + value_conv::name + _("]")); // absl::Span doesn't hold ownership. We therefore need a temporary array. // Pybind appears to keep type_casters alive until the callee has run. 
@@ -151,7 +151,7 @@ struct type_caster> { using value_conv = make_caster; PYBIND11_TYPE_CASTER(xla::StatusOr, - _("StatusOr[") + value_conv::name() + _("]")); + _("StatusOr[") + value_conv::name + _("]")); static handle cast(xla::StatusOr src, return_value_policy policy, handle parent) { diff --git a/tensorflow/compiler/xla/python/xla.cc b/tensorflow/compiler/xla/python/xla.cc index 172e24f801e..078fee8f652 100644 --- a/tensorflow/compiler/xla/python/xla.cc +++ b/tensorflow/compiler/xla/python/xla.cc @@ -35,6 +35,7 @@ limitations under the License. #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/python/local_client.h" +#include "tensorflow/compiler/xla/python/python_ref_manager.h" #include "tensorflow/compiler/xla/python/types.h" #include "tensorflow/compiler/xla/python/xrt.h" #include "tensorflow/compiler/xla/service/custom_call_target_registry.h" @@ -109,18 +110,23 @@ StatusOr GetComputationHloDotGraph( } // Registers a 'fn_capsule' as a CPU custom call target. -// 'fn_capsule' is a void* pointer encapsulated in a PyCapsule object, with name -// "xla._CPU_CUSTOM_CALL_TARGET". -Status RegisterCpuCustomCallTarget(const std::string& fn_name, - py::capsule capsule) { - static const char* const kName = "xla._CPU_CUSTOM_CALL_TARGET"; - if (absl::string_view(capsule.name()) != kName) { +// 'fn_capsule' must be a void* pointer encapsulated in a PyCapsule object, +// with name "xla._CUSTOM_CALL_TARGET". +// 'platform' is an XLA platform name, e.g., "Host" or "CUDA". +Status PyRegisterCustomCallTarget(const std::string& fn_name, + py::capsule capsule, + const std::string& platform) { + static const char* const kName = "xla._CUSTOM_CALL_TARGET"; + // TODO(phawkins): remove old name after fixing users. + static const char* const kOldCpuName = "xla._CPU_CUSTOM_CALL_TARGET"; + if (absl::string_view(capsule.name()) != kName && + absl::string_view(capsule.name()) != kOldCpuName) { return InvalidArgument( - "Argument to RegisterCpuCustomCallTargetRegistry was not a " - "xla._CPU_CUSTOM_CALL_TARGET capsule."); + "Argument to RegisterCustomCallTargetRegistry was not a " + "xla._CUSTOM_CALL_TARGET capsule."); } CustomCallTargetRegistry::Global()->Register( - fn_name, static_cast(capsule), "Host"); + fn_name, static_cast(capsule), platform); return Status::OK(); } @@ -292,10 +298,34 @@ PYBIND11_MODULE(xla_extension, m) { .def("computation_count", &DeviceAssignment::computation_count) .def("__repr__", &DeviceAssignment::ToString); + py::class_>( + m, "Device", + "A descriptor of an available device.\n\nSubclasses are used to " + "represent specific types of devices, e.g. CPUs, GPUs. Subclasses may " + "have additional properties specific to that device type.") + .def_property_readonly( + "id", &Device::id, + "Integer ID of this device.\n\nUnique across all available devices " + "of this type, including remote devices on multi-host platforms.") + .def_property_readonly("host_id", &Device::host_id, + "Integer ID of this device's host.\n\n" + "This is always 0 except on multi-host platforms.") + .def("__str__", &Device::DebugString); + + py::class_>(m, "CpuDevice") + .def("__repr__", [](const CpuDevice& device) { + return absl::StrFormat("CpuDevice(id=%i)", device.id()); + }); + + py::class_>(m, "GpuDevice") + .def("__repr__", [](const GpuDevice& device) { + return absl::StrFormat("GpuDevice(id=%i)", device.id()); + }); + // Local XLA client methods. - // CPU custom-call targets. 
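[Editor's note] A hedged sketch of how the Device, CpuDevice, and GpuDevice descriptors bound above might be enumerated from Python; it assumes get_local_backend() is the existing xla_client helper that returns the LocalBackend wrapping this client:

from tensorflow.compiler.xla.python import xla_client

backend = xla_client.get_local_backend("cpu")  # assumed existing helper
print(backend.device_count(), backend.local_device_count(), backend.host_id())
for device in backend.devices():
  # Each entry is a Device subclass such as CpuDevice(id=0); id is unique
  # across all hosts, while host_id is 0 except on multi-host platforms.
  print(device.id, device.host_id, str(device))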
- m.def("RegisterCpuCustomCallTarget", &RegisterCpuCustomCallTarget); + // Custom-call targets. + m.def("RegisterCustomCallTarget", &PyRegisterCustomCallTarget); py::class_ alloc_config(m, "AllocatorConfig"); alloc_config.def(py::init<>()) @@ -311,21 +341,84 @@ PYBIND11_MODULE(xla_extension, m) { .def_static("Get", &PyLocalClient::Get, py::arg("platform"), py::arg("xla_platform_id"), py::arg("asynchronous"), py::arg("allocator_config") = AllocatorConfig()) - .def("DeviceCount", &PyLocalClient::device_count) - .def("TransferToInfeed", &PyLocalClient::TransferToInfeed) - .def("TransferFromOutfeed", &PyLocalClient::TransferFromOutfeed); + .def("device_count", &PyLocalClient::device_count) + .def("local_device_count", &PyLocalClient::local_device_count) + .def("devices", &PyLocalClient::devices) + .def("host_id", &PyLocalClient::host_id) + .def("TransferToInfeed", + [](PyLocalClient* client, const LiteralSlice& literal, + int device_ordinal) { + GlobalPyRefManager()->CollectGarbage(); + py::gil_scoped_release gil_release; + return client->TransferToInfeed(literal, device_ordinal); + }) + .def("TransferFromOutfeed", + [](PyLocalClient* client, const Shape& shape, + int device_ordinal) -> StatusOr { + GlobalPyRefManager()->CollectGarbage(); + std::shared_ptr literal_shared; + { + py::gil_scoped_release gil_release; + TF_ASSIGN_OR_RETURN(Literal literal, client->TransferFromOutfeed( + shape, device_ordinal)); + literal_shared = std::make_shared(std::move(literal)); + } + return LiteralToPython(std::move(literal_shared)); + }); py::class_(m, "PyLocalBuffer") - .def_static("from_python", &PyLocalBuffer::FromPython) + .def_static( + "from_python", + [](const pybind11::object& argument, + std::shared_ptr client, + int device_ordinal) -> StatusOr> { + GlobalPyRefManager()->CollectGarbage(); + TF_ASSIGN_OR_RETURN(PythonBufferTree tree, + GetPythonBufferTree(argument)); + std::shared_ptr py_buffer_ref = + GlobalPyRefManager()->ManageReferences( + absl::MakeSpan(tree.arrays)); + tree.arrays.clear(); + + std::vector leaves; + leaves.insert(leaves.end(), + std::make_move_iterator(tree.leaves.begin()), + std::make_move_iterator(tree.leaves.end())); + + py::gil_scoped_release gil_release; + return PyLocalBuffer::FromLiterals( + std::move(leaves), tree.shape, std::move(py_buffer_ref), + std::move(client), device_ordinal); + }) .def_static("make_tuple", &PyLocalBuffer::MakeTuple) - .def("copy_to_device", &PyLocalBuffer::CopyToDevice) + .def("copy_to_device", + [](PyLocalBuffer* buffer, int dst_device_ordinal) { + GlobalPyRefManager()->CollectGarbage(); + py::gil_scoped_release gil_release; + return buffer->CopyToDevice(dst_device_ordinal); + }) .def("delete", &PyLocalBuffer::Delete) .def("destructure", &PyLocalBuffer::DestructureTuple) - .def("block_host_until_ready", &PyLocalBuffer::BlockHostUntilReady) + .def("block_host_until_ready", + [](PyLocalBuffer* buffer) { + GlobalPyRefManager()->CollectGarbage(); + py::gil_scoped_release gil_release; + return buffer->BlockHostUntilReady(); + }) .def("copy_to_host_async", &PyLocalBuffer::CopyToHostAsync) - .def("to_py", &PyLocalBuffer::ToPython) + .def("to_py", + [](PyLocalBuffer* buffer) -> StatusOr { + GlobalPyRefManager()->CollectGarbage(); + std::shared_ptr literal; + { + py::gil_scoped_release gil_release; + TF_ASSIGN_OR_RETURN(literal, buffer->ToLiteral()); + } + return LiteralToPython(std::move(literal)); + }) .def("shape", &PyLocalBuffer::on_host_shape) .def("device", &PyLocalBuffer::device_ordinal) + .def("platform", &PyLocalBuffer::platform_name) 
.def("is_deleted", [](const PyLocalBuffer& buffer) { return buffer.DeviceBuffer() == nullptr; @@ -347,6 +440,8 @@ PYBIND11_MODULE(xla_extension, m) { .def_static("Compile", &PyLocalExecutable::Compile, py::call_guard()) .def("DeviceOrdinals", &PyLocalExecutable::DeviceOrdinals) + .def("SizeOfGeneratedCodeInBytes", + &PyLocalExecutable::SizeOfGeneratedCodeInBytes) .def("Delete", &PyLocalExecutable::Delete) .def("Execute", &PyLocalExecutable::Execute, py::call_guard(), py::arg("arguments")) @@ -365,7 +460,13 @@ PYBIND11_MODULE(xla_extension, m) { &DebugOptions::set_xla_cpu_fast_math_honor_nans) .def_property("xla_cpu_fast_math_honor_division", &DebugOptions::xla_cpu_fast_math_honor_division, - &DebugOptions::set_xla_cpu_fast_math_honor_division); + &DebugOptions::set_xla_cpu_fast_math_honor_division) + .def_property("xla_cpu_fast_math_honor_functions", + &DebugOptions::xla_cpu_fast_math_honor_functions, + &DebugOptions::set_xla_cpu_fast_math_honor_functions) + .def_property("xla_gpu_enable_fast_min_max", + &DebugOptions::xla_gpu_enable_fast_min_max, + &DebugOptions::set_xla_gpu_enable_fast_min_max); py::class_(m, "ExecutableBuildOptions") .def(py::init<>()) @@ -473,7 +574,8 @@ PYBIND11_MODULE(xla_extension, m) { .value("IRFFT", FftType::IRFFT); ops.def("Gather", &Gather, py::arg("a"), py::arg("start_indices"), - py::arg("dimension_numbers"), py::arg("slice_sizes")); + py::arg("dimension_numbers"), py::arg("slice_sizes"), + py::arg("indices_are_sorted")); ops.def("GetTupleElement", &GetTupleElement); ops.def("Infeed", &Infeed, py::arg("builder"), py::arg("shape"), py::arg("config") = ""); @@ -533,20 +635,26 @@ PYBIND11_MODULE(xla_extension, m) { py::arg("limit_index"), py::arg("stride"), py::arg("dimno")); ops.def( "Sort", - [](XlaBuilder* builder, absl::Span operands, - int64 dimension) -> XlaOp { + [](XlaBuilder* builder, absl::Span operands, int64 dimension, + absl::optional comparator) -> XlaOp { return builder->ReportErrorOrReturn([&]() -> StatusOr { std::vector operand_types; for (const auto& operand : operands) { TF_ASSIGN_OR_RETURN(auto operand_shape, builder->GetShape(operand)); operand_types.push_back(operand_shape.element_type()); } - return Sort(operands, - CreateScalarLtComputation(operand_types, builder), - dimension); + + if (comparator) { + return Sort(operands, **comparator, dimension); + } else { + return Sort(operands, + CreateScalarLtComputation(operand_types, builder), + dimension); + } }); }, - py::arg("builder"), py::arg("operands"), py::arg("dimension") = -1); + py::arg("builder"), py::arg("operands"), py::arg("dimension") = -1, + py::arg("comparator") = absl::nullopt); ops.def("Transpose", &Transpose); ops.def("TriangularSolve", &TriangularSolve); ops.def("Tuple", &Tuple); @@ -640,6 +748,6 @@ PYBIND11_MODULE(xla_extension, m) { py::class_(m, "ChannelHandle"); tensorflow::AddXrtSubmodule(&m); -} +} // NOLINT(readability/fn_size) } // namespace xla diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py index 7e5692fef30..63a9ea37692 100644 --- a/tensorflow/compiler/xla/python/xla_client.py +++ b/tensorflow/compiler/xla/python/xla_client.py @@ -59,6 +59,18 @@ class Backend(object): def device_count(self): """Returns the number of devices known to the backend.""" + @abc.abstractmethod + def local_device_count(self): + """Returns the number of devices local to this host.""" + + @abc.abstractmethod + def devices(self): + """Returns a list of `device_count()` Device subclasses.""" + + @abc.abstractmethod + def host_id(self): 
+ """Returns the integer ID of this host.""" + @abc.abstractmethod def buffer_from_pyval(self, pyval, device=0): """Allocates a fresh buffer and populates it with `pyval`.""" @@ -93,7 +105,16 @@ class LocalBackend(Backend): self.client = client def device_count(self): - return self.client.DeviceCount() + return self.client.device_count() + + def local_device_count(self): + return self.client.local_device_count() + + def devices(self): + return self.client.devices() + + def host_id(self): + return self.client.host_id() def buffer_from_pyval(self, pyval, device=0): return _xla.PyLocalBuffer.from_python(pyval, self.client, device) @@ -109,15 +130,25 @@ class LocalBackend(Backend): options.debug_options.xla_cpu_fast_math_honor_infs = True options.debug_options.xla_cpu_fast_math_honor_nans = True options.debug_options.xla_cpu_fast_math_honor_division = True + options.debug_options.xla_cpu_fast_math_honor_functions = True + options.debug_options.xla_gpu_enable_fast_min_max = False return _xla.LocalExecutable.Compile(c_computation, compile_options.argument_layouts, options, self.client, compile_options.device_assignment) +xla_platform_names = { + 'cpu': 'Host', + 'gpu': 'CUDA', +} + + def _cpu_backend_factory(): client = _xla.LocalClient.Get( - platform='cpu', xla_platform_id='Host', asynchronous=True) + platform='cpu', + xla_platform_id=xla_platform_names['cpu'], + asynchronous=True) return LocalBackend(platform='cpu', client=client) @@ -142,7 +173,9 @@ def _gpu_backend_factory(): config.preallocate = preallocate not in ('0', 'false', 'False') client = _xla.LocalClient.Get( - platform='gpu', xla_platform_id='CUDA', asynchronous=True, + platform='gpu', + xla_platform_id=xla_platform_names['gpu'], + asynchronous=True, allocator_config=config) return LocalBackend(platform='gpu', client=client) @@ -449,6 +482,9 @@ def computation_count(): """ +Device = _xla.Device + + class CompileOptions(object): """Python object for XLA compile options. @@ -544,6 +580,9 @@ class Computation(object): # def Execute(self, arguments : [Buffer]) -> Buffer: # """Execute on one replica with Buffer arguments and return value.""" # +# def SizeOfGeneratedCodeInBytes(self) -> int: +# """Return generated binary size, or -1 if not known.""" +# # def ExecutePerReplica(self, arguments: [[Buffer]]) -> [Buffer]: # """Execute on many replicas with Buffer arguments and return value. # @@ -1431,12 +1470,31 @@ class ComputationBuilder(object): batch_group_count, precision_config=precision_config) - def Sort(self, operand, dimension=-1): - """Enqueues a sort operation onto the computation.""" - return ops.Sort(self._builder, [operand], dimension) + def Sort(self, operands, dimension=-1, comparator=None): + """Enqueues a sort operation onto the computation. + + Args: + operands: either an XlaOp or a sequence of XlaOps to sort. All operands + must be arrays with the same dimensions. + dimension: the array dimension over which to sort. + comparator: a comparator XlaComputation. See the XLA operation semantics + for details. + + Returns: + Either an XlaOp or a tuple of XlaOps (if `operands` was an XlaOp or + a tuple of XlaOps, respectively.) + """ + operands = ( + list(operands) + if isinstance(operands, collections.Sequence) else [operands]) + return ops.Sort(self._builder, operands, dimension, + comparator.computation if comparator else None) def SortKeyVal(self, keys, values, dimension=-1): - """Enqueues a key-value sort operation onto the computation.""" + """Enqueues a key-value sort operation onto the computation. + + Deprecated. 
Use `Sort` instead. + """ return ops.Sort(self._builder, [keys, values], dimension) def QR(self, a, full_matrices=True): @@ -1470,11 +1528,27 @@ class ComputationBuilder(object): """Enqueues a singular value decomposition.""" return self.Tuple(*ops.SVD(a)) - def Scatter(self, a, scatter_indices, updates, update_computation, - dimension_numbers): + def Gather(self, + a, + start_indices, + dimension_numbers, + slice_sizes, + indices_are_sorted=False): + """Enqueues a Gather operation onto the computation.""" + return ops.Gather(a, start_indices, dimension_numbers, slice_sizes, + indices_are_sorted) + + def Scatter(self, + a, + scatter_indices, + updates, + update_computation, + dimension_numbers, + indices_are_sorted=False): """Enqueues a Scatter operation onto the computation.""" return ops.Scatter(a, scatter_indices, updates, - update_computation.computation, dimension_numbers) + update_computation.computation, dimension_numbers, + indices_are_sorted) def Fft(self, operand, fft_type, fft_lengths): """Enqueues a FFT operation onto the computation.""" @@ -1558,7 +1632,6 @@ _OTHER_OPS = [ 'CollectivePermute', 'ConvertElementType', 'Dot', - 'Gather', 'GetTupleElement', 'ReducePrecision', 'Rev', @@ -1592,14 +1665,18 @@ def _forward_methods_to_local_builder(): _forward_methods_to_local_builder() -def register_cpu_custom_call_target(name, fn): - """Registers a CPU custom call target. +def register_custom_call_target(name, fn, platform='cpu'): + """Registers a custom call target. Args: name: bytes containing the name of the function. fn: a PyCapsule object containing the function pointer. + platform: the target platform. """ - _xla.RegisterCpuCustomCallTarget(name, fn) + _xla.RegisterCustomCallTarget(name, fn, xla_platform_names[platform]) + +# Deprecated. Use register_custom_call_target instead. 
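# [Editor's sketch, not part of the patch] Registering a custom call through the
# new platform-aware entry point, mirroring what xla_client_test.py does below:
#
#   from tensorflow.compiler.xla.python import custom_call_for_test
#   for name, fn in custom_call_for_test.cpu_custom_call_targets.items():
#     register_custom_call_target(name, fn, platform='cpu')
#
# The alias that follows keeps the old register_cpu_custom_call_target(name, fn)
# spelling working for existing callers.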
+register_cpu_custom_call_target = register_custom_call_target class PaddingConfigDimension(object): diff --git a/tensorflow/compiler/xla/python/xla_client_test.py b/tensorflow/compiler/xla/python/xla_client_test.py index 16c1d4237a6..257e02ceec3 100644 --- a/tensorflow/compiler/xla/python/xla_client_test.py +++ b/tensorflow/compiler/xla/python/xla_client_test.py @@ -22,14 +22,14 @@ import functools import itertools import threading +from absl.testing import absltest import numpy as np from tensorflow.compiler.xla.python import custom_call_for_test from tensorflow.compiler.xla.python import xla_client -import unittest -class ComputationTest(unittest.TestCase): +class ComputationTest(absltest.TestCase): """Base class for running an XLA Computation through the local client.""" def _NewComputation(self, name=None): @@ -89,7 +89,7 @@ def NumpyArrayBool(*args, **kwargs): return np.array(*args, dtype=np.bool, **kwargs) -class ComputationPrinting(unittest.TestCase): +class ComputationPrinting(absltest.TestCase): def ExampleComputation(self): builder = xla_client.ComputationBuilder("acomputation") @@ -311,7 +311,7 @@ class ComputationsWithConstantsTest(ComputationTest): def testCustomCall(self): c = self._NewComputation() for name, fn in custom_call_for_test.cpu_custom_call_targets.items(): - xla_client.register_cpu_custom_call_target(name, fn) + xla_client.register_custom_call_target(name, fn, platform="cpu") c.CustomCall( b"test_subtract_f32", operands=(c.ConstantF32Scalar(1.25), c.ConstantF32Scalar(0.5)), @@ -448,14 +448,14 @@ class BufferTest(ComputationTest): local_buffer = xla_client.Buffer.from_pyval(t) pieces = local_buffer.destructure() self.assertFalse(local_buffer.is_deleted()) - self.assertEqual(len(pieces), 0) + self.assertEmpty(pieces) def testDestructureTupleOneArrayElement(self): t = (np.array([1, 2, 3, 4], dtype=np.int32),) local_buffer = xla_client.Buffer.from_pyval(t) pieces = local_buffer.destructure() self.assertFalse(local_buffer.is_deleted()) - self.assertEqual(len(pieces), 1) + self.assertLen(pieces, 1) array = pieces[0] got = array.to_py() want = NumpyArrayS32([1, 2, 3, 4]) @@ -472,7 +472,7 @@ class BufferTest(ComputationTest): for _ in range(2): pieces = local_buffer.destructure() self.assertFalse(local_buffer.is_deleted()) - self.assertEqual(len(pieces), 2) + self.assertLen(pieces, 2) array0, array1 = pieces got = array0.to_py() want = NumpyArrayF32([1.0, 2.0, 3.0, 4.0]) @@ -486,14 +486,14 @@ class BufferTest(ComputationTest): local_buffer = xla_client.Buffer.from_pyval(t) pieces = local_buffer.destructure() self.assertFalse(local_buffer.is_deleted()) - self.assertEqual(len(pieces), 2) + self.assertLen(pieces, 2) tuple0, array1 = pieces got = array1.to_py() want = NumpyArrayS32([5]) np.testing.assert_equal(want, got) got = tuple0.to_py() self.assertEqual(type(got), tuple) - self.assertEqual(len(got), 2) + self.assertLen(got, 2) np.testing.assert_equal(NumpyArrayF32([1.0, 2.0]), got[0]) np.testing.assert_equal(NumpyArrayS32([3, 4]), got[1]) @@ -506,7 +506,7 @@ class BufferTest(ComputationTest): b1 = xla_client.Buffer.from_pyval(t[1]) btup = xla_client.Buffer.make_tuple([b0, b1], device=0) pieces = btup.destructure() - self.assertEqual(len(pieces), 2) + self.assertLen(pieces, 2) array0, array1 = pieces np.testing.assert_equal( np.array([1, 2, 3, 4], dtype=np.float32), array0.to_py()) @@ -699,7 +699,7 @@ class SingleOpTest(ComputationTest): rhs = NumpyArrayF32(rng.randn(10, 4, 5)) dimension_numbers = (([2], [1]), ([0], [0])) c.DotGeneral(c.Constant(lhs), c.Constant(rhs), 
dimension_numbers) - self._ExecuteAndCompareClose(c, expected=np.matmul(lhs, rhs)) + self._ExecuteAndCompareClose(c, expected=np.matmul(lhs, rhs), rtol=1e-6) def testDotGeneralWithDotDimensionNumbersProto(self): c = self._NewComputation() @@ -714,7 +714,7 @@ class SingleOpTest(ComputationTest): dimension_numbers.rhs_batch_dimensions.append(0) c.DotGeneral(c.Constant(lhs), c.Constant(rhs), dimension_numbers) - self._ExecuteAndCompareClose(c, expected=np.matmul(lhs, rhs)) + self._ExecuteAndCompareClose(c, expected=np.matmul(lhs, rhs), rtol=1e-6) def testDotGeneralWithPrecisionConfig(self): c = self._NewComputation() @@ -730,7 +730,7 @@ class SingleOpTest(ComputationTest): c.Constant(rhs), dimension_numbers, precision_config=config) - self._ExecuteAndCompareClose(c, expected=np.matmul(lhs, rhs)) + self._ExecuteAndCompareClose(c, expected=np.matmul(lhs, rhs), rtol=1e-6) def testConvF32Same(self): c = self._NewComputation() @@ -1222,7 +1222,7 @@ class SingleOpTest(ComputationTest): result = xla_client.execute_with_python_values(c.Build().Compile()) # since the result is random, we just check shape and uniqueness self.assertEqual(result.shape, shape) - self.assertEqual(len(np.unique(result)), np.prod(shape)) + self.assertLen(np.unique(result), np.prod(shape)) def testRngUniformF32(self): lo, hi = 2., 4. @@ -1235,7 +1235,7 @@ class SingleOpTest(ComputationTest): result = xla_client.execute_with_python_values(c.Build().Compile()) # since the result is random, we just check shape, uniqueness, and range self.assertEqual(result.shape, shape) - self.assertEqual(len(np.unique(result)), np.prod(shape)) + self.assertLen(np.unique(result), np.prod(shape)) self.assertTrue(np.all(lo <= result)) self.assertTrue(np.all(result < hi)) @@ -1272,12 +1272,32 @@ class SingleOpTest(ComputationTest): keys = np.array([[2, 4, 1, 3], [3, 1, 4, 2]], dtype=np.float32) values = np.array([[0, 1, 2, 3], [4, 5, 6, 7]], dtype=np.int32) c = self._NewComputation() - c.SortKeyVal(c.Constant(keys), c.Constant(values), dimension=0) + c.Sort((c.Constant(keys), c.Constant(values)), dimension=0) result = xla_client.execute_with_python_values(c.Build().Compile()) self.assertIsInstance(result, tuple) np.testing.assert_allclose(result[0], [[2, 1, 1, 2], [3, 4, 4, 3]]) np.testing.assert_equal(result[1], [[0, 5, 2, 7], [4, 1, 6, 3]]) + def testSortCustomComparator(self): + b = self._NewComputation("comparator") + p0 = b.ParameterFromNumpy(NumpyArrayF32(0)) + q0 = b.ParameterFromNumpy(NumpyArrayF32(0)) + p1 = b.ParameterFromNumpy(NumpyArrayS32(0)) + q1 = b.ParameterFromNumpy(NumpyArrayS32(0)) + b.Or(b.Lt(p0, q0), b.And(b.Eq(p0, q0), b.Gt(p1, q1))) + comparator = b.Build() + + keys = np.array([[2, 3, 1, 3], [3, 1, 2, 2]], dtype=np.float32) + values = np.array([[0, 1, 2, 3], [4, 5, 6, 7]], dtype=np.int32) + c = self._NewComputation() + c.Sort((c.Constant(keys), c.Constant(values)), + dimension=1, + comparator=comparator) + result = xla_client.execute_with_python_values(c.Build().Compile()) + self.assertIsInstance(result, tuple) + np.testing.assert_allclose(result[0], [[1, 2, 3, 3], [1, 2, 2, 3]]) + np.testing.assert_equal(result[1], [[2, 0, 3, 1], [5, 7, 6, 4]]) + def testQR(self): a = np.array( [[4, 6, 8, 10], [6, 45, 54, 63], [8, 54, 146, 166], [10, 63, 166, 310]], @@ -1923,4 +1943,4 @@ class ComputationRootTest(ComputationTest): if __name__ == "__main__": - unittest.main() + absltest.main() diff --git a/tensorflow/compiler/xla/python/xrt.py b/tensorflow/compiler/xla/python/xrt.py index 40dea45e442..7ab2afa19d4 100644 --- 
a/tensorflow/compiler/xla/python/xrt.py +++ b/tensorflow/compiler/xla/python/xrt.py @@ -61,6 +61,15 @@ class XrtBackend(xla_client.Backend): def device_count(self): return self.context.DeviceCount() + def local_device_count(self): + raise NotImplementedError() + + def devices(self): + raise NotImplementedError() + + def host_id(self): + raise NotImplementedError() + def buffer_from_pyval(self, pyval, device=0): return _xla.xrt.XrtBuffer.from_literal(self.context, device, pyval) diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD old mode 100644 new mode 100755 index c4af8863c05..c14048a18d6 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -4,10 +4,18 @@ load("//tensorflow/compiler/xla/tests:build_defs.bzl", "xla_test") load("//tensorflow/compiler/xla:xla.bzl", "xla_proto_library") load( - "//tensorflow/core:platform/default/build_config.bzl", + "//tensorflow/core/platform:default/build_config.bzl", "tf_proto_library_py", ) load("//tensorflow:tensorflow.bzl", "tf_cc_test") +load( + "//tensorflow/core/platform:default/cuda_build_defs.bzl", + "if_cuda_is_configured", +) +load( + "@local_config_rocm//rocm:build_defs.bzl", + "if_rocm_is_configured", +) package( default_visibility = [":friends"], @@ -290,6 +298,64 @@ cc_library( ], ) +cc_library( + name = "hlo_live_range", + srcs = [ + "hlo_live_range.cc", + ], + hdrs = [ + "hlo_live_range.h", + ], + deps = [ + ":hlo", + ":hlo_alias_analysis", + ":hlo_buffer", + ":hlo_dataflow_analysis", + ":hlo_ordering", + ":logical_buffer", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:util", + "//tensorflow/core:lib", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + ], +) + +tf_cc_test( + name = "hlo_live_range_test", + srcs = ["hlo_live_range_test.cc"], + deps = [ + ":call_graph", + ":hlo", + ":hlo_alias_analysis", + ":hlo_live_range", + ":hlo_memory_scheduler", + ":hlo_ordering", + ":hlo_parser", + ":hlo_value", + "//tensorflow/compiler/xla:literal", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla:test_helpers", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + "//tensorflow/core:lib", + "//tensorflow/core:test", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings:str_format", + ], +) + tf_cc_test( name = "hlo_evaluator_test", srcs = ["hlo_evaluator_test.cc"], @@ -565,8 +631,10 @@ tf_cc_test( "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:window_util", + "//tensorflow/compiler/xla/service/gpu:backend_configs", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", + "//tensorflow/core:test", "@com_google_absl//absl/container:flat_hash_map", ], ) @@ -860,10 +928,25 @@ cc_library( name = "gpu_plugin", deps = [ ":service", + "//tensorflow/compiler/xla/service/gpu:gpu_compiler", "//tensorflow/compiler/xla/service/gpu:gpu_transfer_manager", - 
"//tensorflow/compiler/xla/service/gpu:nvptx_compiler", "//tensorflow/core:stream_executor_no_cuda", + ] + if_cuda_is_configured([ + "//tensorflow/compiler/xla/service/gpu:nvptx_compiler", "//tensorflow/core/platform/default/build_config:stream_executor_cuda", + ]) + if_rocm_is_configured([ + "//tensorflow/compiler/xla/service/gpu:amdgpu_compiler", + "//tensorflow/core/platform/default/build_config:stream_executor_rocm", + ]), +) + +cc_library( + name = "mlir_gpu_plugin", + deps = [ + ":service", + "//tensorflow/compiler/xla/service/gpu:gpu_transfer_manager", + "//tensorflow/compiler/xla/service/mlir_gpu:mlir_compiler", + "//tensorflow/core:stream_executor_no_cuda", ], ) @@ -950,6 +1033,7 @@ cc_library( "//tensorflow/core:lib_internal", "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/stream_executor", + "//tensorflow/stream_executor:device_description", "//tensorflow/stream_executor:device_memory_allocator", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings:str_format", @@ -1111,8 +1195,10 @@ cc_library( ":hlo_alias_analysis", ":hlo_buffer", ":hlo_dataflow_analysis", + ":hlo_live_range", ":hlo_proto", ":logical_buffer", + ":memory_space_assignment", ":tuple_points_to_analysis", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", @@ -1213,6 +1299,7 @@ cc_library( ":hlo_alias_analysis", ":hlo_buffer", ":hlo_dataflow_analysis", + ":hlo_live_range", ":hlo_ordering", ":hlo_proto", ":tuple_points_to_analysis", @@ -1424,6 +1511,7 @@ cc_library( hdrs = ["fusion_queue.h"], deps = [ ":hlo", + "@com_google_absl//absl/strings", ], ) @@ -1679,6 +1767,7 @@ cc_library( ":hlo", ":hlo_casting_utils", ":hlo_creation_utils", + ":hlo_evaluator", ":hlo_pass", ":hlo_query", ":pattern_matcher", @@ -1692,6 +1781,39 @@ cc_library( "//tensorflow/compiler/xla:window_util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/core:lib", + "//tensorflow/stream_executor/lib", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/container:inlined_vector", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:optional", + "@com_google_absl//absl/types:span", + ], +) + +cc_library( + name = "tree_reduction_rewriter", + srcs = ["tree_reduction_rewriter.cc"], + hdrs = ["tree_reduction_rewriter.h"], + deps = [ + ":hlo", + ":hlo_casting_utils", + ":hlo_creation_utils", + ":hlo_evaluator", + ":hlo_pass", + ":shape_inference", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla:window_util", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/client:padding", + "//tensorflow/core:lib", + "//tensorflow/stream_executor/lib", "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", @@ -1891,6 +2013,41 @@ tf_cc_test( ], ) +cc_library( + name = "depthwise_convolution_converter", + srcs = ["depthwise_convolution_converter.cc"], + hdrs = ["depthwise_convolution_converter.h"], + deps = [ + ":hlo", + ":hlo_pass", + "//tensorflow/compiler/xla:literal", + "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:types", + 
"//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/core:lib", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + ], +) + +tf_cc_test( + name = "depthwise_convolution_converter_test", + size = "small", + srcs = ["depthwise_convolution_converter_test.cc"], + deps = [ + ":depthwise_convolution_converter", + ":hlo", + ":hlo_matchers", + ":hlo_parser", + "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla/tests:hlo_test_base", + ], +) + cc_library( name = "while_loop_analysis", srcs = ["while_loop_analysis.cc"], @@ -2096,13 +2253,14 @@ cc_library( hdrs = ["dynamic_dimension_inference.h"], deps = [ ":hlo", + ":hlo_casting_utils", ":while_util", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:status", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:window_util", - "//tensorflow/core:lib", + "//tensorflow/core/platform:macros", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/types:span", ], @@ -2782,6 +2940,30 @@ tf_cc_test( ], ) +cc_library( + name = "memory_space_assignment", + srcs = ["memory_space_assignment.cc"], + hdrs = ["memory_space_assignment.h"], + deps = [ + ":heap_simulator", + ":hlo_pass", + ], +) + +tf_cc_test( + name = "memory_space_assignment_test", + srcs = ["memory_space_assignment_test.cc"], + deps = [ + ":hlo", + ":hlo_matchers", + ":memory_space_assignment", + "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + "//tensorflow/core:test", + ], +) + cc_library( name = "hlo_dce", srcs = ["hlo_dce.cc"], @@ -4221,3 +4403,18 @@ cc_library( "//tensorflow/compiler/xla/client/lib:prng", ], ) + +cc_library( + name = "slow_operation_alarm", + srcs = ["slow_operation_alarm.cc"], + hdrs = ["slow_operation_alarm.h"], + deps = [ + "//tensorflow/compiler/xla:types", + "//tensorflow/core:lib", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/synchronization", + "@com_google_absl//absl/time", + ], +) diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc index eef570e2540..077b76c4c64 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc @@ -43,6 +43,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_casting_utils.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_creation_utils.h" +#include "tensorflow/compiler/xla/service/hlo_evaluator.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_instructions.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" @@ -60,6 +61,7 @@ limitations under the License. #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/stream_executor/lib/statusor.h" namespace xla { @@ -170,6 +172,10 @@ bool IsUnstridedSlice(const HloInstruction* hlo) { // more general case a worklist based approach would be needed. 
class AlgebraicSimplifierVisitor : public DfsHloRewriteVisitor { public: + explicit AlgebraicSimplifierVisitor(const AlgebraicSimplifierOptions& options, + AlgebraicSimplifier* simplifier) + : options_(options), simplifier_(simplifier) {} + Status HandleAdd(HloInstruction* add) override; Status HandleAnd(HloInstruction* logical_and) override; @@ -204,10 +210,18 @@ class AlgebraicSimplifierVisitor : public DfsHloRewriteVisitor { Status HandleDot(HloInstruction* dot) override; + Status HandleGather(HloInstruction* gather) override; + Status HandleGetTupleElement(HloInstruction* get_tuple_element) override; Status HandleLog(HloInstruction* log) override; + Status HandleMaximum(HloInstruction* maximum) override; + + Status HandleMinimum(HloInstruction* minimum) override; + + Status HandleClamp(HloInstruction* clamp) override; + Status HandleMultiply(HloInstruction* multiply) override; Status HandleNegate(HloInstruction* negate) override; @@ -224,7 +238,7 @@ class AlgebraicSimplifierVisitor : public DfsHloRewriteVisitor { Status HandleReshape(HloInstruction* reshape) override; - Status HandleReduce(HloInstruction* reduce) override; + Status HandleReduce(HloInstruction* hlo) override; Status HandleReduceWindow(HloInstruction* reduce_window) override; @@ -246,16 +260,11 @@ class AlgebraicSimplifierVisitor : public DfsHloRewriteVisitor { Status HandleMap(HloInstruction* map) override; // Runs the visitor on a computation. - static bool Run(HloComputation* computation, - const AlgebraicSimplifierOptions& options, - AlgebraicSimplifier* simplifier); + bool Run(HloComputation* computation, + const AlgebraicSimplifierOptions& options, + AlgebraicSimplifier* simplifier); private: - explicit AlgebraicSimplifierVisitor(HloComputation* computation, - const AlgebraicSimplifierOptions& options, - AlgebraicSimplifier* simplifier) - : computation_(computation), options_(options), simplifier_(simplifier) {} - // Removes degenerate dimension from dot. StatusOr RemoveDegenerateDimensionFromDot(HloInstruction* dot); @@ -385,6 +394,9 @@ class AlgebraicSimplifierVisitor : public DfsHloRewriteVisitor { // Tries to convert slice(reshape(X)) into reshape(slice(X)) StatusOr TryToReorderSliceAndReshape(HloInstruction* slice); + // Useful when we want to use the same visitor over multiple computations. + void ResetState(HloComputation* computation); + // Current HloComputation instance the AlgebraicSimplifierVisitor is // traversing. 
HloComputation* computation_; @@ -403,12 +415,18 @@ class AlgebraicSimplifierVisitor : public DfsHloRewriteVisitor { } // namespace +void AlgebraicSimplifierVisitor::ResetState(HloComputation* computation) { + changed_ = false; + ResetVisitStates(); + computation_ = computation; +} + bool AlgebraicSimplifierVisitor::Run(HloComputation* computation, const AlgebraicSimplifierOptions& options, AlgebraicSimplifier* simplifier) { - AlgebraicSimplifierVisitor visitor(computation, options, simplifier); - TF_CHECK_OK(computation->Accept(&visitor)); - return visitor.changed_ || visitor.changed(); + ResetState(computation); + TF_CHECK_OK(computation->Accept(this)); + return changed_ || changed(); } bool AlgebraicSimplifierVisitor::SameShape(const HloInstruction* lhs, @@ -431,8 +449,8 @@ void AlgebraicSimplifierVisitor::ReplaceWithBitcast(HloInstruction* instruction, CHECK_EQ(ShapeUtil::ByteSizeOf(instruction->shape()), ShapeUtil::ByteSizeOf(operand->shape())); - auto bitcast = computation_->AddInstruction(HloInstruction::CreateUnary( - instruction->shape(), HloOpcode::kBitcast, operand)); + auto bitcast = computation_->AddInstruction( + HloInstruction::CreateBitcast(instruction->shape(), operand)); TF_CHECK_OK(ReplaceInstruction(instruction, bitcast)); } @@ -573,8 +591,7 @@ Status AlgebraicSimplifierVisitor::HandleBitcast(HloInstruction* bitcast) { HloInstruction* op; if (Match(bitcast, m::Bitcast(m::Bitcast(m::Op(&op))))) { return ReplaceWithNewInstruction( - bitcast, - HloInstruction::CreateUnary(bitcast->shape(), HloOpcode::kBitcast, op)); + bitcast, HloInstruction::CreateBitcast(bitcast->shape(), op)); } // All bitcasts can be eliminated (assuming layout constraints are // satisified). @@ -1875,6 +1892,175 @@ Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot) { return Status::OK(); } +Status AlgebraicSimplifierVisitor::HandleGather(HloInstruction* gather) { + const Shape& operand_shape = gather->operand(0)->shape(); + // If the operand of a gather is very small, it is easier to fuse a + // sequence of selects. 
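  // Editor's note: with the default very_small_gather_size of 4, gathering from
  // an f32[4] operand unrolls into three selects, each taking the next operand
  // element wherever the gather indices are at least the running index i and
  // keeping the previously accumulated value elsewhere.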
+ if (operand_shape.rank() == 1 && + operand_shape.dimensions(0) <= options_.very_small_gather_size() && + gather->gather_dimension_numbers().index_vector_dim() == + gather->operand(1)->shape().rank() && + gather->gather_dimension_numbers().collapsed_slice_dims_size() == 1) { + const Shape& index_shape = gather->operand(1)->shape(); + const int64 operand_elements = operand_shape.dimensions(0); + auto get_value = [&](int64 i) { + auto slice = computation_->AddInstruction(HloInstruction::CreateSlice( + ShapeUtil::MakeShape(operand_shape.element_type(), {1}), + gather->mutable_operand(0), {i}, {i + 1}, {1})); + auto scalar = computation_->AddInstruction(HloInstruction::CreateReshape( + ShapeUtil::MakeShape(operand_shape.element_type(), {}), slice)); + return computation_->AddInstruction( + HloInstruction::CreateBroadcast(gather->shape(), scalar, {})); + }; + auto result = get_value(0); + auto one = computation_->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::One(index_shape.element_type()))); + auto index = one; + auto pred_shape = ShapeUtil::ChangeElementType(gather->shape(), PRED); + auto iter_shape = ShapeUtil::ChangeElementType(gather->shape(), + index_shape.element_type()); + for (int64 i = 1; i < operand_elements; ++i) { + auto broadcasted_index = computation_->AddInstruction( + HloInstruction::CreateBroadcast(iter_shape, index, {})); + auto index_mask = + computation_->AddInstruction(HloInstruction::CreateCompare( + pred_shape, gather->mutable_operand(1), broadcasted_index, + ComparisonDirection::kGe)); + result = computation_->AddInstruction( + HloInstruction::CreateTernary(gather->shape(), HloOpcode::kSelect, + index_mask, get_value(i), result)); + index = computation_->AddInstruction(HloInstruction::CreateBinary( + index->shape(), HloOpcode::kAdd, index, one)); + } + return ReplaceInstruction(gather, result); + } + return Status::OK(); +} + +namespace { +StatusOr> MinMaxToClamp( + HloInstruction* clamp_lower_bound_bcast, HloInstruction* to_clamp, + HloInstruction* clamp_upper_bound_bcast) { + HloInstruction* clamp_lower_bound; + CHECK(Match(clamp_lower_bound_bcast, + m::Broadcast(m::ConstantEffectiveScalar(&clamp_lower_bound)))) + << clamp_lower_bound_bcast->ToString(); + + HloInstruction* clamp_upper_bound; + CHECK(Match(clamp_upper_bound_bcast, + m::Broadcast(m::ConstantEffectiveScalar(&clamp_upper_bound)))) + << clamp_upper_bound_bcast->ToString(); + + const Literal& lower_bound = + Cast(clamp_lower_bound)->literal(); + const Literal& upper_bound = + Cast(clamp_upper_bound)->literal(); + + std::unique_ptr lower_bound_instr = + HloInstruction::CreateConstant(lower_bound.Clone()); + std::unique_ptr upper_bound_instr = + HloInstruction::CreateConstant(upper_bound.Clone()); + + std::unique_ptr cloned_instruction = + HloInstruction::CreateCompare( + ShapeUtil::ChangeElementType(lower_bound_instr->shape(), PRED), + lower_bound_instr.get(), upper_bound_instr.get(), + ComparisonDirection::kLt); + + HloEvaluator evaluator; + TF_ASSIGN_OR_RETURN(auto result, + evaluator.Evaluate(cloned_instruction.get())); + if (result.IsAll(true)) { + return HloInstruction::CreateTernary(to_clamp->shape(), HloOpcode::kClamp, + clamp_lower_bound_bcast, to_clamp, + clamp_upper_bound_bcast); + } + return std::unique_ptr(); +} +} // namespace + +Status AlgebraicSimplifierVisitor::HandleMaximum(HloInstruction* maximum) { + HloInstruction *lhs, *rhs; + CHECK(Match(maximum, m::Maximum(m::Op(&lhs), m::Op(&rhs)))); + + HloInstruction* clamp_upper_bound_bcast; + HloInstruction* 
clamp_lower_bound_bcast; + HloInstruction* to_clamp; + if (Match(maximum, m::MaximumAnyOrder( + m::Broadcast(&clamp_lower_bound_bcast, + m::ConstantEffectiveScalar()), + m::MinimumAnyOrder( + m::Op(&to_clamp), + m::Broadcast(&clamp_upper_bound_bcast, + m::ConstantEffectiveScalar()))))) { + TF_ASSIGN_OR_RETURN(auto clamp, + MinMaxToClamp(clamp_lower_bound_bcast, to_clamp, + clamp_upper_bound_bcast)); + if (clamp) { + return ReplaceWithNewInstruction(maximum, std::move(clamp)); + } + } + + HloInstruction* clamp_lower_bound; + HloInstruction* clamp_upper_bound; + HloInstruction* max_operand; + HloInstruction* clamp; + if (Match(maximum, + m::MaximumAnyOrder( + m::Op(&max_operand), + m::Clamp(&clamp, m::Op(&clamp_lower_bound), m::Op(&to_clamp), + m::Op(&clamp_upper_bound))))) { + if (max_operand == clamp_lower_bound && + ReplaceInstructionIfSameShape(maximum, clamp)) { + return Status::OK(); + } + } + + return Status::OK(); +} + +Status AlgebraicSimplifierVisitor::HandleMinimum(HloInstruction* minimum) { + HloInstruction *lhs, *rhs; + CHECK(Match(minimum, m::Minimum(m::Op(&lhs), m::Op(&rhs)))); + + HloInstruction* clamp_upper_bound_bcast; + HloInstruction* clamp_lower_bound_bcast; + HloInstruction* to_clamp; + if (Match(minimum, m::MinimumAnyOrder( + m::Broadcast(&clamp_upper_bound_bcast, + m::ConstantEffectiveScalar()), + m::MaximumAnyOrder( + m::Op(&to_clamp), + m::Broadcast(&clamp_lower_bound_bcast, + m::ConstantEffectiveScalar()))))) { + TF_ASSIGN_OR_RETURN(auto clamp, + MinMaxToClamp(clamp_lower_bound_bcast, to_clamp, + clamp_upper_bound_bcast)); + if (clamp) { + return ReplaceWithNewInstruction(minimum, std::move(clamp)); + } + } + + return Status::OK(); +} + +Status AlgebraicSimplifierVisitor::HandleClamp(HloInstruction* clamp) { + HloInstruction* clamp_lower_bound; + HloInstruction* clamp_upper_bound; + HloInstruction* to_clamp; + CHECK(Match(clamp, m::Clamp(m::Op(&clamp_lower_bound), m::Op(&to_clamp), + m::Op(&clamp_upper_bound)))); + + // clamp(a, clamp(a, x, b), b) -> clamp(a, x, b) + if (Match(to_clamp, m::Clamp(m::Op().Is(clamp_lower_bound), m::Op(), + m::Op().Is(clamp_upper_bound))) && + ReplaceInstructionIfSameShape(clamp, to_clamp)) { + return Status::OK(); + } + + return Status::OK(); +} + Status AlgebraicSimplifierVisitor::HandleMultiply(HloInstruction* multiply) { HloInstruction *lhs, *rhs; CHECK(Match(multiply, m::Multiply(m::Op(&lhs), m::Op(&rhs)))); @@ -2385,9 +2571,11 @@ Status AlgebraicSimplifierVisitor::HandlePad(HloInstruction* pad) { TF_ASSIGN_OR_RETURN( HloInstruction * slice, MakeSliceHlo(nonzero_pad, start_indices, end_indices, strides)); + TF_RETURN_IF_ERROR(LayoutUtil::CopyLayoutBetweenShapes( + pad->shape(), slice->mutable_shape())); // Verify that the slice shape matches the pad shape. - TF_RET_CHECK(ShapeUtil::Compatible(slice->shape(), pad->shape())); + TF_RET_CHECK(ShapeUtil::Equal(slice->shape(), pad->shape())); return ReplaceInstruction(pad, slice); } @@ -2699,9 +2887,9 @@ Status AlgebraicSimplifierVisitor::HandleRemainder(HloInstruction* remainder) { // this. But that's OK for our purposes here.) 
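  // Editor's note: e.g. remainder(iota, 8) with an s32 iota of length 8 is just
  // the iota itself, since its values 0..7 never reach the divisor.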
int64 iota_upper_bound = iota->shape().dimensions( Cast(iota)->iota_dimension()); - StatusOr divisor_val = divisor->literal().GetIntegralAsS64( + absl::optional divisor_val = divisor->literal().GetIntegralAsS64( std::vector(0, divisor->shape().dimensions_size())); - if (divisor_val.ok() && divisor_val.ValueOrDie() >= iota_upper_bound) { + if (divisor_val && *divisor_val >= iota_upper_bound) { return ReplaceInstruction(remainder, iota); } } @@ -2727,12 +2915,12 @@ Status AlgebraicSimplifierVisitor::HandleRemainder(HloInstruction* remainder) { // smaller. int64 iota_upper_bound = iota->shape().dimensions( Cast(iota)->iota_dimension()); - StatusOr divisor_val = divisor->literal().GetIntegralAsS64( + absl::optional divisor_val = divisor->literal().GetIntegralAsS64( std::vector(0, divisor->shape().dimensions_size())); - if (divisor_val.ok()) { + if (divisor_val) { // Check whether divisor_val + iota_upper_bound - 1 overflows. absl::optional max_val = - OverflowSafeAdd(divisor_val.ValueOrDie(), iota_upper_bound); + OverflowSafeAdd(*divisor_val, iota_upper_bound); if (max_val.has_value() && FitsInIntegralType(*max_val, iota->shape().element_type())) { return ReplaceWithNewInstruction( @@ -3026,7 +3214,11 @@ Status AlgebraicSimplifierVisitor::HandleSlice(HloInstruction* slice) { return Status::OK(); } - TF_ASSIGN_OR_RETURN(replaced, TryToReorderSliceAndReshape(slice)); + // Do not try to reorder slices and reshapes after layout assignment as it may + // be invalid. + if (!options_.is_layout_sensitive()) { + TF_ASSIGN_OR_RETURN(replaced, TryToReorderSliceAndReshape(slice)); + } if (replaced) { return Status::OK(); } @@ -3807,7 +3999,7 @@ StatusOr AlgebraicSimplifierVisitor::SimplifyConvToDot( std::vector dims(operand->shape().dimensions_size()); std::iota(dims.begin(), dims.end(), 0); return computation_->AddInstruction( - HloInstruction::CreateUnary(shape, HloOpcode::kBitcast, operand)); + HloInstruction::CreateBitcast(shape, operand)); }; // Replace it with a dot, with bitcasts around it to get the right shape. @@ -3946,8 +4138,9 @@ StatusOr AlgebraicSimplifier::Run(HloModule* module) { XLA_VLOG_LINES(2, "AlgebraicSimplifier::Run(), before:\n" + module->ToString()); bool changed = false; + AlgebraicSimplifierVisitor visitor(options_, this); for (auto* comp : module->MakeNonfusionComputations()) { - if (AlgebraicSimplifierVisitor::Run(comp, options_, this)) { + if (visitor.Run(comp, options_, this)) { changed = true; } } diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.h b/tensorflow/compiler/xla/service/algebraic_simplifier.h index 37ea35ade0d..74d8b1d4582 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier.h +++ b/tensorflow/compiler/xla/service/algebraic_simplifier.h @@ -92,6 +92,13 @@ class AlgebraicSimplifierOptions { return enable_window_reduce_to_reduce_replacement_; } + // Sets the size of a gather operand that can be unrolled into many selects. + void set_very_small_gather_size(int64 size) { + very_small_gather_size_ = size; + } + + int64 very_small_gather_size() const { return very_small_gather_size_; } + private: ReshapeIsBitcastCallback reshape_is_bitcast_callback_; bool is_layout_sensitive_{false}; @@ -99,6 +106,7 @@ class AlgebraicSimplifierOptions { bool enable_dot_to_multiply_rewrite_{true}; bool enable_conv_simplification_{true}; bool enable_window_reduce_to_reduce_replacement_{true}; + int64 very_small_gather_size_{4}; }; // A pass which performs algebraic simplifications. 
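[Editor's note] An illustrative Python-client sketch (not part of the patch) of the min/max pattern that the new HandleMaximum/HandleMinimum rewrites above canonicalize into a clamp; the expected output is the same whether or not the rewrite fires:

import numpy as np
from tensorflow.compiler.xla.python import xla_client

b = xla_client.ComputationBuilder("min_max_to_clamp_example")
x = b.Constant(np.array([1.0, 3.5, 5.0, 2.0], dtype=np.float32))
# maximum(3, x) followed by minimum(., 4) matches the broadcast-scalar pattern
# above once the builder broadcasts the scalars, and may become clamp(3, x, 4).
b.Min(b.Max(b.ConstantF32Scalar(3.0), x), b.ConstantF32Scalar(4.0))
result = xla_client.execute_with_python_values(b.Build().Compile())
# Expected result: [3.0, 3.5, 4.0, 3.0]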
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc index 4c5e5ef9e7e..230a5a1c058 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc @@ -5543,5 +5543,103 @@ TEST_F(AlgebraicSimplifierTest, RepeatedRemainder) { GmockMatch(m::Remainder(m::Parameter(), m::Parameter()))); } +TEST_F(AlgebraicSimplifierTest, SlicePadLayout) { + const char* kModuleStr = R"( + HloModule m + test { + %param.0 = f32[128,9,9,1024]{0,3,2,1} parameter(0) + %param.1 = f32[] parameter(1) + %slice = f32[128,9,9,1024]{0,3,2,1} slice(%param.0), + slice={[0:128], [0:9], [0:9], [0:1024]} + ROOT %pad = f32[128,8,9,1024]{0,3,2,1} pad(%slice, %param.1), + padding=0_0x-1_0x0_0x0_0 + })"; + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr)); + const Shape root_shape = m->entry_computation()->root_instruction()->shape(); + AlgebraicSimplifierOptions options; + options.set_is_layout_sensitive(true); + ASSERT_TRUE(AlgebraicSimplifier(options).Run(m.get()).ValueOrDie()); + EXPECT_THAT(m->entry_computation()->root_instruction(), + GmockMatch(m::Slice().WithShapeEqualTo(&root_shape))); +} + +TEST_F(AlgebraicSimplifierTest, MinOfMaxToClamp) { + const char* kModuleStr = R"( + HloModule m + test { + p0 = f32[4] parameter(0) + c0 = f32[] constant(3.0) + c1 = f32[] constant(4.0) + b0 = f32[4] broadcast(c0), dimensions={} + b1 = f32[4] broadcast(c1), dimensions={} + m0 = f32[4] maximum(b0, p0) + ROOT m1 = f32[4] minimum(m0, b1) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr)); + ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie()); + EXPECT_THAT( + m->entry_computation()->root_instruction(), + GmockMatch(m::Clamp(m::Broadcast(m::ConstantScalar(3.0)), m::Parameter(0), + m::Broadcast(m::ConstantScalar(4.0))))); +} + +TEST_F(AlgebraicSimplifierTest, MaxOfMinToClamp) { + const char* kModuleStr = R"( + HloModule m + test { + p0 = f32[4] parameter(0) + c0 = f32[] constant(3.0) + c1 = f32[] constant(4.0) + b0 = f32[4] broadcast(c0), dimensions={} + b1 = f32[4] broadcast(c1), dimensions={} + m0 = f32[4] minimum(p0, b1) + ROOT m1 = f32[4] maximum(b0, m0) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr)); + ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie()); + EXPECT_THAT( + m->entry_computation()->root_instruction(), + GmockMatch(m::Clamp(m::Broadcast(m::ConstantScalar(3.0)), m::Parameter(0), + m::Broadcast(m::ConstantScalar(4.0))))); +} + +TEST_F(AlgebraicSimplifierTest, ClampOfClamp) { + const char* kModuleStr = R"( + HloModule m + test { + p0 = f32[] parameter(0) + p1 = f32[] parameter(1) + p2 = f32[] parameter(2) + c0 = f32[] clamp(p0, p1, p2) + ROOT c1 = f32[] clamp(p0, c0, p2) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr)); + ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie()); + EXPECT_THAT( + m->entry_computation()->root_instruction(), + GmockMatch(m::Clamp(m::Parameter(0), m::Parameter(1), m::Parameter(2)))); +} + +TEST_F(AlgebraicSimplifierTest, MaxOfClamp) { + const char* kModuleStr = R"( + HloModule m + test { + p0 = f32[] parameter(0) + p1 = f32[] parameter(1) + p2 = f32[] parameter(2) + c0 = f32[] clamp(p0, p1, p2) + ROOT m0 = f32[] maximum(p0, c0) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr)); + 
ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie()); + EXPECT_THAT( + m->entry_computation()->root_instruction(), + GmockMatch(m::Clamp(m::Parameter(0), m::Parameter(1), m::Parameter(2)))); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/batchnorm_expander.h b/tensorflow/compiler/xla/service/batchnorm_expander.h index 147f3ae7b6d..9c19308bff3 100644 --- a/tensorflow/compiler/xla/service/batchnorm_expander.h +++ b/tensorflow/compiler/xla/service/batchnorm_expander.h @@ -29,9 +29,9 @@ namespace xla { class BatchNormExpander : public HloModulePass { public: // When use_fusion is set, a multi-output fusion node is created. - BatchNormExpander(bool rewrite_training_op = false, - bool rewrite_inference_op = false, - bool rewrite_grad_op = false) + explicit BatchNormExpander(bool rewrite_training_op = false, + bool rewrite_inference_op = false, + bool rewrite_grad_op = false) : rewrite_training_op_(rewrite_training_op), rewrite_inference_op_(rewrite_inference_op), rewrite_grad_op_(rewrite_grad_op) {} diff --git a/tensorflow/compiler/xla/service/bfloat16_propagation.cc b/tensorflow/compiler/xla/service/bfloat16_propagation.cc index 4d465640f2d..6331f02aa81 100644 --- a/tensorflow/compiler/xla/service/bfloat16_propagation.cc +++ b/tensorflow/compiler/xla/service/bfloat16_propagation.cc @@ -308,6 +308,28 @@ bool BFloat16Propagation::AllUsersConsumeBF16(const HloInstruction& hlo, return true; } +namespace { + +// Returns whether we should avoid changing the precision of inst regardless of +// the producers and users. +bool ShouldKeepPrecisionUnchanged(const HloInstruction* inst) { + if (inst->opcode() == HloOpcode::kFusion && + inst->fusion_kind() == HloInstruction::FusionKind::kCustom) { + return ShouldKeepPrecisionUnchanged( + inst->fused_instructions_computation()->root_instruction()); + } + // Do not change precision for side-effecting instructions, control flow, and + // bitcast-convert, because this pass might break the interfaces or + // assumptions for them. + return inst->opcode() == HloOpcode::kCustomCall || // + inst->opcode() == HloOpcode::kCall || // + inst->opcode() == HloOpcode::kConditional || // + inst->opcode() == HloOpcode::kBitcastConvert || // + inst->HasSideEffectNoRecurse(); +} + +} // namespace + void BFloat16Propagation::DetermineInstructionPrecision(HloInstruction* hlo, bool skip_parameters) { // We handle any fusion computation or while body/condition after the @@ -354,15 +376,7 @@ void BFloat16Propagation::DetermineInstructionPrecision(HloInstruction* hlo, return; } - // Do not change precision for instructions related to entry and exit of a - // computation, side-effecting instructions, control flow, and - // bitcast-convert, because this pass might break the interfaces or - // assumptions for them. - if (hlo->opcode() == HloOpcode::kCustomCall || // - hlo->opcode() == HloOpcode::kCall || // - hlo->opcode() == HloOpcode::kConditional || // - hlo->opcode() == HloOpcode::kBitcastConvert || // - hlo->HasSideEffectNoRecurse() || // + if (ShouldKeepPrecisionUnchanged(hlo) || (hlo->opcode() == HloOpcode::kParameter && skip_parameters)) { return; } @@ -797,6 +811,39 @@ StatusOr BFloat16Propagation::Run(HloModule* module) { // Apply the changes in changes_to_bf16_. 
for (auto& change : changes_to_bf16_) { + auto inst = change.first; + // It is possible that we marked inst to change precision even if it is an + // unsupported change, when inst is the root of a fusion computation and it + // has to match the fusion node's output precision. We do a convert instead + // of in-place change for such cases. + if (ShouldKeepPrecisionUnchanged(inst)) { + auto users = inst->users(); + bool is_root = inst == inst->parent()->root_instruction(); + TF_ASSIGN_OR_RETURN( + HloInstruction * copy, + inst->parent()->DeepCopyInstructionWithCustomCopier( + inst, [&](HloInstruction* leaf, const ShapeIndex& leaf_index, + HloComputation* comp) { + if (!ContainsKey(change.second, + ShapeUtil::GetMutableSubshape( + inst->mutable_shape(), leaf_index))) { + return leaf; + } + auto converted_shape = + ShapeUtil::ChangeElementType(leaf->shape(), BF16); + UpdateLayout(&converted_shape); + return comp->AddInstruction( + HloInstruction::CreateConvert(converted_shape, leaf)); + })); + for (auto user : users) { + TF_RETURN_IF_ERROR(inst->ReplaceUseWithDifferentShape(user, copy)); + } + if (is_root) { + inst->parent()->set_root_instruction(copy, + /*accept_different_shape=*/true); + } + continue; + } for (const auto& entry : change.second) { auto subshape = entry.first; CHECK_EQ(subshape->element_type(), F32); diff --git a/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc b/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc index 86eb8cb240c..d716e62d467 100644 --- a/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc +++ b/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc @@ -422,6 +422,35 @@ TEST_F(BFloat16PropagationTest, PropagateThroughFusion) { EXPECT_TRUE(OutputsBF16(b_f1)); } +// Tests that a fusion with a bitcast-convert as its root is changed via adding +// extra convert, instead of changing the type in-place. +TEST_F(BFloat16PropagationTest, FusionWithBitcastConvertRoot) { + auto module = CreateNewVerifiedModule(); + auto builder = HloComputation::Builder(TestName()); + Shape u32_shape = ShapeUtil::MakeShape(U32, {4, 4}); + Shape f32_shape = ShapeUtil::MakeShape(F32, {4, 4}); + + HloInstruction* param = builder.AddInstruction( + HloInstruction::CreateParameter(0, u32_shape, "param")); + + auto builder_f = HloComputation::Builder("fusion"); + HloInstruction* a_f = builder_f.AddInstruction( + HloInstruction::CreateParameter(0, u32_shape, "a")); + HloInstruction* bc_f = builder_f.AddInstruction( + HloInstruction::CreateBitcastConvert(f32_shape, a_f)); + auto comp_f = module->AddEmbeddedComputation(builder_f.Build()); + auto fusion = builder.AddInstruction(HloInstruction::CreateFusion( + f32_shape, HloInstruction::FusionKind::kLoop, {param}, comp_f)); + auto dot = builder.AddInstruction(CreateDot(f32_shape, fusion, fusion)); + + auto computation = module->AddEntryComputation(builder.Build()); + EXPECT_TRUE(PropagatePrecision(module.get())); + + EXPECT_EQ(computation->root_instruction(), dot); + EXPECT_EQ(bc_f->shape(), f32_shape); + EXPECT_TRUE(OutputsBF16(bc_f)); +} + // Tests that changes to BF16 that cannot be propagated outside a fusion are // discarded. 
TEST_F(BFloat16PropagationTest, DiscardFusionInternalBF16Changes) { diff --git a/tensorflow/compiler/xla/service/buffer_assignment.cc b/tensorflow/compiler/xla/service/buffer_assignment.cc index 3ae7235d887..d72a91f45df 100644 --- a/tensorflow/compiler/xla/service/buffer_assignment.cc +++ b/tensorflow/compiler/xla/service/buffer_assignment.cc @@ -33,6 +33,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/service/hlo_alias_analysis.h" #include "tensorflow/compiler/xla/service/hlo_buffer.h" +#include "tensorflow/compiler/xla/service/hlo_live_range.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" @@ -233,8 +234,8 @@ BufferAllocation::Slice BufferAllocation::GetSlice( void BufferAllocation::AddAssignment(const HloValue& buffer, int64 offset, int64 size) { - VLOG(4) << "Adding the following buffer to allocation #" << index() << ": " - << buffer; + VLOG(4) << "Adding the following buffer to allocation #" << index() << " [" + << offset << ", " << size << "]: " << buffer; CHECK(!assigned_buffers_.contains(&buffer)) << "LogicalBuffer " << buffer << " already assigned to allocation " << index_; @@ -250,6 +251,13 @@ void BufferAllocation::AddAssignment(const HloValue& buffer, int64 offset, offset_size.offset = offset; offset_size.size = size; assigned_buffers_.emplace(&buffer, offset_size); + // For debugging purposes, store the assigned memory space in the + // instruction's layout. + HloInstruction* defining_instruction = buffer.defining_instruction(); + if (defining_instruction->shape().has_layout()) { + defining_instruction->mutable_shape()->mutable_layout()->set_memory_space( + buffer.color().value()); + } } BufferAllocationProto BufferAllocation::ToProto() const { @@ -758,14 +766,69 @@ StatusOr> BufferAssigner::Run( LogicalBuffer::AlignmentFunction color_alignment, bool allocate_buffers_for_constants, BufferAssigner::Colorer colorer, const absl::flat_hash_set& reuse_checker, - HloDataflowAnalysis::CanShareBuffer can_share_buffer) { + HloDataflowAnalysis::CanShareBuffer can_share_buffer, + std::unique_ptr preset_assignments) { BufferAssigner assigner(allocate_buffers_for_constants, std::move(colorer), - reuse_checker); + reuse_checker, std::move(preset_assignments)); return assigner.CreateAssignment( module, std::move(hlo_ordering), std::move(buffer_size), std::move(color_alignment), std::move(can_share_buffer)); } +bool BufferAssigner::LiveRangeInterferes(const HloValue* buffer1, + const HloValue* buffer2, + BufferAssignment* assignment) { + CHECK((assignment->hlo_live_range().total_order_scheduled())); + const HloLiveRange& hlo_live_range = assignment->hlo_live_range(); + + const auto& buffer_live_ranges = hlo_live_range.buffer_live_ranges(); + + CHECK(buffer_live_ranges.contains(buffer1)) + << "Buffer doesn't have a proper live range:" << buffer1; + + CHECK(buffer_live_ranges.contains(buffer2)) + << "Buffer doesn't have a proper live range:" << buffer2; + + // Check if a user value can share the same buffer as its operand. 
+ auto can_share_as_operand = [&assignment](const HloValue* user_value, + const HloValue* operand_value) { + return user_value->instruction()->IsUserOf(operand_value->instruction()) && + assignment->dataflow_analysis().CanShareOperandBufferWithUser( + operand_value->instruction(), operand_value->index(), + user_value->instruction(), user_value->index()) && + user_value->instruction()->opcode() != HloOpcode::kCopy; + }; + + auto live_range_1 = buffer_live_ranges.at(buffer1); + auto live_range_2 = buffer_live_ranges.at(buffer2); + + if (!(live_range_1.start > live_range_2.end || + live_range_2.start > live_range_1.end)) { + if (live_range_1.end == live_range_2.start) { + auto operand_value = buffer1; + auto user_value = buffer2; + if (!can_share_as_operand(user_value, operand_value)) { + return true; + } + } else if (live_range_2.end == live_range_1.start) { + auto operand_value = buffer2; + auto user_value = buffer1; + if (!can_share_as_operand(user_value, operand_value)) { + return true; + } + } else { + VLOG(4) << "Can't assign: assignee " << *buffer1 << " may interfere with " + << *buffer2; + VLOG(4) << "assigned_buffer.start: " << live_range_1.start; + VLOG(4) << "assigned_buffer.end: " << live_range_1.end; + VLOG(4) << "live_range_2.start" << live_range_2.start; + VLOG(4) << "live_range_2.end" << live_range_2.end; + return true; + } + } + return false; +} + bool BufferAssigner::MaybeAssignBuffer(BufferAllocation* allocation, const HloBuffer& hlo_buffer, BufferAssignment* assignment) { @@ -777,7 +840,7 @@ bool BufferAssigner::MaybeAssignBuffer(BufferAllocation* allocation, << " to allocation: " << *allocation; if (hlo_buffer.color() != allocation->color()) { - VLOG(4) << "Can't assign: buffer has color" << hlo_buffer.color() + VLOG(4) << "Can't assign: buffer has color " << hlo_buffer.color() << " and allocation has color " << allocation->color() << "."; return false; } @@ -833,10 +896,17 @@ bool BufferAssigner::MaybeAssignBuffer(BufferAllocation* allocation, const HloValue& assigned_buffer = *CHECK_NOTNULL(dynamic_cast(buffer_offset_size.first)); for (const HloValue* new_value : hlo_buffer.values()) { - if (assignment->hlo_ordering().MayInterfere( - assigned_buffer, *new_value, assignment->dataflow_analysis())) { + if (assignment->hlo_live_range().total_order_scheduled()) { + if (LiveRangeInterferes(new_value, &assigned_buffer, assignment)) { + return false; + } + } else if (assignment->hlo_ordering().MayInterfere( + assigned_buffer, *new_value, + assignment->dataflow_analysis())) { + // Fallback to partial order based interference detection (slower) when + // we don't have a total order scheduled module. VLOG(4) << "Can't assign: assignee " << assigned_buffer - << " may interfere with " << new_value; + << " may interfere with " << new_value->ToShortString(); return false; } @@ -847,7 +917,8 @@ bool BufferAssigner::MaybeAssignBuffer(BufferAllocation* allocation, assigned_buffer_position.instruction) && new_value->instruction()->opcode() == HloOpcode::kCopy) { VLOG(4) << "Can't assign: assignee " << assigned_buffer - << " is used at copy instruction " << new_value; + << " is used at copy instruction " + << new_value->ToShortString(); return false; } } @@ -1094,8 +1165,20 @@ Status BufferAssigner::AssignBuffersForComputations( } std::vector sorted_buffers; + // First assign the preset allocations. 
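LiveRangeInterferes above reduces interference checking to interval arithmetic once a total order is available: disjoint ranges never interfere, ranges that merely abut are fine only when the later value may reuse the earlier value's buffer, and anything else overlaps. A compact sketch of that decision with plain integers instead of HloValues (can_share_at_boundary stands in for the CanShareOperandBufferWithUser query):

```cpp
#include <cstdint>

struct LiveRange {
  int64_t start;  // position of the definition in the total order
  int64_t end;    // position of the last use
};

// Returns true if two buffers with these live ranges cannot share memory.
bool LiveRangesInterfere(const LiveRange& a, const LiveRange& b,
                         bool can_share_at_boundary) {
  if (a.end < b.start || b.end < a.start) {
    return false;  // fully disjoint: no interference
  }
  if (a.end == b.start || b.end == a.start) {
    // One value dies exactly where the other is defined; sharing is legal
    // only when the new value may reuse the dying value's buffer in place.
    return !can_share_at_boundary;
  }
  return true;  // genuine overlap
}
```

As the diff shows, this fast path is taken only when hlo_live_range().total_order_scheduled() holds; otherwise the assigner falls back to the slower ordering-based MayInterfere check.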
+ absl::flat_hash_set preset_assigned_buffers; + + TF_RETURN_IF_ERROR(AssignPresetBuffers(&preset_assigned_buffers, assignment)); + const HloAliasAnalysis& alias_analysis = assignment->alias_analysis(); + for (const HloBuffer& buffer : alias_analysis.buffers()) { + // Skip if the buffer is already assigned since it had a preset allocation. + if (preset_assigned_buffers.find(&buffer) != + preset_assigned_buffers.end()) { + VLOG(3) << "Skip allocation for buffer: " << buffer; + continue; + } TF_RET_CHECK(!buffer.values().empty()); const HloComputation* comp = buffer.values()[0]->instruction()->parent(); if (absl::c_linear_search(computations, comp)) { @@ -1124,9 +1207,12 @@ Status BufferAssigner::AssignBuffersForComputations( } } + HloSchedule schedule(&assignment->module()); + for (const HloComputation* computation : computations) { - const bool has_sequential_order = - assignment->hlo_ordering().SequentialOrder(*computation) != nullptr; + const HloInstructionSequence* instruction_sequence = + assignment->hlo_ordering().SequentialOrder(*computation); + const bool has_sequential_order = instruction_sequence != nullptr; if (has_sequential_order && buffers_to_assign_sequentially != nullptr) { // Every sequential computation must get an entry in the // buffers_to_assign_sequentially map, even if we end up with an empty @@ -1134,6 +1220,8 @@ Status BufferAssigner::AssignBuffersForComputations( // run whole-module heap simulation. buffers_to_assign_sequentially->emplace(computation, flat_hash_set()); + + schedule.set_sequence(computation, *instruction_sequence); } } @@ -1188,6 +1276,54 @@ BufferAssigner::SplitBuffersByColor( return color_map; } +Status BufferAssigner::AssignPresetBuffers( + absl::flat_hash_set* assigned_buffers, + BufferAssignment* assignment) { + if (!preset_assignments_) { + return Status::OK(); + } + + // Create an allocation for each preset color. + absl::flat_hash_map + preset_allocations; + for (auto& color_and_size : preset_assignments_->sizes()) { + LogicalBuffer::Color color(color_and_size.first); + auto inserted = preset_allocations.emplace( + color, assignment->NewEmptyAllocation(color_and_size.second, color)); + BufferAllocation* inserted_allocation = inserted.first->second; + VLOG(3) << "Created preset buffer allocation " + << inserted_allocation->index() + << ", color: " << inserted_allocation->color() + << ", size: " << inserted_allocation->size(); + } + + const HloAliasAnalysis& alias_analysis = assignment->alias_analysis(); + + for (auto& position_and_chunk : preset_assignments_->chunks()) { + const HloPosition& position = position_and_chunk.first; + const HloBuffer& buffer = + alias_analysis.GetUniqueBufferAt(position.instruction, position.index); + VLOG(3) << "Preset allocation for buffer: " << buffer; + const HeapSimulator::Chunk& chunk = position_and_chunk.second; + auto preset_allocations_iter = preset_allocations.find(buffer.color()); + CHECK(preset_allocations_iter != preset_allocations.end()) + << "No preset buffer allocation for color " << buffer.color() + << " found."; + preset_allocations_iter->second->AddAssignment(buffer.GetUniqueValue(), + chunk.offset, chunk.size); + // Ensure that there is at most one preset allocation for each buffer. + CHECK_EQ(assigned_buffers->count(&buffer), 0); + assigned_buffers->emplace(&buffer); + } + + // Upon consumption of the preset assignments, delete it so that if this + // method is called again, it does not assign the same buffers multiple times. 
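AssignPresetBuffers trusts the earlier pass to hand it chunks that do not overwrite each other (the Run comment in buffer_assignment.h makes that the caller's responsibility). If one wanted to sanity-check that invariant for the chunks of a single memory space, a sort-and-scan over offsets is enough; a hypothetical helper, sketched with plain offsets instead of HloPosition keys:

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

struct Chunk {
  int64_t offset;
  int64_t size;
};

// Returns true if no two chunks overlap: after sorting by offset, each chunk
// must end at or before the next one begins.
bool ChunksAreDisjoint(std::vector<Chunk> chunks) {
  std::sort(chunks.begin(), chunks.end(),
            [](const Chunk& a, const Chunk& b) { return a.offset < b.offset; });
  for (size_t i = 1; i < chunks.size(); ++i) {
    if (chunks[i - 1].offset + chunks[i - 1].size > chunks[i].offset) {
      return false;
    }
  }
  return true;
}
```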
+ preset_assignments_ = {}; + + return Status::OK(); +} + Status BufferAssigner::AssignBuffersWithSequentialOrdering( const flat_hash_map>& buffers_to_assign_sequentially, @@ -1393,6 +1529,21 @@ StatusOr> BufferAssigner::CreateAssignment( TF_ASSIGN_OR_RETURN(std::unique_ptr alias_analysis, HloAliasAnalysis::Run(module, can_share_buffer)); + // Set up a schedule for each computation. + HloSchedule schedule(module); + for (const HloComputation* computation : module->computations()) { + const HloInstructionSequence* instruction_sequence = + hlo_ordering->SequentialOrder(*computation); + const bool has_sequential_order = instruction_sequence != nullptr; + if (has_sequential_order) { + schedule.set_sequence(computation, *instruction_sequence); + } + } + + TF_ASSIGN_OR_RETURN(std::unique_ptr hlo_live_range, + HloLiveRange::Run(schedule, *alias_analysis, + module->entry_computation(), true)); + VLOG(1) << "Assigning buffers to module " << module->name(); XLA_VLOG_LINES(3, module->ToString()); XLA_VLOG_LINES(3, alias_analysis->ToString()); @@ -1404,7 +1555,8 @@ StatusOr> BufferAssigner::CreateAssignment( // private. std::unique_ptr assignment(new BufferAssignment( module, std::move(hlo_ordering), std::move(buffer_size), - std::move(color_alignment), std::move(alias_analysis))); + std::move(color_alignment), std::move(alias_analysis), + std::move(hlo_live_range))); TF_RETURN_IF_ERROR( colorer_(&assignment->alias_analysis(), assignment->hlo_ordering())); @@ -1432,7 +1584,7 @@ StatusOr> BufferAssigner::CreateAssignment( // module, which reduces memory usage. const bool run_whole_module_heap_simulation = buffers_to_assign_sequentially.size() == global_computations.size(); - VLOG(2) << "Running whole module heap simulation" + VLOG(2) << "Running whole module heap simulation: " << run_whole_module_heap_simulation; TF_RETURN_IF_ERROR(AssignBuffersWithSequentialOrdering( buffers_to_assign_sequentially, run_whole_module_heap_simulation, diff --git a/tensorflow/compiler/xla/service/buffer_assignment.h b/tensorflow/compiler/xla/service/buffer_assignment.h index f60ad22fa51..9caf4bee0ad 100644 --- a/tensorflow/compiler/xla/service/buffer_assignment.h +++ b/tensorflow/compiler/xla/service/buffer_assignment.h @@ -31,8 +31,10 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_live_range.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/logical_buffer.h" +#include "tensorflow/compiler/xla/service/memory_space_assignment.h" #include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" @@ -445,9 +447,11 @@ class BufferAssignment { HloAliasAnalysis& alias_analysis() const { return *alias_analysis_; } - // Returns the BufferLiveness object used to construct this assignment. const HloOrdering& hlo_ordering() const { return *hlo_ordering_; } + // Returns the HloLiveRange object used to construct this assignment. 
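CreateAssignment now builds an HloSchedule from each computation's sequential order and feeds it to HloLiveRange::Run, which is what makes the interval-based interference test possible. The underlying computation is a single pass over the total order: a value becomes live at its definition and stays live through its last use. A toy version with string names instead of HloValues (the real analysis also handles aliasing, nested computations, and values that live out of the computation, all of which this sketch ignores):

```cpp
#include <cstdint>
#include <map>
#include <string>
#include <vector>

// One scheduled instruction: the value it defines and the values it reads.
struct Scheduled {
  std::string defines;
  std::vector<std::string> uses;
};

struct LiveRange {
  int64_t start;
  int64_t end;
};

std::map<std::string, LiveRange> ComputeLiveRanges(
    const std::vector<Scheduled>& sequence) {
  std::map<std::string, LiveRange> ranges;
  for (int64_t t = 0; t < static_cast<int64_t>(sequence.size()); ++t) {
    ranges[sequence[t].defines] = {t, t};          // live from its definition
    for (const std::string& use : sequence[t].uses) {
      auto it = ranges.find(use);
      if (it != ranges.end()) it->second.end = t;  // extend to the last use
    }
  }
  return ranges;
}
```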
+ const HloLiveRange& hlo_live_range() const { return *hlo_live_range_; } + string ToString() const; BufferAssignmentProto ToProto() const; @@ -480,12 +484,14 @@ class BufferAssignment { std::unique_ptr hlo_ordering, BufferValue::SizeFunction buffer_size, LogicalBuffer::AlignmentFunction color_alignment, - std::unique_ptr alias_analysis) + std::unique_ptr alias_analysis, + std::unique_ptr hlo_live_range) : module_(module), hlo_ordering_(std::move(hlo_ordering)), buffer_size_(std::move(buffer_size)), color_alignment_(std::move(color_alignment)), - alias_analysis_(std::move(alias_analysis)) {} + alias_analysis_(std::move(alias_analysis)), + hlo_live_range_(std::move(hlo_live_range)) {} // Creates and returns a new BufferAllocation, with no assigned // LogicalBuffers. Ownership is maintained internally. @@ -545,6 +551,8 @@ class BufferAssignment { std::unique_ptr alias_analysis_; + std::unique_ptr hlo_live_range_; + Stats stats_; TF_DISALLOW_COPY_AND_ASSIGN(BufferAssignment); @@ -558,7 +566,13 @@ class BufferAssigner { static Colorer DefaultColorer() { return [](HloAliasAnalysis* alias_analysis, const HloOrdering&) { for (HloValue* value : alias_analysis->dataflow_analysis().values()) { - value->set_color(BufferValue::Color(0)); + HloInstruction* defining_instruction = value->defining_instruction(); + if (defining_instruction->shape().has_layout()) { + value->set_color(BufferValue::Color( + defining_instruction->shape().layout().memory_space())); + } else { + value->set_color(BufferValue::Color(0)); + } } return Status::OK(); }; @@ -569,7 +583,9 @@ class BufferAssigner { // Build and return a BufferAssignment for the given module. The given // HloOrdering is used to determine buffer liveness. buffer_size and // color_alignment are functions which returns the size and alignment of a - // LogicalBuffer. + // LogicalBuffer. If preset_assignments is provided, those pre-set assignment + // offsets will be used. The caller guarantees that those assignments are + // valid and they do not overwrite each other. static StatusOr> Run( const HloModule* module, std::unique_ptr hlo_ordering, BufferValue::SizeFunction buffer_size, @@ -577,14 +593,17 @@ class BufferAssigner { bool allocate_buffers_for_constants = false, Colorer colorer = DefaultColorer(), const absl::flat_hash_set& must_not_live_out = {}, - HloDataflowAnalysis::CanShareBuffer can_share_buffer = nullptr); + HloDataflowAnalysis::CanShareBuffer can_share_buffer = nullptr, + std::unique_ptr preset_assignments = {}); private: BufferAssigner(bool allocate_buffers_for_constants, Colorer colorer, - const absl::flat_hash_set& must_not_live_out) + const absl::flat_hash_set& must_not_live_out, + std::unique_ptr preset_assignments) : allocate_buffers_for_constants_(allocate_buffers_for_constants), colorer_(colorer), - must_not_live_out_(must_not_live_out) {} + must_not_live_out_(must_not_live_out), + preset_assignments_(std::move(preset_assignments)) {} virtual ~BufferAssigner() = default; // Create a buffer assignment. @@ -606,6 +625,16 @@ class BufferAssigner { buffers_to_assign_sequentially, BufferAssignment* assignment); + // Returns true if buffer's live range interferences with buffer2's. + bool LiveRangeInterferes(const HloValue* buffer1, const HloValue* buffer2, + BufferAssignment* assignment); + + // Assigns pre-set assignments, if provided. These assignments will be added + // to assigned_buffers and skip buffer allocation. 
+ Status AssignPresetBuffers( + absl::flat_hash_set* assigned_buffers, + BufferAssignment* assignment); + // Promotes operations (DUS, scatter) to be done in place: If an operation can // be done in place, merge its buffer with its operand buffer. Status MergeInplaceOpBuffers(BufferAssignment* assignment); @@ -657,6 +686,9 @@ class BufferAssigner { // A set of hlo opcodes that can't live out of a computation. absl::flat_hash_set must_not_live_out_; + // Description of any buffer offsets that are already set by an earlier pass. + std::unique_ptr preset_assignments_; + TF_DISALLOW_COPY_AND_ASSIGN(BufferAssigner); }; diff --git a/tensorflow/compiler/xla/service/buffer_assignment_test.cc b/tensorflow/compiler/xla/service/buffer_assignment_test.cc index 3bb98d5d1be..1c985485d43 100644 --- a/tensorflow/compiler/xla/service/buffer_assignment_test.cc +++ b/tensorflow/compiler/xla/service/buffer_assignment_test.cc @@ -143,6 +143,20 @@ class BufferAssignmentTest : public HloTestBase { .ConsumeValueOrDie(); } + std::unique_ptr RunBufferAssignmentWithPresetAssignments( + HloModule* module, std::unique_ptr preset_assignments, + int64 alignment = 1) { + return BufferAssigner::Run( + module, absl::make_unique(module), + backend().compiler()->BufferSizeBytesFunction(), + [alignment](LogicalBuffer::Color) { return alignment; }, + /*allocate_buffers_for_constants=*/true, + BufferAssigner::DefaultColorer(), + /*must_not_live_out=*/{}, + /*can_share_buffer=*/nullptr, std::move(preset_assignments)) + .ConsumeValueOrDie(); + } + // Builds an x+1.0 computation to use in a Map. std::unique_ptr BuildMapComputationPlus1(const string& name) { auto builder = HloComputation::Builder(name); @@ -599,6 +613,13 @@ TEST_F(BufferAssignmentTest, BasicUniquelyColored) { // The sub node has a valid output buffer assigned. GetAssignedOutputAllocation(*buffers, sub); + + // Check if the HLO instructions have the correct colors in the layout. + EXPECT_EQ(param0->shape().layout().memory_space(), 2); + EXPECT_EQ(param1->shape().layout().memory_space(), 3); + EXPECT_EQ(mul->shape().layout().memory_space(), 4); + EXPECT_EQ(add->shape().layout().memory_space(), 5); + EXPECT_EQ(sub->shape().layout().memory_space(), 6); } TEST_F(BufferAssignmentTest, BasicPartiallyColored) { @@ -666,6 +687,86 @@ TEST_F(BufferAssignmentTest, BasicPartiallyColored) { // The sub node has a valid output buffer assigned. GetAssignedOutputAllocation(*buffers, sub); + + // Check if the HLO instructions have the correct colors in the layout. + EXPECT_EQ(mul->shape().layout().memory_space(), 1); + EXPECT_EQ(add->shape().layout().memory_space(), 1); + EXPECT_EQ(sub->shape().layout().memory_space(), 0); + EXPECT_EQ(param0->shape().layout().memory_space(), 0); + EXPECT_EQ(param1->shape().layout().memory_space(), 0); +} + +TEST_F(BufferAssignmentTest, PresetAssignments) { + // paramscalar ------- (mul) -- (add) -- (sub) + // / / / + // param0[100] -------/ / / + // / / + // param1[100] --------------/--------/ + // Similar to BasicPartiallyColored, but the color is set in the layout. + // The output of the mul and the add have the color 1 and have preset + // assignments, and the other buffers have the color 0, which allows the mul + // and add to share buffers. 
+ auto builder = HloComputation::Builder(TestName()); + auto paramscalar = + builder.AddInstruction(HloInstruction::CreateParameter(0, r0f32_, "p")); + auto broadcast = builder.AddInstruction( + HloInstruction::CreateBroadcast(f32vec100_, paramscalar, {})); + auto param0 = builder.AddInstruction( + HloInstruction::CreateParameter(1, f32vec100_, "p1")); + auto param1 = builder.AddInstruction( + HloInstruction::CreateParameter(2, f32vec100_, "p2")); + Shape f32vec100_color1 = + ShapeUtil::MakeShapeWithLayout(F32, {100}, {0}, {}, 0, 1); + auto mul = builder.AddInstruction(HloInstruction::CreateBinary( + f32vec100_color1, HloOpcode::kMultiply, broadcast, param0)); + auto add = builder.AddInstruction(HloInstruction::CreateBinary( + f32vec100_color1, HloOpcode::kAdd, mul, param1)); + auto sub = builder.AddInstruction(HloInstruction::CreateBinary( + f32vec100_, HloOpcode::kSubtract, add, param1)); + auto module = CreateNewVerifiedModule(); + module->AddEntryComputation(builder.Build()); + + auto preset_assignments = absl::make_unique(); + preset_assignments->add_chunk({mul, {}}, {/*offset=*/100, /*size=*/400}); + preset_assignments->add_chunk({add, {}}, {/*offset=*/550, /*size=*/400}); + preset_assignments->add_size(/*memory_space=*/1, /*size=*/950); + + auto buffers = RunBufferAssignmentWithPresetAssignments( + module.get(), std::move(preset_assignments)); + + // Distinct input buffers were assigned for parameters. + BufferAllocation paramscalar_buffer = + GetAssignedInputAllocation(*buffers, paramscalar); + BufferAllocation param0_buffer = GetAssignedInputAllocation(*buffers, param0); + BufferAllocation param1_buffer = GetAssignedInputAllocation(*buffers, param1); + EXPECT_NE(paramscalar_buffer.index(), param0_buffer.index()); + EXPECT_NE(paramscalar_buffer.index(), param1_buffer.index()); + EXPECT_EQ(paramscalar_buffer.color(), LogicalBuffer::Color(0)); + EXPECT_NE(param0_buffer.index(), param1_buffer.index()); + EXPECT_EQ(param0_buffer.color(), LogicalBuffer::Color(0)); + + // The mul and add use the same preset buffer. Ensure it has the correct color + // and offsets. + const BufferAllocation& mul_buffer = GetTopLevelAllocation(*buffers, mul); + const BufferAllocation& add_buffer = GetTopLevelAllocation(*buffers, add); + EXPECT_EQ(mul_buffer, add_buffer); + EXPECT_NE(mul_buffer.index(), param0_buffer.index()); + EXPECT_EQ(mul_buffer.color(), LogicalBuffer::Color(1)); + + EXPECT_EQ(mul_buffer.assigned_buffers().size(), 2); + for (const auto& value_and_offsetsize : mul_buffer.assigned_buffers()) { + if (value_and_offsetsize.first->instruction() == mul) { + EXPECT_EQ(value_and_offsetsize.second.offset, 100); + EXPECT_EQ(value_and_offsetsize.second.size, 400); + } else { + EXPECT_EQ(value_and_offsetsize.first->instruction(), add); + EXPECT_EQ(value_and_offsetsize.second.offset, 550); + EXPECT_EQ(value_and_offsetsize.second.size, 400); + } + } + + // The sub node has a valid output buffer assigned. 
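A quick arithmetic check of the preset layout used in this test: the two chunks occupy [100, 500) and [550, 950), so they are disjoint and both end within the declared 950-byte memory space. A hypothetical fit check over such chunks (plain ints, not the PresetAssignments API):

```cpp
#include <cstdint>
#include <vector>

struct Chunk {
  int64_t offset;
  int64_t size;
};

// Every chunk must lie inside [0, declared_size) of its memory space.
bool ChunksFit(const std::vector<Chunk>& chunks, int64_t declared_size) {
  for (const Chunk& c : chunks) {
    if (c.offset < 0 || c.offset + c.size > declared_size) return false;
  }
  return true;
}

// For the test above: ChunksFit({{100, 400}, {550, 400}}, 950) == true.
```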
+ GetAssignedOutputAllocation(*buffers, sub); } TEST_F(BufferAssignmentTest, MultipleUsersForNode) { @@ -1482,7 +1583,7 @@ TEST_F(BufferAssignmentTest, BitcastAsOutput) { auto param = builder.AddInstruction(HloInstruction::CreateParameter( 0, ShapeUtil::MakeShape(F32, {42}), "param")); auto bitcast = builder.AddInstruction( - HloInstruction::CreateUnary(param->shape(), HloOpcode::kBitcast, param)); + HloInstruction::CreateBitcast(param->shape(), param)); auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); diff --git a/tensorflow/compiler/xla/service/cholesky_expander.cc b/tensorflow/compiler/xla/service/cholesky_expander.cc index 27b1dcca2bd..74fc15a3eed 100644 --- a/tensorflow/compiler/xla/service/cholesky_expander.cc +++ b/tensorflow/compiler/xla/service/cholesky_expander.cc @@ -46,11 +46,12 @@ namespace { // n = a.shape[-2] // l = np.zeros_like(a) // for j in xrange(n): -// row = l[..., j, :j] -// row_t = np.swapaxes(row, -1, -2) -// l[..., j, j] = np.sqrt(a[..., j, j] - np.dot(row, row_t)) -// l[..., j+1:, j] = (a[..., j+1:, j] - np.dot(l[..., j+1:, :j], row_t)) / -// l[..., j, j] +// mask = np.zeros_like(a) +// mask[i, k] == 1 when i >= k and k == j +// l_square = np.dot(l, l_t) +// temp = a - l_square +// l[..., j, j] = temp(j, j) +// l = temp / l[..., j, j) * mask + l // return l // Returns a (result, error) pair. std::pair CholeskyUnblocked( @@ -65,6 +66,11 @@ std::pair CholeskyUnblocked( /*pos=*/0, /*len=*/n_dims - 2); + auto matrix_dims = AsInt64Slice(a_shape.dimensions()) + .subspan( + /*pos=*/0, + /*len=*/n_dims); + XlaOp l = ZerosLike(a); // Construct the for loop body to iterate over rows. @@ -73,63 +79,33 @@ std::pair CholeskyUnblocked( XlaBuilder* body_builder) -> StatusOr> { std::vector row_shape_dims(major_dims.begin(), major_dims.end()); std::vector col_shape_dims(major_dims.begin(), major_dims.end()); - row_shape_dims.push_back(1); - row_shape_dims.push_back(n); - auto mask_zeros_row = - Zeros(body_builder, - ShapeUtil::MakeShape(a_shape.element_type(), row_shape_dims)); - - col_shape_dims.push_back(n); - col_shape_dims.push_back(1); - auto mask_zeros_col = - Zeros(body_builder, - ShapeUtil::MakeShape(a_shape.element_type(), col_shape_dims)); - - auto mask_range_row = - Iota(body_builder, ShapeUtil::MakeShape(S32, row_shape_dims), - /*iota_dimension=*/n_dims - 1); - auto mask_range_col = - Iota(body_builder, ShapeUtil::MakeShape(S32, col_shape_dims), - /*iota_dimension=*/n_dims - 2); auto body_a = loop_vars[0]; auto body_l = loop_vars[1]; auto seen_error = loop_vars[2]; + auto iota_row = Iota(body_builder, ShapeUtil::MakeShape(S32, matrix_dims), + n_dims - 1); + auto iota_col = Iota(body_builder, ShapeUtil::MakeShape(S32, matrix_dims), + n_dims - 2); + + auto mask_pred = Ge(iota_col, iota_row); + mask_pred = And(mask_pred, Eq(iota_row, i)); + auto mask_zeros = + Zeros(body_builder, + ShapeUtil::MakeShape(a_shape.element_type(), matrix_dims)); + // L * L.T, This matrix has of a lot of multiplying with zero + // (namely, L[:, j:] = 0) and redudant computation, but it is faster + // than slice. 
+ auto l_square = BatchDot(body_l, false, body_l, true, precision); + + // A - L*L.T + l_square = body_a - l_square; + auto l_ii = DynamicSliceInMinorDims(l_square, {i, i}, {1, 1}); + l_ii = Sqrt(l_ii); + // L = (A - L*L.T) / l_ii * mask + L + body_l = Select(mask_pred, l_square / l_ii, mask_zeros) + body_l; - // row = l[..., i, :i] - // select the whole i-th row, then mask out all columns past i-1 - auto zero = ConstantR0(body_builder, 0); - auto l_i = DynamicSliceInMinorDims(body_l, {i, zero}, {1, n}); - auto row = Select(Ge(mask_range_row, i), mask_zeros_row, l_i); - // a[..., i, i] - auto a_ii = DynamicSliceInMinorDims(body_a, {i, i}, {1, 1}); - // np.dot(row, np.swapaxes(row, -1, -2)) - auto diag_dot = BatchDot(row, false, row, true, precision); - // l[..., i, i] = np.sqrt(a[..., i, i] - np.dot(row, - // np.swapaxes(row, -1, -2))) - auto l_ii = a_ii - diag_dot; seen_error = Or(seen_error, Any(Or(Le(l_ii, ZerosLike(l_ii)), IsNan(l_ii)))); - l_ii = Sqrt(l_ii); - - // a[..., i+1:, i] - // select the whole i-th column, then mask out all rows above i+1 - auto a_0i = DynamicSliceInMinorDims(body_a, {i}, {1}); - auto a_ip1i = Select(Le(mask_range_col, i), mask_zeros_col, a_0i); - - // l[..., i+1:, i] = (a[..., i+1:, i] - np.dot(l[..., i+1:, :i], r.T)) / - // l[..., i, i] - // The columns in [i, n] are zeroed out in `row`, so we just have to - // zero out rows above i+1 after the BatchDot. np.dot(l[..., :, :i], - // r.T) - auto dot = BatchDot(body_l, false, row, true, precision); - // np.dot(l[..., i+1:, :i], r.T) - auto dot_ip1 = Select(Le(mask_range_col, i), mask_zeros_col, dot); - - body_l = - DynamicUpdateSliceInMinorDims(body_l, (a_ip1i - dot_ip1) / l_ii, {i}); - // Assign the diagonal after the rest of the column because otherwise the - // column assign will wrap around and overwrite the diagonal assign. - body_l = DynamicUpdateSliceInMinorDims(body_l, l_ii, {i, i}); return std::vector{body_a, body_l, seen_error}; }; diff --git a/tensorflow/compiler/xla/service/compiler.h b/tensorflow/compiler/xla/service/compiler.h index 631a7dd7e6a..eee2e26ec9f 100644 --- a/tensorflow/compiler/xla/service/compiler.h +++ b/tensorflow/compiler/xla/service/compiler.h @@ -151,13 +151,6 @@ class Compiler { std::unique_ptr module, se::StreamExecutor* executor, se::DeviceMemoryAllocator* device_allocator) = 0; - // Optimizes a HLO module group, a set of module which runs concurrently on - // multiple devices potentially communicating data between the modules. - virtual Status RunHloPassesOnModuleGroup( - HloModuleGroup* module_group, - absl::Span executors, - se::DeviceMemoryAllocator* device_allocator) = 0; - // Compiles the HLO module for execution on a device given by the executor, // and returns an executable object or an error status. No HLO passes are // applied to module. Generally a module should be passed through RunHloPasses @@ -172,14 +165,6 @@ class Compiler { std::unique_ptr module, se::StreamExecutor* executor, se::DeviceMemoryAllocator* device_allocator) = 0; - // Compiles a set of HLO modules that can run in parallel, potentially - // communicating data between the modules. - virtual StatusOr>> - RunBackendOnModuleGroup( - std::unique_ptr module_group, - std::vector> stream_exec, - se::DeviceMemoryAllocator* device_allocator) = 0; - // Compiles a set of HLO modules that can run in parallel, potentially // communicating data between the modules, and returns a corresponding // sequence of executable objects. 
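The CholeskyUnblocked rewrite in cholesky_expander.cc above replaces the row-slice formulation with a masked whole-matrix update: each iteration recomputes L·Lᵀ for the full (batched) matrix, forms temp = A − L·Lᵀ, takes the new diagonal entry from temp(j, j), and writes column j through a mask set where i ≥ k and k == j. That spends redundant FLOPs on multiplications by zero but avoids dynamic slices. A scalar, single-matrix sketch of the same update order (plain loops, no batching or mask machinery):

```cpp
#include <cmath>
#include <vector>

using Matrix = std::vector<std::vector<double>>;

// Unblocked Cholesky in the same update order as the rewritten expander:
// at step j, form temp = A - L * L^T (only column j is actually needed,
// since all later columns of L are still zero), read the new diagonal
// entry from temp(j, j), then fill column j at and below the diagonal.
Matrix CholeskyLower(const Matrix& a) {
  const int n = static_cast<int>(a.size());
  Matrix l(n, std::vector<double>(n, 0.0));
  for (int j = 0; j < n; ++j) {
    std::vector<double> temp_col(n, 0.0);
    for (int i = j; i < n; ++i) {
      double dot = 0.0;  // sum_k l(i, k) * l(j, k), nonzero only for k < j
      for (int k = 0; k < j; ++k) dot += l[i][k] * l[j][k];
      temp_col[i] = a[i][j] - dot;
    }
    const double l_jj = std::sqrt(temp_col[j]);  // the expander flags <= 0 or NaN here
    for (int i = j; i < n; ++i) l[i][j] = temp_col[i] / l_jj;
    // Note l[j][j] = temp_col[j] / l_jj == l_jj, matching the masked update.
  }
  return l;
}
```

The XLA version does the full L·Lᵀ BatchDot on purpose: as the comment in the diff notes, the redundant work is cheaper on the target than repeated dynamic slices, and the error tracking via seen_error is unchanged.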
diff --git a/tensorflow/compiler/xla/service/computation_layout.cc b/tensorflow/compiler/xla/service/computation_layout.cc index 92d1ca4ba5d..863fd030d35 100644 --- a/tensorflow/compiler/xla/service/computation_layout.cc +++ b/tensorflow/compiler/xla/service/computation_layout.cc @@ -20,6 +20,7 @@ limitations under the License. #include "absl/strings/str_cat.h" #include "absl/strings/str_join.h" #include "tensorflow/compiler/xla/types.h" +#include "tensorflow/core/lib/hash/hash.h" namespace xla { @@ -66,4 +67,23 @@ ProgramShape ComputationLayout::ComputeProgramShape() const { return program_shape; } +bool ComputationLayout::operator==(const ComputationLayout& other) const { + return result_layout() == other.result_layout() && + parameter_layouts() == other.parameter_layouts(); +} + +bool ComputationLayout::operator!=(const ComputationLayout& other) const { + return result_layout() != other.result_layout() || + parameter_layouts() != other.parameter_layouts(); +} + +uint64 ComputationLayout::Hash() const { + uint64 hash_value = ShapeUtil::Hash(result_layout_.shape()); + for (const auto& parameter_layout : parameter_layouts_) { + hash_value = tensorflow::Hash64Combine( + hash_value, ShapeUtil::Hash(parameter_layout.shape())); + } + return hash_value; +} + } // namespace xla diff --git a/tensorflow/compiler/xla/service/computation_layout.h b/tensorflow/compiler/xla/service/computation_layout.h index a2fb656677f..5aab1a5fd42 100644 --- a/tensorflow/compiler/xla/service/computation_layout.h +++ b/tensorflow/compiler/xla/service/computation_layout.h @@ -87,6 +87,10 @@ class ComputationLayout { // within this object. ProgramShape ComputeProgramShape() const; + bool operator==(const ComputationLayout& other) const; + bool operator!=(const ComputationLayout& other) const; + uint64 Hash() const; + private: std::vector parameter_layouts_; ShapeLayout result_layout_; diff --git a/tensorflow/compiler/xla/service/conditional_simplifier.cc b/tensorflow/compiler/xla/service/conditional_simplifier.cc index f1936035fed..985603b08e4 100644 --- a/tensorflow/compiler/xla/service/conditional_simplifier.cc +++ b/tensorflow/compiler/xla/service/conditional_simplifier.cc @@ -253,6 +253,31 @@ StatusOr TryRemoveUnusedConditionalOperands( } return true; } + +// Replaces the roots of all branches with an empty tuple if the conditional op +// has no users. Returns if anything is changed. 
+bool ReplaceRootWithEmptyTupleIfNoUsers(HloInstruction* conditional_op) { + const Shape empty_tuple = ShapeUtil::MakeTupleShape({}); + if (conditional_op->user_count() == 0 && + conditional_op != conditional_op->parent()->root_instruction() && + !ShapeUtil::Compatible(empty_tuple, conditional_op->shape())) { + for (int64 branch_id = 0; branch_id < conditional_op->branch_count(); + ++branch_id) { + auto branch_computation = + conditional_op->GetModule()->AddEmbeddedComputation( + conditional_op->branch_computation(branch_id)->Clone()); + conditional_op->set_branch_computation(branch_id, branch_computation); + auto new_empty_root = + branch_computation->AddInstruction(HloInstruction::CreateTuple({})); + branch_computation->set_root_instruction(new_empty_root, + /*accept_different_shape=*/true); + } + *conditional_op->mutable_shape() = empty_tuple; + return true; + } + return false; +} + } // namespace StatusOr ConditionalSimplifier::Run(HloModule* module) { @@ -274,6 +299,7 @@ StatusOr ConditionalSimplifier::Run(HloModule* module) { std::map> changed_computations; for (HloInstruction* conditional_op : conditional_ops) { + changed |= ReplaceRootWithEmptyTupleIfNoUsers(conditional_op); TF_ASSIGN_OR_RETURN(bool result, TryRemoveConditional(conditional_op)); if (!result) { TF_ASSIGN_OR_RETURN(result, TryRemoveUnusedConditionalOperands( diff --git a/tensorflow/compiler/xla/service/conditional_simplifier_test.cc b/tensorflow/compiler/xla/service/conditional_simplifier_test.cc index 58659156a75..d409e22463e 100644 --- a/tensorflow/compiler/xla/service/conditional_simplifier_test.cc +++ b/tensorflow/compiler/xla/service/conditional_simplifier_test.cc @@ -285,6 +285,49 @@ TEST_F(ConditionalSimplifierTest, EXPECT_TRUE(ConditionalSimplifier().Run(module.get()).ValueOrDie()); } +TEST_F(ConditionalSimplifierTest, RemoveDeadRoots) { + absl::string_view hlo_string = + R"( +HloModule RemoveDeadRoots +on_false { + t = (f32[20,40], f32[40,40]) parameter(0) + lhs = f32[20,40] get-tuple-element(t), index=0 + rhs = f32[40,40] get-tuple-element(t), index=1 + dot = f32[20,40] dot(lhs, rhs), lhs_contracting_dims={1}, rhs_contracting_dims={0} + after-all = token[] after-all() + outfeed = token[] outfeed(dot, after-all) + ROOT result = (f32[20,40]) tuple(dot) +} + +on_true { + t = (f32[20,40], f32[40,40]) parameter(0) + lhs = f32[20,40] get-tuple-element(t), index=0 + add = f32[20,40] add(lhs, lhs) + ROOT result = (f32[20,40]) tuple(add) +} + +ENTRY main { + c0_0 = f32[20,40] parameter(0) + c0_1 = f32[40,40] parameter(1) + p = pred[] parameter(2) + t = (f32[20,40], f32[40,40]) tuple(c0_0, c0_1) + conditional = (f32[20, 40]) conditional(p,t,t), false_computation=on_false, true_computation=on_true + ROOT result = () tuple() +} +)"; + auto status = ParseAndReturnUnverifiedModule(hlo_string); + TF_ASSERT_OK(status.status()); + HloVerifier v(false, false); + TF_ASSERT_OK(v.Run(status.ValueOrDie().get()).status()); + EXPECT_TRUE( + ConditionalSimplifier().Run(status.ValueOrDie().get()).ValueOrDie()); + TF_ASSERT_OK(v.Run(status.ValueOrDie().get()).status()); + HloInstruction* conditional = + FindInstruction(status.ValueOrDie().get(), "conditional"); + // The conditional root should be replaced with an empty tuple. 
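ReplaceRootWithEmptyTupleIfNoUsers above follows a clone-before-mutate pattern: each branch computation is cloned and re-attached before its root is swapped for an empty tuple, so any other caller that happens to share the original computation is unaffected, and the conditional's own shape then shrinks to (). A toy sketch of that pattern (illustrative structs, not the HloComputation API):

```cpp
#include <memory>
#include <string>
#include <vector>

// Toy stand-ins for a computation and a conditional that references branches.
struct Computation {
  std::string root;  // name of the root value; "()" means empty tuple
};

struct Conditional {
  int user_count = 0;
  std::vector<Computation*> branches;  // possibly shared with other callers
};

// `owned` keeps the clones alive so the originals stay usable elsewhere.
bool ReplaceDeadRootsWithEmptyTuple(
    Conditional* cond, std::vector<std::unique_ptr<Computation>>* owned) {
  if (cond->user_count != 0) return false;
  for (Computation*& branch : cond->branches) {
    owned->push_back(std::make_unique<Computation>(*branch));  // clone first
    branch = owned->back().get();
    branch->root = "()";  // then mutate only the private clone
  }
  return true;
}
```

The RemoveDeadRoots test exercises exactly this: the side-effecting outfeed inside the branch survives while the conditional's result type collapses to an empty tuple.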
+ EXPECT_EQ(ShapeUtil::TupleElementCount(conditional->shape()), 0); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/convolution_group_converter.cc b/tensorflow/compiler/xla/service/convolution_group_converter.cc index ff75f0f2469..20ebafcf780 100644 --- a/tensorflow/compiler/xla/service/convolution_group_converter.cc +++ b/tensorflow/compiler/xla/service/convolution_group_converter.cc @@ -355,7 +355,7 @@ Status ConvolutionVisitor::HandleConvolution(HloInstruction* convolution) { } // We want to repeat 'filter' in the 'input_feature_dim' dimension // 'group_count' times. - if (filter_expansion_) { + if (!is_cost_viable_(convolution) || filter_expansion_) { Shape reshaped_filter_shape = ShapeUtil::DeleteDimension(kernel_input_feature_dim, filter->shape()); auto reshaped_filter = diff --git a/tensorflow/compiler/xla/service/convolution_group_converter_test.cc b/tensorflow/compiler/xla/service/convolution_group_converter_test.cc index d2eea14896e..85c54d31582 100644 --- a/tensorflow/compiler/xla/service/convolution_group_converter_test.cc +++ b/tensorflow/compiler/xla/service/convolution_group_converter_test.cc @@ -49,7 +49,8 @@ ENTRY %Convolve1D1Window_0.v3 (input: f32[1,2,2], filter: f32[1,1,2]) -> f32[1,2 auto computation = module->entry_computation(); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kConvolution); - ConvolutionGroupConverter converter(nullptr, /*convert_batch_groups_only=*/ + auto cost_model = [](HloInstruction* conv) { return true; }; + ConvolutionGroupConverter converter(cost_model, /*convert_batch_groups_only=*/ false); ASSERT_TRUE(converter.Run(module.get()).ValueOrDie()); root = computation->root_instruction(); @@ -80,7 +81,8 @@ ENTRY %Convolve1D1Window_0.v3 (input: f32[1,2,4], filter: f32[1,2,2]) -> f32[1,2 auto computation = module->entry_computation(); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kConvolution); - ConvolutionGroupConverter converter(nullptr, /*convert_batch_groups_only=*/ + auto cost_model = [](HloInstruction* conv) { return true; }; + ConvolutionGroupConverter converter(cost_model, /*convert_batch_groups_only=*/ false); ASSERT_TRUE(converter.Run(module.get()).ValueOrDie()); root = computation->root_instruction(); diff --git a/tensorflow/compiler/xla/service/copy_insertion_test.cc b/tensorflow/compiler/xla/service/copy_insertion_test.cc index 6fa3161e578..f0ac579a387 100644 --- a/tensorflow/compiler/xla/service/copy_insertion_test.cc +++ b/tensorflow/compiler/xla/service/copy_insertion_test.cc @@ -235,8 +235,8 @@ TEST_F(CopyInsertionTest, BitcastParameter) { auto builder = HloComputation::Builder(TestName()); HloInstruction* x = builder.AddInstruction( HloInstruction::CreateParameter(0, ShapeUtil::MakeShape(F32, {4}), "x")); - HloInstruction* bitcast = builder.AddInstruction(HloInstruction::CreateUnary( - ShapeUtil::MakeShape(F32, {2, 2}), HloOpcode::kBitcast, x)); + HloInstruction* bitcast = builder.AddInstruction( + HloInstruction::CreateBitcast(ShapeUtil::MakeShape(F32, {2, 2}), x)); auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); @@ -258,8 +258,9 @@ TEST_F(CopyInsertionTest, BitcastConstant) { HloInstruction* constant = builder.AddInstruction(HloInstruction::CreateConstant( LiteralUtil::CreateR1({1.0, 42.0}))); - HloInstruction* bitcast = builder.AddInstruction(HloInstruction::CreateUnary( - ShapeUtil::MakeShape(F32, {2, 2}), HloOpcode::kBitcast, constant)); + HloInstruction* 
bitcast = + builder.AddInstruction(HloInstruction::CreateBitcast( + ShapeUtil::MakeShape(F32, {2, 2}), constant)); auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); @@ -279,8 +280,8 @@ TEST_F(CopyInsertionTest, BitcastTupleElementParameter) { auto builder = HloComputation::Builder(TestName()); HloInstruction* x = builder.AddInstruction( HloInstruction::CreateParameter(0, ShapeUtil::MakeShape(F32, {4}), "x")); - HloInstruction* bitcast = builder.AddInstruction(HloInstruction::CreateUnary( - ShapeUtil::MakeShape(F32, {2, 2}), HloOpcode::kBitcast, x)); + HloInstruction* bitcast = builder.AddInstruction( + HloInstruction::CreateBitcast(ShapeUtil::MakeShape(F32, {2, 2}), x)); builder.AddInstruction(HloInstruction::CreateTuple({bitcast})); auto module = CreateNewVerifiedModule(); diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD index 37baf0e36df..8a5bbc4248d 100644 --- a/tensorflow/compiler/xla/service/cpu/BUILD +++ b/tensorflow/compiler/xla/service/cpu/BUILD @@ -35,6 +35,7 @@ cc_library( srcs = ["cpu_transfer_manager.cc"], hdrs = ["cpu_transfer_manager.h"], deps = [ + ":cpu_runtime", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:shape_util", @@ -45,7 +46,6 @@ cc_library( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/service:generic_transfer_manager", "//tensorflow/compiler/xla/service:transfer_manager", - "//tensorflow/compiler/xla/service/cpu:cpu_runtime", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/stream_executor", @@ -95,8 +95,10 @@ cc_library( "//tensorflow/compiler/xla/service:hlo_casting_utils", "//tensorflow/compiler/xla/service:dump", "//tensorflow/compiler/xla/service:map_inliner", + "//tensorflow/compiler/xla/service:tree_reduction_rewriter", "//tensorflow/compiler/xla/service:hlo_get_dimension_size_rewriter", "//tensorflow/compiler/xla/service:conditional_to_select", + "//tensorflow/compiler/xla/service:slow_operation_alarm", "//tensorflow/compiler/xla/service:scatter_expander", "//tensorflow/compiler/xla/service:slice_sinker", "//tensorflow/compiler/xla:cpu_function_runtime", @@ -1012,3 +1014,19 @@ tf_cc_test( "//tensorflow/compiler/xla/tests:xla_internal_test_main", ], ) + +tf_cc_test( + name = "vectorized_reduce_with_no_vector_registers_test", + size = "small", + srcs = ["vectorized_reduce_with_no_vector_registers_test.cc"], + deps = [ + ":cpu_compiler", + ":cpu_transfer_manager", + "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + "@llvm//:core", + "@llvm//:support", + "@llvm//:target", + ], +) diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc index 9f8f74344af..e7371c79b39 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc @@ -99,8 +99,10 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/rng_expander.h" #include "tensorflow/compiler/xla/service/scatter_expander.h" #include "tensorflow/compiler/xla/service/slice_sinker.h" +#include "tensorflow/compiler/xla/service/slow_operation_alarm.h" #include "tensorflow/compiler/xla/service/sort_simplifier.h" #include "tensorflow/compiler/xla/service/transpose_folding.h" +#include "tensorflow/compiler/xla/service/tree_reduction_rewriter.h" #include "tensorflow/compiler/xla/service/triangular_solve_expander.h" #include "tensorflow/compiler/xla/service/tuple_simplifier.h" #include "tensorflow/compiler/xla/service/while_loop_constant_sinking.h" @@ -300,6 +302,7 @@ Status CpuCompiler::RunHloPassesThroughLayoutAssn( pass.AddInvariantChecker(/*layout_sensitive=*/false, /*allow_mixed_precision=*/false); + pass.AddPass(); pass.AddPass(); pass.AddPass( /*rewrite_training_op=*/true, @@ -606,6 +609,7 @@ StatusOr> CpuCompiler::RunBackend( VLOG(1) << "Compiling: " << module->name(); XLA_SCOPED_LOGGING_TIMER( absl::StrFormat("Compiling [%s] for CPU using JIT", module->name())); + auto slow_compile_alarm = SlowCompilationAlarm(); TF_RET_CHECK(stream_exec != nullptr); std::call_once(llvm_command_line_options_initialized, diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc index 476579883f3..9b79e8ca8d7 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc @@ -194,13 +194,13 @@ Status CpuExecutable::ExecuteComputeFunction( uint64 end_micros = tensorflow::Env::Default()->NowMicros(); - { - tensorflow::mutex_lock lock(mutex_); + if (run_options->execution_profile()) { const double nanoseconds = (end_micros - start_micros) * 1000.0; - execution_profile_.set_compute_time_ns(std::max(nanoseconds, 1.0)); + run_options->execution_profile()->set_compute_time_ns( + std::max(nanoseconds, 1.0)); // If hlo profiling was disabled then the cycle count is left empty. 
if (hlo_execution_profile) { - execution_profile_.set_compute_cycle_count( + run_options->execution_profile()->set_compute_cycle_count( hlo_execution_profile->total_cycles_executed( *module().entry_computation())); } @@ -268,29 +268,7 @@ StatusOr CpuExecutable::CreateResultShapedBuffer( return std::move(result_buffer); } -StatusOr CpuExecutable::ExecuteOnStream( - const ServiceExecutableRunOptions* run_options, - absl::Span arguments, - HloExecutionProfile* hlo_execution_profile) { - TF_ASSIGN_OR_RETURN( - auto result, - ExecuteAsyncOnStreamImpl(run_options, arguments, hlo_execution_profile)); - TF_RETURN_IF_ERROR(run_options->stream()->BlockHostUntilDone()); - return std::move(result); -} - StatusOr CpuExecutable::ExecuteAsyncOnStream( - const ServiceExecutableRunOptions* run_options, - absl::Span arguments) { - if (hlo_profiling_enabled()) { - return Unimplemented( - "Asynchronous execution on stream with hlo profiling is not yet " - "supported on CPU."); - } - return ExecuteAsyncOnStreamImpl(run_options, arguments, nullptr); -} - -StatusOr CpuExecutable::ExecuteAsyncOnStreamImpl( const ServiceExecutableRunOptions* run_options, absl::Span arguments, HloExecutionProfile* hlo_execution_profile) { diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.h b/tensorflow/compiler/xla/service/cpu/cpu_executable.h index 169acdeffd4..37af630a2d9 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_executable.h +++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.h @@ -55,15 +55,11 @@ class CpuExecutable : public Executable { std::unique_ptr hlo_profile_index_map); ~CpuExecutable() override {} - StatusOr ExecuteOnStream( + StatusOr ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, absl::Span arguments, HloExecutionProfile* hlo_execution_profile) override; - StatusOr ExecuteAsyncOnStream( - const ServiceExecutableRunOptions* run_options, - absl::Span arguments) override; - // This should be called after set_ir_module_string. const string& ir_module_string() const { return ir_module_string_; } @@ -86,16 +82,6 @@ class CpuExecutable : public Executable { const BufferAssignment& buffer_assignment() const { return *assignment_; } private: - // This is for sharing the code between ExecuteOnStream and - // ExecuteAsyncOnStream. - // - // Notice that it's tricky to use correctly, as the profile object (when it - // exists) must out-live the task. - StatusOr ExecuteAsyncOnStreamImpl( - const ServiceExecutableRunOptions* run_options, - absl::Span arguments, - HloExecutionProfile* hlo_execution_profile); - // Creates an array suitable for passing as the "buffer_table" argument to the // JIT compiled function pointer. 
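With ExecuteOnStream and ExecuteAsyncOnStreamImpl removed, CpuExecutable exposes only the asynchronous entry point, and synchronous execution becomes "launch, then block on the stream" at a higher layer. The same layering, sketched with std::future in place of streams (purely an analogy, not the Executable interface):

```cpp
#include <future>

// "Async" work product; stands in for a result tied to an in-flight stream.
using AsyncResult = std::future<int>;

AsyncResult ExecuteAsync() {
  return std::async(std::launch::async, [] { return 42; });  // enqueue work
}

// Synchronous execution is a thin wrapper: launch, then block until done,
// mirroring "ExecuteAsyncOnStream followed by BlockHostUntilDone".
int ExecuteSync() {
  AsyncResult result = ExecuteAsync();
  return result.get();  // blocks until the async work completes
}
```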
// diff --git a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc index 6620a9620b5..a6f960a5cb6 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc @@ -40,10 +40,11 @@ bool CanBeLoopFused(const HloInstruction& hlo) { hlo.opcode() == HloOpcode::kTranspose; } -bool IsNonComplexMatrixVectorDot(const HloInstruction* hlo) { +bool IsNonComplexNonBatchedMatrixVectorDot(const HloInstruction* hlo) { const Shape& hlo_shape = hlo->shape(); return !ShapeUtil::ElementIsComplex(hlo_shape) && - hlo->opcode() == HloOpcode::kDot && hlo_shape.dimensions_size() <= 1; + hlo->opcode() == HloOpcode::kDot && hlo_shape.dimensions_size() <= 1 && + hlo->dot_dimension_numbers().lhs_batch_dimensions_size() == 0; } bool HasExactlyOneUse(const HloInstruction& hlo_instr) { @@ -54,7 +55,7 @@ bool HasExactlyOneUse(const HloInstruction& hlo_instr) { bool CanBeOutputFused(const HloInstruction* producer, const HloInstruction* consumer) { return consumer->opcode() == HloOpcode::kAdd && - IsNonComplexMatrixVectorDot(producer) && + IsNonComplexNonBatchedMatrixVectorDot(producer) && HasExactlyOneUse(*producer) == 1; } @@ -74,10 +75,13 @@ bool CpuInstructionFusion::ShouldFuse(HloInstruction* consumer, constexpr int kFusionThresholdBytes = 16 * 1024; if (CanBeOutputFused(producer, consumer)) { + VLOG(2) << "Fusion OK: Can create output fusion."; return true; } if (CanBeOutputFusedIntoSomeOperand(producer)) { + VLOG(2) + << "Bailing because producer can be output-fused into some operand."; return false; } diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc index 1509da6f7ec..f0d7461e5e7 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc @@ -1027,10 +1027,13 @@ StatusOr IrEmitter::EmitElementalConvolution( PrimitiveType lhs_element_type = lhs->shape().element_type(); llvm::Type* lhs_llvm_type = llvm_ir::PrimitiveTypeToIrType(lhs_element_type, module_); + // Upcast the accumulator to F32 from F16 for increased precision. + llvm::Type* accumulator_type = + lhs_element_type == F16 ? 
b_.getFloatTy() : lhs_llvm_type; llvm::Value* sum_address = llvm_ir::EmitAllocaAtFunctionEntry( - lhs_llvm_type, "convolution_sum_address", &b_, + accumulator_type, "convolution_sum_address", &b_, MinimumAlignmentForPrimitiveType(lhs_element_type)); - llvm::Value* constant_zero = llvm::Constant::getNullValue(lhs_llvm_type); + llvm::Value* constant_zero = llvm::Constant::getNullValue(accumulator_type); Store(constant_zero, sum_address); llvm_ir::ForLoopNest loops(IrName(convolution, "inner"), &b_); @@ -1139,11 +1142,11 @@ StatusOr IrEmitter::EmitElementalConvolution( TF_ASSIGN_OR_RETURN(llvm::Value* const kernel_value, kernel_generator(kernel_index)); llvm::Value* product = FMul(input_value, kernel_value); - llvm::Value* sum = FAdd(Load(sum_address), product); + llvm::Value* sum = FAdd(Load(sum_address), FPCast(product, accumulator_type)); Store(sum, sum_address); SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), &b_); - return Load(sum_address); + return FPCast(Load(sum_address), lhs_llvm_type); } Status IrEmitter::HandleConvolution(HloInstruction* convolution) { @@ -1736,6 +1739,16 @@ StatusOr IrEmitter::EmitVectorizedReduce( return false; } + int vector_register_size_in_elements = + target_machine_features_.vector_register_byte_size( + *compute_function_->function()) / + ShapeUtil::ByteSizeOfPrimitiveType(reduce->shape().element_type()); + if (vector_register_size_in_elements == 0) { + // Either we don't know the vector register width for the target or the + // vector register is smaller than the size of the primitive type. + return false; + } + int vectorization_factor_in_bytes = target_machine_features_.vectorization_factor_in_bytes(); diff --git a/tensorflow/compiler/xla/service/cpu/tests/BUILD b/tensorflow/compiler/xla/service/cpu/tests/BUILD index d3e2e2bea95..19b0bb3f4dc 100644 --- a/tensorflow/compiler/xla/service/cpu/tests/BUILD +++ b/tensorflow/compiler/xla/service/cpu/tests/BUILD @@ -128,6 +128,30 @@ tf_cc_test( ], ) +tf_cc_test( + name = "tree_reduction_rewriter_test", + srcs = ["tree_reduction_rewriter_test.cc"], + deps = [ + ":cpu_codegen_test", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:xla_proto", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_module_config", + "//tensorflow/compiler/xla/service:hlo_parser", + "//tensorflow/compiler/xla/service/cpu:cpu_compiler", + "//tensorflow/compiler/xla/tests:codegen_test_base", + "//tensorflow/compiler/xla/tests:filecheck", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:llvm_irgen_test_base", + "//tensorflow/compiler/xla/tests:test_utils", + "//tensorflow/core:lib", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/stream_executor/lib", + "@com_google_absl//absl/strings", + ], +) + tf_cc_test( name = "cpu_infeed_test", srcs = ["cpu_infeed_test.cc"], diff --git a/tensorflow/compiler/xla/service/cpu/tests/tree_reduction_rewriter_test.cc b/tensorflow/compiler/xla/service/cpu/tests/tree_reduction_rewriter_test.cc new file mode 100644 index 00000000000..bcb7da0e6cf --- /dev/null +++ b/tensorflow/compiler/xla/service/cpu/tests/tree_reduction_rewriter_test.cc @@ -0,0 +1,67 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include + +#include "tensorflow/compiler/xla/service/cpu/tests/cpu_codegen_test.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_module_config.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/tests/filecheck.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/compiler/xla/tests/llvm_irgen_test_base.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/stream_executor/lib/statusor.h" + +namespace xla { +namespace cpu { + +namespace { + +class TreeReductionRewriterTest : public CpuCodegenTest {}; + +TEST_F(TreeReductionRewriterTest, SimpleRewrite) { + const char* hlo_text = R"( +HloModule SimpleReduction + +add { + acc = f32[] parameter(1) + op = f32[] parameter(0) + ROOT out = f32[] add(acc, op) +} + +ENTRY main { + input = f32[1000] parameter(0) + zero = f32[] constant(0) + ROOT out = f32[] reduce(input, zero), dimensions={0}, to_apply=add +} + )"; + + MatchOptimizedHlo(hlo_text, + R"( +; CHECK-LABEL: ENTRY %main (input: f32[1000]) -> f32[] { +; CHECK-NEXT: %input = f32[1000]{0} parameter(0) +; CHECK-NEXT: %zero = f32[] constant(0) +; CHECK-NEXT: %reduce-window = f32[32]{0} reduce-window(%input, %zero) +; CHECK-NEXT: %reduce-window.1 = f32[1]{0} reduce-window(%reduce-window, %zero), window={size=32 stride=32}, to_apply=%add +; CHECK-NEXT: ROOT %bitcast = f32[] bitcast(%reduce-window.1) + )"); +} + +} // namespace +} // namespace cpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/cpu/vectorized_reduce_with_no_vector_registers_test.cc b/tensorflow/compiler/xla/service/cpu/vectorized_reduce_with_no_vector_registers_test.cc new file mode 100644 index 00000000000..2918c886f08 --- /dev/null +++ b/tensorflow/compiler/xla/service/cpu/vectorized_reduce_with_no_vector_registers_test.cc @@ -0,0 +1,106 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "llvm/IR/Function.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Target/TargetMachine.h" +#include "tensorflow/compiler/xla/service/cpu/cpu_compiler.h" +#include "tensorflow/compiler/xla/test.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" + +namespace xla { +namespace { +class CodegenReduceOnArchWithNoVectorRegisters : public HloTestBase {}; + +StatusOr GetTargetVectorRegisterByteSize(std::string triple) { + // Unfortunately we need a lot of boilerplate to get to an + // llvm::TargetMachine. + + std::string error; + const llvm::Target* target = + llvm::TargetRegistry::lookupTarget(triple, error); + if (target == nullptr) { + return InternalError("TargetRegistry::lookupTarget failed: %s", error); + } + + llvm::LLVMContext context; + std::unique_ptr function = + absl::WrapUnique(llvm::Function::Create( + llvm::FunctionType::get(llvm::Type::getVoidTy(context), {}), + llvm::GlobalValue::ExternalLinkage, "test")); + + std::unique_ptr target_machine = + absl::WrapUnique(target->createTargetMachine( + /*TT=*/triple, /*CPU=*/"", /*Features=*/"", llvm::TargetOptions{}, + /*RM=*/llvm::None)); + cpu::LLVMTargetMachineFeatures target_machine_features(target_machine.get()); + return target_machine_features.vector_register_byte_size(*function); +} + +TEST_F(CodegenReduceOnArchWithNoVectorRegisters, Test) { + absl::string_view text = R"( +HloModule Reduce + +add { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT add = f32[] add(lhs, rhs) +} + +ENTRY main { + input = f32[1000,1000] parameter(0) + constant = f32[] constant(0) + ROOT reduce = f32[1000] reduce(input, constant), dimensions={0}, to_apply=add +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr hlo_module, + ParseAndReturnVerifiedModule(text)); + cpu::CpuCompiler cpu_compiler; + auto module_group = absl::make_unique("group"); + module_group->push_back(std::move(hlo_module)); + + // Check that the GetTargetVectorRegisterByteSize is itself working. + TF_ASSERT_OK_AND_ASSIGN(unsigned vector_register_byte_size_for_x86_64, + GetTargetVectorRegisterByteSize("x86_64-pc-linux")); + ASSERT_EQ(vector_register_byte_size_for_x86_64, 16); + + std::string triple = "i686-none-android"; + + TF_ASSERT_OK_AND_ASSIGN(unsigned vector_register_byte_size, + GetTargetVectorRegisterByteSize(triple)); + + // This test is supposed to check whether the XLA CPU vectorized reduction + // codegen works correctly for architectures that do not have vector + // registers. So first ASSERT that `triple` is actually a target with no + // vector registers, as otherwise the test isn't actually testing anything + // interesting. 
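The new bail-out in EmitVectorizedReduce, which this test exercises end to end, is simple integer arithmetic: divide the target's vector register width in bytes by the element size and give up on the vectorized path when the quotient is zero (unknown register width, or a register narrower than one element). A sketch of just that guard:

```cpp
#include <cstdint>

// Returns how many elements of `element_size_bytes` fit in one vector
// register, or 0 when the vectorized reduction path should be skipped.
int64_t VectorRegisterSizeInElements(int64_t register_size_bytes,
                                     int64_t element_size_bytes) {
  if (element_size_bytes <= 0) return 0;
  return register_size_bytes / element_size_bytes;
}

// From the test: x86_64 reports 16-byte vector registers, so 16 / 4 == 4
// f32 elements per register, while the i686-none-android target reports 0
// bytes and therefore falls back to the scalar reduction path.
```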
+ + ASSERT_EQ(vector_register_byte_size, 0); + + cpu::CpuAotCompilationOptions aot_compilation_options( + /*triple=*/triple, /*cpu_name=*/"", /*features=*/"", + /*entry_point_name=*/"main", + cpu::CpuAotCompilationOptions::RelocationModel::BigPic); + + TF_ASSERT_OK_AND_ASSIGN( + std::vector> aot_compilation_result, + cpu_compiler.CompileAheadOfTime(std::move(module_group), + aot_compilation_options)); + EXPECT_EQ(aot_compilation_result.size(), 1); +} +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/service/depthwise_convolution_converter.cc b/tensorflow/compiler/xla/service/depthwise_convolution_converter.cc new file mode 100755 index 00000000000..37a1d1346a7 --- /dev/null +++ b/tensorflow/compiler/xla/service/depthwise_convolution_converter.cc @@ -0,0 +1,215 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/depthwise_convolution_converter.h" + +#include +#include + +#include "absl/memory/memory.h" +#include "tensorflow/compiler/xla/literal.h" +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/compiler/xla/util.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/logging.h" + +namespace xla { + +namespace { + +class ConvolutionVisitor : public DfsHloVisitorWithDefault { + public: + // Default visitor action is to do nothing and return OK. + Status DefaultAction(HloInstruction* /*hlo_instruction*/) override { + return Status::OK(); + } + + Status HandleConvolution(HloInstruction* convolution) override; + + Status HandleBackwardFilterBatchGroupConvolution(HloInstruction* convolution); + + // Runs the visitor on a computation. + static bool Run(HloComputation* computation, + std::function is_cost_viable); + + // Returns whether any convolution ops were rewritten. + const bool changed() const { return changed_; } + + ~ConvolutionVisitor() override = default; + + private: + explicit ConvolutionVisitor( + HloComputation* computation, + std::function is_cost_viable) + : computation_(computation), is_cost_viable_(is_cost_viable) {} + + // Current HloComputation instance the ConvolutionVisitor is traversing. + HloComputation* computation_; + + // Whether rewrite has occurred. 
+ bool changed_ = false; + + std::function is_cost_viable_; +}; + +bool ConvolutionVisitor::Run( + HloComputation* computation, + std::function is_cost_viable) { + ConvolutionVisitor visitor(computation, is_cost_viable); + TF_CHECK_OK(computation->Accept(&visitor)); + return visitor.changed_; +} + +namespace { +Shape SwapInputOutputFeatureDims(const Shape& shape, int64 input_feature_dim, + int64 output_feature_dim) { + int64 num_dims = shape.dimensions_size(); + CHECK_GE(num_dims, 2); + Shape transformed_shape = shape; + transformed_shape.set_dimensions(input_feature_dim, + shape.dimensions(output_feature_dim)); + transformed_shape.set_dimensions(output_feature_dim, + shape.dimensions(input_feature_dim)); + return transformed_shape; +} +} // namespace + +// This function handles batch_group_counts which are relevant only for +// depthwise backprop filter convolutions. +Status ConvolutionVisitor::HandleBackwardFilterBatchGroupConvolution( + HloInstruction* convolution) { + auto dim_numbers = convolution->convolution_dimension_numbers(); + auto lhs = convolution->mutable_operand(0); + auto rhs = convolution->mutable_operand(1); + int64 batch_group_count = convolution->batch_group_count(); + + if (batch_group_count == 1) { + return Status::OK(); + } + + VLOG(2) << "Dealing with batch_group_count " << batch_group_count + << " for convolution " << convolution->ToString() << "\n"; + + int64 output_batch_dimension = dim_numbers.output_batch_dimension(); + int64 output_feature_dimension = dim_numbers.output_feature_dimension(); + + // When mapping depthwise conv backward filter to batch grouped convolution, + // tf2xla bridge needs to swap the output batch and feature dimension. Since + // we want to use grouped convolution APIs, this swap needs to be reverted. 
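Both the SwapInputOutputFeatureDims helper above and the dimension-number updates that follow are the same two-element exchange. As a standalone sketch on a plain dimension vector (illustration only; std::vector stands in for xla::Shape):

#include <cstdint>
#include <utility>
#include <vector>

// Exchange the extents of the two feature dimensions, leaving every other
// dimension untouched -- the same operation SwapInputOutputFeatureDims
// performs on a Shape.
std::vector<int64_t> SwapDims(std::vector<int64_t> dims, int input_feature_dim,
                              int output_feature_dim) {
  std::swap(dims[input_feature_dim], dims[output_feature_dim]);
  return dims;
}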
+ dim_numbers.set_output_batch_dimension(output_feature_dimension); + dim_numbers.set_output_feature_dimension(output_batch_dimension); + + if (!is_cost_viable_(convolution)) { + Shape transformed_filter_grad_shape = SwapInputOutputFeatureDims( + convolution->shape(), dim_numbers.output_batch_dimension(), + dim_numbers.output_feature_dimension()); + + int64 num_groups = convolution->batch_group_count(); + int64 input_batch_dimension = dim_numbers.input_batch_dimension(); + int64 input_batch = lhs->shape().dimensions(input_batch_dimension); + int64 input_feature_dimension = dim_numbers.input_feature_dimension(); + int64 input_feature = lhs->shape().dimensions(input_feature_dimension); + + CHECK_EQ(input_batch, num_groups) + << "Feature group count should be equal to number of input features " + "for depthwise convolution"; + + auto add = [&](std::unique_ptr inst) { + return computation_->AddInstruction(std::move(inst)); + }; + // Reshape batch_dim C -> [G, C/G] - Batch and feature dims have been + // swapped in tf2xla bridge + std::vector reshape_dims = lhs->shape().dimensions(); + reshape_dims[input_batch_dimension] = + reshape_dims[input_batch_dimension] / num_groups; + reshape_dims.insert(reshape_dims.begin() + input_batch_dimension, + num_groups); + lhs = add(HloInstruction::CreateReshape( + ShapeUtil::MakeShape(lhs->shape().element_type(), reshape_dims), lhs)); + + // Transpose G to the axis before N, For eg: [G, C/G, H, W, N ] -> [C/G, H, + // W, G, N] + std::vector transpose_dims(lhs->shape().dimensions_size()); + std::iota(transpose_dims.begin(), transpose_dims.end(), 0); + transpose_dims.erase(transpose_dims.begin() + input_batch_dimension); + transpose_dims.insert(transpose_dims.begin() + input_feature_dimension, + input_batch_dimension); + std::vector transpose_reshape_dims = + ComposePermutations(lhs->shape().dimensions(), transpose_dims); + lhs = add(HloInstruction::CreateTranspose( + ShapeUtil::MakeShape(lhs->shape().element_type(), + transpose_reshape_dims), + lhs, transpose_dims)); + + // Merge [G,N] -> [N*G] + Shape new_shape = lhs->shape(); + new_shape.DeleteDimension(input_feature_dimension); + new_shape.set_dimensions(input_feature_dimension, + input_feature * num_groups); + lhs = add(HloInstruction::CreateReshape(new_shape, lhs)); + + std::vector new_operands = {lhs, rhs}; + auto new_conv = convolution->CloneWithNewOperands( + transformed_filter_grad_shape, new_operands); + new_conv->set_feature_group_count(num_groups); + new_conv->set_batch_group_count(1); + new_conv->set_convolution_dimension_numbers(dim_numbers); + auto new_convolution = computation_->AddInstruction(std::move(new_conv)); + + // Another reshape is required since the filter grad shape as a result of + // the 'new convolution` will be [kh, kw, C_i/G = 1, C_o = C_i = G ] but the + // expected shape is [kh, kw, C_i = G, DM=1] assuming the Depth-Multiplier + // (DM) is 1 and number of input features = G as required by the depthwise + // conv semantics + auto reshape = + HloInstruction::CreateReshape(convolution->shape(), new_convolution); + TF_RETURN_IF_ERROR(computation_->ReplaceWithNewInstruction( + convolution, std::move(reshape))); + changed_ = true; + } + + return Status::OK(); +} + +Status ConvolutionVisitor::HandleConvolution(HloInstruction* convolution) { + return HandleBackwardFilterBatchGroupConvolution(convolution); +} + +} // namespace + +StatusOr DepthwiseConvolutionConverter::Run(HloModule* module) { + XLA_VLOG_LINES(2, "DepthwiseConvolutionConverter::Run(), before:\n" + + 
module->ToString()); + bool changed = false; + for (auto* comp : module->MakeNonfusionComputations()) { + if (ConvolutionVisitor::Run(comp, is_cost_viable_)) { + changed = true; + } + } + XLA_VLOG_LINES( + 2, "DepthwiseConvolutionConverter::Run(), after:\n" + module->ToString()); + return changed; +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/service/depthwise_convolution_converter.h b/tensorflow/compiler/xla/service/depthwise_convolution_converter.h new file mode 100755 index 00000000000..a71b2b0d45d --- /dev/null +++ b/tensorflow/compiler/xla/service/depthwise_convolution_converter.h @@ -0,0 +1,49 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_DEPTHWISE_CONVOLUTION_CONVERTER_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_DEPTHWISE_CONVOLUTION_CONVERTER_H_ + +#include + +#include "absl/strings/string_view.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_pass_interface.h" +#include "tensorflow/compiler/xla/status_macros.h" + +namespace xla { + +class DepthwiseConvolutionConverter : public HloModulePass { + public: + explicit DepthwiseConvolutionConverter( + std::function is_cost_viable) + : is_cost_viable_(is_cost_viable) {} + + absl::string_view name() const override { + return "depthwise-convolution-converter"; + } + + // Run convolution rewriting on the given computation. Returns whether the + // computation was changed. + StatusOr Run(HloModule* module) override; + + // Lambda containing cost model that decides whether to expand + // batch_group_count. + std::function is_cost_viable_; +}; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_DEPTHWISE_CONVOLUTION_CONVERTER_H_ diff --git a/tensorflow/compiler/xla/service/depthwise_convolution_converter_test.cc b/tensorflow/compiler/xla/service/depthwise_convolution_converter_test.cc new file mode 100755 index 00000000000..cbf748bd5c9 --- /dev/null +++ b/tensorflow/compiler/xla/service/depthwise_convolution_converter_test.cc @@ -0,0 +1,95 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/depthwise_convolution_converter.h" + +#include +#include + +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_matchers.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/test.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/compiler/xla/types.h" + +namespace xla { +namespace { + +using DepthwiseConvolutionConverterTest = HloTestBase; + +TEST_F(DepthwiseConvolutionConverterTest, + ConvertBatchGroupCountToFeatureGroupCount) { + string hlo_string = R"(HloModule Convolve1D1Window_0_module + +ENTRY %Convolve1D1Window_0.v3 (input: f32[16,19,19,512]{3,2,1,0}, filter: f32[16,19,19,512]{3,2,1,0}) -> f32[3,3,512,1]{3,2,1,0} { + %input = f32[16,19,19,512]{3,2,1,0} parameter(0) + %filter = f32[16,19,19,512]{3,2,1,0} parameter(1) + ROOT %convolution = f32[3,3,512,1]{3,2,1,0} convolution(f32[16,19,19,512]{3,2,1,0} %input, f32[16,19,19,512]{3,2,1,0} %filter), window={size=19x19 pad=1_1x1_1}, dim_labels=f01b_i01o->01fb, batch_group_count=512 + })"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(hlo_string)); + + auto computation = module->entry_computation(); + HloInstruction* root = computation->root_instruction(); + auto batch_group_count = root->batch_group_count(); + EXPECT_EQ(root->opcode(), HloOpcode::kConvolution); + auto conv_dim_num = root->convolution_dimension_numbers(); + int64 out_batch_dim = conv_dim_num.output_batch_dimension(); + int64 out_feature_dim = conv_dim_num.output_feature_dimension(); + auto cost_model = [](HloInstruction*) { return false; }; + DepthwiseConvolutionConverter converter(cost_model); + ASSERT_TRUE(converter.Run(module.get()).ValueOrDie()); + root = computation->root_instruction(); + // Verify that the convolution is replaced by a reshape. 
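  // As a worked example of the shapes involved (restating the comments in
  // HandleBackwardFilterBatchGroupConvolution): for the module above,
  // G = batch_group_count = 512, kh = kw = 3, and depth multiplier DM = 1.
  // The original convolution produces the filter gradient directly as
  // f32[3,3,512,1] = [kh, kw, C_i = G, DM]. The rewritten convolution with
  // feature_group_count = 512 instead yields
  // f32[3,3,1,512] = [kh, kw, C_i/G = 1, C_o = G], which is why the pass
  // wraps it in a final reshape back to f32[3,3,512,1] -- the reshape this
  // test expects at the root.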
+ EXPECT_EQ(root->opcode(), HloOpcode::kReshape) + << HloOpcodeString(root->opcode()) << " vs Reshape"; + + // Verify that the operand to the reshape is the new convolution + // with feature_group_count = batch_group_count + auto new_conv = root->operand(0); + EXPECT_EQ(new_conv->opcode(), HloOpcode::kConvolution) + << HloOpcodeString(new_conv->opcode()) << " vs Convolution"; + EXPECT_EQ(new_conv->feature_group_count(), batch_group_count); + // Verify that the output_batch_dim and output_feature_dim + // have been swapped back (tf2xla swaps these dimensions to make use + // of batch_group convolution for computing filter grad for depthwise + // convolutions) + EXPECT_EQ(new_conv->convolution_dimension_numbers().output_batch_dimension(), + out_feature_dim); + EXPECT_EQ( + new_conv->convolution_dimension_numbers().output_feature_dimension(), + out_batch_dim); + + // Verify that the operand to conv is a reshape + auto reshape_1 = new_conv->operand(0); + EXPECT_EQ(reshape_1->opcode(), HloOpcode::kReshape) + << HloOpcodeString(reshape_1->opcode()) << " vs Reshape"; + + // Verify that the operand to reshape_1 is transpose + auto transpose = reshape_1->operand(0); + EXPECT_EQ(transpose->opcode(), HloOpcode::kTranspose) + << HloOpcodeString(transpose->opcode()) << " vs Transpose"; + + // Verify that the operand to transpose is reshape + auto reshape_2 = transpose->operand(0); + EXPECT_EQ(reshape_2->opcode(), HloOpcode::kReshape) + << HloOpcodeString(reshape_2->opcode()) << " vs Reshape"; +} + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h index 1341535aad4..94a99c77a5a 100644 --- a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h +++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h @@ -300,7 +300,18 @@ class DfsHloVisitorBase { // Useful when we want to visit the same computation more than once with the // same visitor. - void ResetVisitStates() { visit_state_.clear(); } + void ResetVisitStates() { + // Clear the map, but don't resize the capacity across uses -- Calculating + // and reserving space could be expensive, and we always use the same + // module->instruction_count() as the capacity. + visit_state_.erase(visit_state_.begin(), visit_state_.end()); + } + + // Useful when we want to free up the memory used by the visit state without + // destroying the actual visitor subclass. 
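The distinction between the two helpers -- ResetVisitStates() above and the DestroyVisitState() added just below -- comes down to whether the absl::flat_hash_map keeps its backing storage. A small standalone sketch of that behaviour (illustration only):

#include <cstdio>

#include "absl/container/flat_hash_map.h"

int main() {
  absl::flat_hash_map<int, int> visit_state;
  for (int i = 0; i < 1024; ++i) visit_state[i] = i;

  // Like ResetVisitStates(): drop the entries but keep the allocated table,
  // so the next traversal reuses it without growing from scratch.
  visit_state.erase(visit_state.begin(), visit_state.end());
  std::printf("after erase: size=%zu capacity=%zu\n", visit_state.size(),
              visit_state.capacity());

  // Like DestroyVisitState(): swap in a freshly constructed map, releasing
  // the memory entirely.
  visit_state = absl::flat_hash_map<int, int>{};
  std::printf("after reassign: size=%zu capacity=%zu\n", visit_state.size(),
              visit_state.capacity());
}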
+ void DestroyVisitState() { + visit_state_ = absl::flat_hash_map{}; + } void SetVisitState(int id, VisitState state) { visit_state_[id] = state; } diff --git a/tensorflow/compiler/xla/service/dump.cc b/tensorflow/compiler/xla/service/dump.cc index 6a4837211e8..331c935bdc9 100644 --- a/tensorflow/compiler/xla/service/dump.cc +++ b/tensorflow/compiler/xla/service/dump.cc @@ -136,10 +136,6 @@ struct CanonicalDebugOptions { bool dump_snapshots; }; -string FilenameFor(const HloModule& module, string_view suffix) { - return StrFormat("module_%04d.%s", module.unique_id(), suffix); -} - void DumpToFileInDirImpl(string_view filename, string_view contents, const CanonicalDebugOptions& opts) { if (opts.dumping_to_stdout()) { @@ -263,6 +259,10 @@ static auto& module_id_to_step_number GUARDED_BY(mu) = } // namespace +string FilenameFor(const HloModule& module, string_view suffix) { + return StrFormat("module_%04d.%s", module.unique_id(), suffix); +} + void DumpToFileInDir(const HloModule& module, string_view suffix, string_view contents) { DumpToFileInDirImpl(FilenameFor(module, suffix), contents, diff --git a/tensorflow/compiler/xla/service/dump.h b/tensorflow/compiler/xla/service/dump.h index 6edc9b28dde..d245ad582c4 100644 --- a/tensorflow/compiler/xla/service/dump.h +++ b/tensorflow/compiler/xla/service/dump.h @@ -33,6 +33,9 @@ class BufferAssignment; class HloExecutionProfile; class HloSnapshot; +// Create the filename we will use to dump in DumpToFileInDir. +string FilenameFor(const HloModule& module, absl::string_view suffix); + // Writes the given string to a file in the xla_dump_to directory specified by // module's DebugOptions. // diff --git a/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc b/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc index 3925eeb7f62..1f7d41c7b94 100644 --- a/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc +++ b/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc @@ -17,8 +17,10 @@ limitations under the License. 
#include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" +#include "tensorflow/compiler/xla/service/hlo_casting_utils.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/while_util.h" #include "tensorflow/compiler/xla/window_util.h" @@ -53,6 +55,8 @@ class DynamicDimensionInferenceVisitor : public DfsHloVisitorWithDefault { Status HandleReshape(HloInstruction* hlo) override; + Status HandleSort(HloInstruction* hlo) override; + Status HandlePad(HloInstruction* hlo) override; Status HandleBroadcast(HloInstruction* hlo) override; @@ -161,6 +165,29 @@ Status DynamicDimensionInferenceVisitor::HandleBroadcast(HloInstruction* hlo) { }); } +Status DynamicDimensionInferenceVisitor::HandleSort(HloInstruction* hlo) { + return ForEachOperandDynamicDimension( + hlo, [&](HloInstruction* operand, ShapeIndex index, + int64 dynamic_dimension, int64 operand_index, + HloInstruction* dynamic_size, DimensionConstraint constraint) { + HloSortInstruction* sort = Cast(hlo); + int64 sort_dimension = sort->sort_dimension(); + if (sort_dimension == dynamic_dimension) { + return Unimplemented( + "Dynamic dimension on sorting dimension is not supported"); + } + if (sort->values_count() == 0) { + parent_->SetDynamicSize(hlo, {}, dynamic_dimension, dynamic_size, + constraint); + } else { + parent_->SetDynamicSize(hlo, {operand_index}, dynamic_dimension, + dynamic_size, constraint); + } + + return Status::OK(); + }); +} + Status DynamicDimensionInferenceVisitor::HandlePad(HloInstruction* hlo) { return ForEachOperandDynamicDimension( hlo, [&](HloInstruction* operand, ShapeIndex index, int64 dimension, diff --git a/tensorflow/compiler/xla/service/dynamic_dimension_inference_test.cc b/tensorflow/compiler/xla/service/dynamic_dimension_inference_test.cc index 5821e89612b..7a13307ffbf 100644 --- a/tensorflow/compiler/xla/service/dynamic_dimension_inference_test.cc +++ b/tensorflow/compiler/xla/service/dynamic_dimension_inference_test.cc @@ -912,6 +912,78 @@ TEST_F(DynamicDimensionInferenceTest, DynamicSliceTest) { EXPECT_EQ(inference_->GetDynamicSize(slice, {}, 0), size_param); } +TEST_F(DynamicDimensionInferenceTest, SortTest) { + auto builder = HloComputation::Builder(TestName()); + + auto data_param = builder.AddInstruction(HloInstruction::CreateParameter( + 0, ShapeUtil::MakeShape(F32, {5, 7}), "data_param")); + auto size_param = builder.AddInstruction( + HloInstruction::CreateParameter(1, scalar_shape_, "size_param")); + + auto compare_builder = HloComputation::Builder("condition"); + compare_builder.AddInstruction(HloInstruction::CreateParameter( + 0, ShapeUtil::MakeShape(F32, {}), "param1")); + compare_builder.AddInstruction(HloInstruction::CreateParameter( + 1, ShapeUtil::MakeShape(F32, {}), "param2")); + compare_builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); + HloComputation* compare = + module_->AddEmbeddedComputation(compare_builder.Build()); + + auto* sort = builder.AddInstruction(HloInstruction::CreateSort( + ShapeUtil::MakeShape(F32, {5, 7}), 1, {data_param}, compare, + /*is_stable=*/false)); + + module_->AddEntryComputation(builder.Build()); + // Set up dynamic parameter binding. 
+ TF_CHECK_OK(module_->dynamic_parameter_binding().Bind( + DynamicParameterBinding::DynamicParameter{1, {}}, + DynamicParameterBinding::DynamicDimension{0, {}, 0})); + + TF_ASSERT_OK(RunInference()); + EXPECT_EQ(inference_->GetDynamicSize(sort, {}, 0), size_param); +} + +TEST_F(DynamicDimensionInferenceTest, MultiValueSortTest) { + auto builder = HloComputation::Builder(TestName()); + + auto shape = ShapeUtil::MakeShape(F32, {5, 7}); + + auto data_param = builder.AddInstruction( + HloInstruction::CreateParameter(0, shape, "data_param")); + auto size_param = builder.AddInstruction( + HloInstruction::CreateParameter(1, scalar_shape_, "size_param")); + + auto compare_builder = HloComputation::Builder("condition"); + compare_builder.AddInstruction(HloInstruction::CreateParameter( + 0, ShapeUtil::MakeShape(F32, {}), "param1")); + compare_builder.AddInstruction(HloInstruction::CreateParameter( + 1, ShapeUtil::MakeShape(F32, {}), "param2")); + compare_builder.AddInstruction(HloInstruction::CreateParameter( + 2, ShapeUtil::MakeShape(F32, {}), "param3")); + compare_builder.AddInstruction(HloInstruction::CreateParameter( + 3, ShapeUtil::MakeShape(F32, {}), "param4")); + compare_builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); + HloComputation* compare = + module_->AddEmbeddedComputation(compare_builder.Build()); + + auto* sort = builder.AddInstruction( + HloInstruction::CreateSort(ShapeUtil::MakeTupleShape({shape, shape}), 1, + {data_param, data_param}, compare, + /*is_stable=*/false)); + + module_->AddEntryComputation(builder.Build()); + // Set up dynamic parameter binding. + TF_CHECK_OK(module_->dynamic_parameter_binding().Bind( + DynamicParameterBinding::DynamicParameter{1, {}}, + DynamicParameterBinding::DynamicDimension{0, {}, 0})); + + TF_ASSERT_OK(RunInference()); + EXPECT_EQ(inference_->GetDynamicSize(sort, {0}, 0), size_param); + EXPECT_EQ(inference_->GetDynamicSize(sort, {1}, 0), size_param); +} + TEST_F(DynamicDimensionInferenceTest, DynamicSliceSingleElementTest) { // Slicing out a single element from a dynamic dimension terminates the // dynamic dimension. 
diff --git a/tensorflow/compiler/xla/service/dynamic_padder.cc b/tensorflow/compiler/xla/service/dynamic_padder.cc index 4eed3b8a560..5fea5d823de 100644 --- a/tensorflow/compiler/xla/service/dynamic_padder.cc +++ b/tensorflow/compiler/xla/service/dynamic_padder.cc @@ -90,6 +90,7 @@ StatusOr ChooseIdentityValue(HloInstruction* inst, case HloOpcode::kAllReduce: case HloOpcode::kBroadcast: case HloOpcode::kTranspose: + case HloOpcode::kSort: case HloOpcode::kSlice: return nullptr; default: diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc index 48559bf5fc3..63d7f3b1c0d 100644 --- a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc @@ -515,15 +515,14 @@ StatusOr ElementalIrEmitter::EmitComplexUnaryOp( : input_type; switch (op->opcode()) { case HloOpcode::kLog: { - // log(a+bi) = .5*log(a^2+b^2) + i*atan2(b, a) + // log(a+bi) = log(abs(a+bi)) + i*atan2(b,a) auto a = EmitExtractReal(operand_value); auto b = EmitExtractImag(operand_value); - llvm::Type* llvm_ty = a->getType(); - auto sum_sq = FAdd(FMul(a, a), FMul(b, b)); - TF_ASSIGN_OR_RETURN(auto log_sum_sq, EmitLog(component_type, sum_sq)); - TF_ASSIGN_OR_RETURN(auto angle, EmitAtan2(component_type, b, a)); - auto one_half = llvm::ConstantFP::get(llvm_ty, 0.5); - return EmitComposeComplex(op, FMul(one_half, log_sum_sq), angle); + TF_ASSIGN_OR_RETURN(llvm::Value * angle, EmitAtan2(component_type, b, a)); + TF_ASSIGN_OR_RETURN(llvm::Value * abs, + EmitComplexAbs(component_type, operand_value)); + TF_ASSIGN_OR_RETURN(llvm::Value * log_abs, EmitLog(component_type, abs)); + return EmitComposeComplex(op, log_abs, angle); } case HloOpcode::kLog1p: { // log1p(a+bi) = .5*log((a+1)^2+b^2) + i*atan2(b, a + 1) @@ -639,32 +638,128 @@ StatusOr ElementalIrEmitter::EmitComplexUnaryOp( =(cos(b)^2(e^(2a)-e^(-2a)) + sin(b)^2(e^(2a)-e^(-2a)) + i*(cos(b)sin(b)(e^a+e^-a)^2 - cos(b)sin(b)(e^a-e^-a)^2)) / ((cos(b)(e^a+e^-a))^2 + (sin(b)(e^a-e^-a))^2) + =(e^(2a)-e^(-2a) + + i*[cos(b)sin(b)(e^(2a)+2+e^(-2a))-cos(b)sin(b)(e^(2a)-2+e^(2a)))] + / (cos(b)^2*(e^(2a)+2+e^(-2a)) + sin(b)^2*(e^(2a)-2+e^(2a)) + =(e^(2a)-e^(-2a) + + i*cos(b)sin(b)*[e^(2a)+2+e^(-2a)-e^(2a)+2-e^(-2a)]) / + ([cos(b)^2 + sin(b)^2][e^(2a)+e^(-2a)])+2*[cos(b)^2 - sin(b)^2]) + =(e^(2a)-e^(-2a) + i*cos(b)sin(b)*4) / + (e^(2a)+e^(-2a)+2*[cos(b)^2 - sin(b)^2]) + =(e^(2a)-e^(-2a) + i*[sin(2b)/2]*4) / + (e^(2a)+e^(-2a)+2*[cos(2b)]) + =(e^(2a)-e^(-2a) + i*2*sin(2b)) / (e^(2a) + e^(-2a) + 2*cos(2b)) */ - auto a = EmitExtractReal(operand_value); - auto b = EmitExtractImag(operand_value); - TF_ASSIGN_OR_RETURN(auto exp_a, EmitExp(component_type, a)); - TF_ASSIGN_OR_RETURN(auto cos_b, EmitCos(component_type, b)); - TF_ASSIGN_OR_RETURN(auto sin_b, EmitSin(component_type, b)); - auto exp_neg_a = FDiv(llvm::ConstantFP::get(exp_a->getType(), 1), exp_a); - auto exp_2a_minus_exp_neg_2a = - FSub(FMul(exp_a, exp_a), FMul(exp_neg_a, exp_neg_a)); - auto cos_b_sq = FMul(cos_b, cos_b); - auto sin_b_sq = FMul(sin_b, sin_b); - auto real_num = FAdd(FMul(cos_b_sq, exp_2a_minus_exp_neg_2a), - FMul(sin_b_sq, exp_2a_minus_exp_neg_2a)); - auto cos_b_sin_b = FMul(cos_b, sin_b); - auto exp_a_plus_exp_neg_a = FAdd(exp_a, exp_neg_a); - auto exp_a_plus_exp_neg_a_sq = - FMul(exp_a_plus_exp_neg_a, exp_a_plus_exp_neg_a); - auto exp_a_minus_exp_neg_a = FSub(exp_a, exp_neg_a); - auto exp_a_minus_exp_neg_a_sq = - FMul(exp_a_minus_exp_neg_a, exp_a_minus_exp_neg_a); - auto imag_num = FMul( - 
cos_b_sin_b, FSub(exp_a_plus_exp_neg_a_sq, exp_a_minus_exp_neg_a_sq)); - auto denom = FAdd(FMul(cos_b_sq, exp_a_plus_exp_neg_a_sq), - FMul(sin_b_sq, exp_a_minus_exp_neg_a_sq)); - return EmitComposeComplex(op, FDiv(real_num, denom), - FDiv(imag_num, denom)); + llvm::Value* a = EmitExtractReal(operand_value); + llvm::Value* b = EmitExtractImag(operand_value); + + llvm::Type* type = a->getType(); + + llvm::Value* neg_one = llvm::ConstantFP::get(type, -1.F); + llvm::Value* two_a = FAdd(a, a); + llvm::Value* neg_2a = FMul(neg_one, two_a); + + // When we are calculating the real numerator, e^(2a)-e^(-2a), for small + // values of `a`, we will get a ULP of 2^-23 using the exp function. Using + // expm1 to calculate e^(2a)-e^(-2a) = [e^(2a)-1] - [e^(-2a)-1] allows our + // ULP to be arbitrarily small. For larger values of `a`, calculating the + // numerator as Exp(2a)-Exp(-2a) vs Expm1(2a)-Expm1(-2a) return virtually + // identical results. + TF_ASSIGN_OR_RETURN(llvm::Value * exp_2a_m1, + EmitExpm1(component_type, two_a)); + TF_ASSIGN_OR_RETURN(llvm::Value * exp_neg_2a_m1, + EmitExpm1(component_type, neg_2a)); + llvm::Value* real_numerator = FSub(exp_2a_m1, exp_neg_2a_m1); + + // We can use the identity cos(2b)+1 = cos(b)^2-sin(b)^2+cos(b)^2+sin(b)^2 + // = 2cos(b)^2. This gives us the ability to be more precise when the + // denominator is close to zero. + TF_ASSIGN_OR_RETURN(llvm::Value * cos_b, EmitCos(component_type, b)); + llvm::Value* four = llvm::ConstantFP::get(type, 4.F); + llvm::Value* cos_b_sq = FMul(cos_b, cos_b); + llvm::Value* two_cos_2b_p2 = FMul(cos_b_sq, four); + + // Similarly we can compute sin(2b) with the formula sin(2b) = + // 2*sin(b)*cos(b). + TF_ASSIGN_OR_RETURN(llvm::Value * sin_b, EmitSin(component_type, b)); + llvm::Value* imag_numerator = FMul(four, FMul(cos_b, sin_b)); + + // Expm1(x) is about x for small values of x, but exp_sum_m2 is about x^2 + // for small value of x. As a result, due to floating point precission + // issues, x^2 is a better approximation than Expm1(x) + Expm1(x) for + // small values of x. + llvm::Value* a_sqr = FMul(a, a); + llvm::Value* use_approx_cutoff = llvm::ConstantFP::get(type, 1e-8); + llvm::Value* use_approx = FCmpOLT(a_sqr, use_approx_cutoff); + + llvm::Value* exp_sum_m2 = + Select(use_approx, a_sqr, FAdd(exp_2a_m1, exp_neg_2a_m1)); + llvm::Value* denom = FAdd(exp_sum_m2, two_cos_2b_p2); + + // As `a` grows toward +inf and -inf, the real numerator will grow towards + // +inf and -inf respectively, while the denominator will always grow + // towards +inf. The result is real_numerator/denom = NaN, when it should + // equal +1 and -1 respectively. Therefore, if our denominator is +inf, + // we just hardcode the limits for the real numbers. + llvm::Value* inf = llvm::ConstantFP::getInfinity(type); + llvm::Value* is_inf = FCmpOEQ(exp_sum_m2, inf); + llvm::Value* real_limit = llvm_ir::EmitCallToIntrinsic( + llvm::Intrinsic::copysign, {neg_one, a}, {type}, b_); + + llvm::Value* real = + Select(is_inf, real_limit, FDiv(real_numerator, denom)); + llvm::Value* imag = FDiv(imag_numerator, denom); + + // The complex tanh functions have a few corner cases: + // 1. (+0, +0) => (+0, +0) - Handled normally + // 2. (x, +Inf) => (NaN, NaN) - See below + // 3. (x, NaN) => (NaN, NaN) - See below + // 4. (+inf, y) => (1, +0) - Handled normally + // 5. (+Inf, +Inf) => (1, +/-0) - See below + // 6. (+Inf, NaN) => (1, +/-0) - See below + // 7. (NaN, +0) => (NaN, +0) - See below + // 8. (NaN, y) => (NaN, NaN) - Handled normally + // 9. 
(NaN, NaN) => (NaN, NaN) - Handled normally + // + // For the cases that aren't handled normally: + // 2/3) Part of the calculation we do is that if exp(a) + exp(-a) = +inf, + // then we return (+/-1, +/-0). However, this is only true if we + // assume that a is infinity or b is finite. In the event that both a + // is finite and b is either +/-Inf or NaN, then our normal + // calculation would end up returing (+/-1, NaN), as opposed to (NaN, + // NaN). + // 5/6) We always calculate the imagninary value as sin(2b)/denominator. + // When the denominator is infinity, this assures us that the zero is + // the correct sign. However if our imaginary input results in + // sin(2b) = NaN, we calculate our imaginary result as NaN. + // 7) In the event that a is NaN, the denominator will be NaN. + // Therefore, the normal calculation gives (NaN, NaN) while we need + // (NaN, +0). + if (!(b_->getFastMathFlags().noNaNs() && + b_->getFastMathFlags().noInfs())) { + llvm::Value* abs_a = llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::fabs, + {a}, {type}, b_); + llvm::Value* zero = llvm::ConstantFP::get(type, 0.F); + llvm::Value* nan = llvm::ConstantFP::getNaN(type); + + llvm::Value* a_is_inf = FCmpOEQ(abs_a, inf); + llvm::Value* b_is_zero = FCmpOEQ(b, zero); + + // imag_numerator = 2sin(2b), so sin(2b) is NaN if and only if + // imag_numerator is NaN. + llvm::Value* sin_2b_is_nan = + b_->CreateFCmpUNO(imag_numerator, imag_numerator); + + llvm::Value* real_is_nan = + b_->CreateAnd(sin_2b_is_nan, b_->CreateNot(a_is_inf)); + llvm::Value* imag_is_zero = + b_->CreateOr(b_is_zero, b_->CreateAnd(a_is_inf, sin_2b_is_nan)); + + real = Select(real_is_nan, nan, real); + imag = Select(imag_is_zero, zero, imag); + } + + return EmitComposeComplex(op, real, imag); } case HloOpcode::kAbs: { return EmitComplexAbs(component_type, operand_value); @@ -681,18 +776,10 @@ StatusOr ElementalIrEmitter::EmitComplexUnaryOp( FDiv(EmitExtractImag(operand_value), cplx_abs))); } case HloOpcode::kSqrt: { - auto a = EmitExtractReal(operand_value); - auto b = EmitExtractImag(operand_value); - auto c = llvm::ConstantFP::get(a->getType(), 0.5); - auto d = llvm::ConstantFP::get(b->getType(), 0.0); - return EmitComplexPower(op, a, b, c, d); + return EmitComplexSqrt(op, component_type, operand_value); } case HloOpcode::kRsqrt: { - auto a = EmitExtractReal(operand_value); - auto b = EmitExtractImag(operand_value); - auto c = llvm::ConstantFP::get(a->getType(), -0.5); - auto d = llvm::ConstantFP::get(b->getType(), 0.0); - return EmitComplexPower(op, a, b, c, d); + return EmitComplexRsqrt(op, component_type, operand_value); } case HloOpcode::kNegate: return EmitComposeComplex(op, FNeg(EmitExtractReal(operand_value)), @@ -783,25 +870,209 @@ StatusOr ElementalIrEmitter::EmitFloatBinaryOp( // Using sqrt(a^2 + b^2) can cause overflow errors. Therefore we can use // sqrt(a^2 + b^2) = sqrt(a^2 * (1 + b^2/a^2)) // = |a| * sqrt(1 + (b/a)^2) -// With the assumption that |a| >= |b| +// With the assumption that |a| >= |b|. +// +// This method returns the min, max, and sqrt term for this calculation. This is +// done to prevent potential overflow errors that can occur from multiplying the +// max with the sqrt term. (i.e. when calculating the sqrt of the absolute +// value, we can take the sqrt of the max and the sqrt term before multiplying +// them together.) If return_sqrt is false, it returns 1 + (b/a)^2 instead of +// sqrt(1 + (b/a)^2). 
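The identity in use here, written out once: sqrt(a^2 + b^2) = |a| * sqrt(1 + (b/a)^2) when |a| >= |b|, so the only squared quantity is (b/a)^2 <= 1 and the intermediate never overflows even when a^2 + b^2 would. A scalar C++ sketch of the same computation (illustration only, not the IR emitter itself):

#include <algorithm>
#include <cmath>

float ComplexAbsNoOverflow(float a, float b) {
  const float abs_a = std::fabs(a);
  const float abs_b = std::fabs(b);
  const float max = std::max(abs_a, abs_b);
  const float min = std::min(abs_a, abs_b);
  if (max == 0.0f) return 0.0f;    // both components zero: avoid 0/0
  const float ratio = min / max;   // <= 1 by construction
  return max * std::sqrt(1.0f + ratio * ratio);
}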
+StatusOr> +ElementalIrEmitter::EmitComplexAbsHelper(PrimitiveType prim_type, + llvm::Value* operand_value, + bool return_sqrt) { + llvm::Value* real = EmitExtractReal(operand_value); + llvm::Value* imag = EmitExtractImag(operand_value); + llvm::Value* abs_real = llvm_ir::EmitCallToIntrinsic( + llvm::Intrinsic::fabs, {real}, {real->getType()}, b_); + llvm::Value* abs_imag = llvm_ir::EmitCallToIntrinsic( + llvm::Intrinsic::fabs, {imag}, {imag->getType()}, b_); + llvm::Value* max = EmitFloatMax(abs_real, abs_imag); + llvm::Value* min = EmitFloatMin(abs_real, abs_imag); + + llvm::Value* div = FDiv(min, max); + llvm::Value* div_sq = FMul(div, div); + llvm::Value* one = llvm::ConstantFP::get(max->getType(), 1); + llvm::Value* one_p_div_sq = FAdd(one, div_sq); + TF_ASSIGN_OR_RETURN(llvm::Value * sqrt, EmitSqrt(prim_type, one_p_div_sq)); + return std::make_tuple(min, max, return_sqrt ? sqrt : one_p_div_sq); +} + StatusOr ElementalIrEmitter::EmitComplexAbs( PrimitiveType prim_type, llvm::Value* operand_value) { - auto real = EmitExtractReal(operand_value); - auto imag = EmitExtractImag(operand_value); - auto abs_real = llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::fabs, {real}, - {real->getType()}, b_); - auto abs_imag = llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::fabs, {imag}, - {imag->getType()}, b_); - auto max = EmitFloatMax(abs_real, abs_imag); - auto min = EmitFloatMin(abs_real, abs_imag); + llvm::Value* min; + llvm::Value* max; + llvm::Value* sqrt; + TF_ASSIGN_OR_RETURN( + std::tie(min, max, sqrt), + EmitComplexAbsHelper(prim_type, operand_value, /*return_sqrt=*/true)); + llvm::Value* result = FMul(max, sqrt); + // When (min, max) are (0, 0), (inf, inf), or (NaN, ...), `result` is NaN. + // In such cases, we return `min` instead of `result`. + return Select(FCmpUNO(result, result), min, result); +} - auto div = FDiv(min, max); - auto div_sq = FMul(div, div); - auto one = llvm::ConstantFP::get(max->getType(), 1); - TF_ASSIGN_OR_RETURN(auto sqrt, EmitSqrt(prim_type, FAdd(one, div_sq))); +// Calculates ComplexAbs in the same way, except using: +// sqrt(|a| * sqrt(1 + (b/a)^2)) = sqrt(|a|) * pow(1 + (b/a)^2, .25) +StatusOr ElementalIrEmitter::EmitSqrtComplexAbs( + PrimitiveType prim_type, llvm::Value* operand_value) { + llvm::Value* min; + llvm::Value* max; + llvm::Value* one_p_div_sq; + TF_ASSIGN_OR_RETURN( + std::tie(min, max, one_p_div_sq), + EmitComplexAbsHelper(prim_type, operand_value, /*return_sqrt=*/false)); + TF_ASSIGN_OR_RETURN(llvm::Value * sqrt_max, EmitSqrt(prim_type, max)); + TF_ASSIGN_OR_RETURN(llvm::Value * pow, + EmitPow(prim_type, one_p_div_sq, + llvm::ConstantFP::get(max->getType(), .25))); + llvm::Value* result = FMul(sqrt_max, pow); + // When (min, max) are (0, 0), (inf, inf), or (NaN, ...), `result` is NaN. + // In such cases, we return `min` instead of `result`. 
+ return Select(FCmpUNO(result, result), min, result); +} - auto zero = llvm::ConstantFP::get(max->getType(), 0); - return Select(FCmpOEQ(max, zero), zero, FMul(max, sqrt)); +// Calculates ComplexAbs in the same way, except using: +// rsqrt(|a| * sqrt(1 + (b/a)^2)) = rsqrt(|a|) * rsqrt(sqrt(1 + (b/a)^2)) +StatusOr ElementalIrEmitter::EmitRsqrtComplexAbs( + PrimitiveType prim_type, llvm::Value* operand_value) { + llvm::Value* min; + llvm::Value* max; + llvm::Value* sqrt; + TF_ASSIGN_OR_RETURN( + std::tie(min, max, sqrt), + EmitComplexAbsHelper(prim_type, operand_value, /*return_sqrt=*/true)); + TF_ASSIGN_OR_RETURN(llvm::Value * rsqrt_max, EmitRsqrt(prim_type, max)); + TF_ASSIGN_OR_RETURN(llvm::Value * rsqrt_sqrt, EmitRsqrt(prim_type, sqrt)); + llvm::Value* result = FMul(rsqrt_max, rsqrt_sqrt); + TF_ASSIGN_OR_RETURN(llvm::Value * rsqrt_min, EmitRsqrt(prim_type, min)); + // When (min, max) are (0, 0), (inf, inf), or (NaN, ...), `result` is NaN. + // In such cases, we return rsqrt(min) instead of `result`. + return Select(FCmpUNO(result, result), rsqrt_min, result); +} + +// Using our EmitComplexPower formula, but setting c=0.5 and d=0, we get: +// e^[ln(r)*c - t*d] * [cos(ln(r)*d + t*c) + i*sin(ln(r)*d + t*c)] +// = e^[ln(r)*0.5] * [cos(t*0.5) + i*sin(t*0.5)] +// = r^0.5 * [cos(t/2) + i*sin(t/2)] +// = sqrt(r) * [cos(t/2) + i*sin(t/2)] +// where r = |a+bi| and t = atan2(b,a) +// TODO(bixia): See doc for implementation without atan2. +StatusOr ElementalIrEmitter::EmitComplexSqrt( + const HloInstruction* op, PrimitiveType prim_type, + llvm::Value* operand_value) { + llvm::Type* type = static_cast(operand_value->getType()) + ->getElementType(0); + + TF_ASSIGN_OR_RETURN(llvm::Value * r, + EmitSqrtComplexAbs(prim_type, operand_value)); + + llvm::Value* a = EmitExtractReal(operand_value); + llvm::Value* b = EmitExtractImag(operand_value); + TF_ASSIGN_OR_RETURN(llvm::Value * t, EmitAtan2(prim_type, b, a)); + + llvm::Value* c = llvm::ConstantFP::get(type, 0.5); + llvm::Value* angle = FMul(t, c); + TF_ASSIGN_OR_RETURN(llvm::Value * cos, EmitCos(prim_type, angle)); + TF_ASSIGN_OR_RETURN(llvm::Value * sin, EmitSin(prim_type, angle)); + + llvm::Value* real_part; + llvm::Value* imag_part; + + llvm::Value* zero = llvm::ConstantFP::get(type, 0); + + if (!(b_->getFastMathFlags().noNaNs() && b_->getFastMathFlags().noInfs())) { + llvm::Value* inf = llvm::ConstantFP::getInfinity(type); + llvm::Value* neg_inf = llvm::ConstantFP::getInfinity(type, true); + llvm::Value* nan = llvm::ConstantFP::getNaN(type); + llvm::Value* abs_b = llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::fabs, + {b}, {b->getType()}, b_); + + real_part = Select(Or(FCmpOEQ(abs_b, inf), FCmpOEQ(a, inf)), inf, + Select(And(FCmpOEQ(a, neg_inf), FCmpONE(abs_b, inf)), + zero, FMul(r, cos))); + + llvm::Value* b_signed_inf = llvm_ir::EmitCallToIntrinsic( + llvm::Intrinsic::copysign, {inf, b}, {b->getType()}, b_); + imag_part = + Select(Or(FCmpOEQ(abs_b, inf), FCmpOEQ(a, neg_inf)), b_signed_inf, + Select(FCmpUNO(r, r), nan, + Select(FCmpOEQ(sin, zero), sin, FMul(r, sin)))); + } else { + real_part = FMul(r, cos); + imag_part = Select(FCmpOEQ(sin, zero), sin, FMul(r, sin)); + } + + return Select(FCmpOEQ(r, zero), EmitComposeComplex(op, zero, zero), + EmitComposeComplex(op, real_part, imag_part)); +} + +// Similar to Sqrt, we can use our EmitComplexPower formula, but set +// c=-0.5 and d=0. 
We get: +// e^[ln(r)*c - t*d] * [cos(ln(r)*d + t*c) + i*sin(ln(r)*d + t*c)] +// = e^[ln(r)*-0.5] * [cos(t*-0.5) + i*sin(t*-0.5)] +// = r^(-0.5) * [cos(-t/2) + i*sin(-t/2)] +// = rsqrt(r) * [cos(-t/2) + i*sin(-t/2)] +// where r = |a+bi| and t = atan2(b,a). +StatusOr ElementalIrEmitter::EmitComplexRsqrt( + const HloInstruction* op, PrimitiveType prim_type, + llvm::Value* operand_value) { + llvm::Type* type = static_cast(operand_value->getType()) + ->getElementType(0); + + TF_ASSIGN_OR_RETURN(llvm::Value * r, + EmitRsqrtComplexAbs(prim_type, operand_value)); + + llvm::Value* a = EmitExtractReal(operand_value); + llvm::Value* b = EmitExtractImag(operand_value); + TF_ASSIGN_OR_RETURN(llvm::Value * t, EmitAtan2(prim_type, b, a)); + + llvm::Value* c = llvm::ConstantFP::get(type, -0.5); + llvm::Value* angle = FMul(t, c); + TF_ASSIGN_OR_RETURN(llvm::Value * cos, EmitCos(prim_type, angle)); + TF_ASSIGN_OR_RETURN(llvm::Value * sin, EmitSin(prim_type, angle)); + + llvm::Value* real_part = FMul(r, cos); + llvm::Value* imag_part = FMul(r, sin); + + if (!(b_->getFastMathFlags().noNaNs() && b_->getFastMathFlags().noInfs())) { + llvm::Value* zero = llvm::ConstantFP::get(type, 0); + llvm::Value* neg_one = llvm::ConstantFP::get(type, -1); + llvm::Value* inf = llvm::ConstantFP::getInfinity(type); + llvm::Value* nan = llvm::ConstantFP::getNaN(type); + // llvm::Value* neg_inf = llvm::ConstantFP::getInfinity(type, true); + llvm::Value* a_signed_zero = llvm_ir::EmitCallToIntrinsic( + llvm::Intrinsic::copysign, {zero, a}, {a->getType()}, b_); + llvm::Value* b_signed_zero = llvm_ir::EmitCallToIntrinsic( + llvm::Intrinsic::copysign, {zero, b}, {b->getType()}, b_); + llvm::Value* neg_b_signed_zero = FMul(b_signed_zero, neg_one); + + llvm::Value* abs_a = llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::fabs, + {a}, {a->getType()}, b_); + llvm::Value* abs_b = llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::fabs, + {b}, {b->getType()}, b_); + + llvm::Value* is_zero_zero = And(FCmpOEQ(b, zero), FCmpOEQ(a, zero)); + real_part = Select( + is_zero_zero, inf, + Select(Or(And(FCmpOEQ(abs_b, inf), FCmpUNO(a, a)), FCmpOEQ(abs_a, inf)), + a_signed_zero, FMul(r, cos))); + imag_part = Select( + is_zero_zero, nan, + Select(Or(And(FCmpOEQ(abs_b, inf), FCmpUNO(a, a)), FCmpOEQ(abs_a, inf)), + neg_b_signed_zero, FMul(r, sin))); + } else { + llvm::Value* zero = llvm::ConstantFP::get(type, 0); + llvm::Value* inf = llvm::ConstantFP::getInfinity(type); + llvm::Value* nan = llvm::ConstantFP::getNaN(type); + + llvm::Value* is_zero_zero = And(FCmpOEQ(b, zero), FCmpOEQ(a, zero)); + real_part = Select(is_zero_zero, inf, FMul(r, cos)); + imag_part = Select(is_zero_zero, nan, FMul(r, sin)); + } + + return EmitComposeComplex(op, real_part, imag_part); } // (a+bi)^(c+di) = @@ -1051,7 +1322,7 @@ StatusOr ElementalIrEmitter::EmitLog1p(PrimitiveType prim_type, return Select(x_is_small, for_small_x, for_large_x); } -StatusOr ElementalIrEmitter::EmitSqrt(PrimitiveType prim_type, +StatusOr ElementalIrEmitter::EmitSqrt(PrimitiveType, llvm::Value* value) { return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::sqrt, {value}, {value->getType()}, b_); @@ -1097,7 +1368,10 @@ StatusOr ElementalIrEmitter::EmitExpm1(PrimitiveType prim_type, auto x_squared = FMul(x, x); auto x_squared_over_two = FMul(x_squared, half); auto for_small_x = FAdd(x, x_squared_over_two); - const auto kExponentIsSmallThreshold = 1e-5; + // At this point, the relative errors due to floating point precision loss of + // calculating exp(x) - 1 and the polynomial exp(x)-1 = x + x^2/2 are about + 
// equal, with a value of approximetely 2^-16. + const auto kExponentIsSmallThreshold = 0.009; auto abs_x = llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::fabs, {value}, {type}, b_); auto x_is_small = @@ -1433,7 +1707,8 @@ StatusOr ElementalIrEmitter::EmitElementalConcatenate( source_index_phis[operand_id] = PHI(source_index.GetType(), operand_usage_count[operand_id]); std::vector operand_multi_index = source_index.multidim(); - operand_multi_index[concat_dim] = source_index_phis[operand_id]; + operand_multi_index[concat_dim] = + NSWSub(operand_multi_index[concat_dim], source_index_phis[operand_id]); // Create the terminator of the block before calling operand generators, // because they require non-degenerate basic blocks. @@ -1447,25 +1722,24 @@ StatusOr ElementalIrEmitter::EmitElementalConcatenate( b_->SetInsertPoint(init_block, saved_insert_point); } - std::vector source_multi_index = source_index.multidim(); + int64 concat_dim_size = 0; for (int64 operand_idx = 0; operand_idx < hlo->operand_count(); ++operand_idx) { const HloInstruction* operand = hlo->operand(operand_idx); auto false_block = llvm_ir::CreateBasicBlock( exit_block, StrCat("concat_index_not_from_operand", operand_idx), b_); - auto concat_dim_size = source_index.GetConstantWithIndexType( - operand->shape().dimensions(concat_dim)); int64 operand_id = to_unique_operand_id[operand]; - source_index_phis[operand_id]->addIncoming(source_multi_index[concat_dim], - b_->GetInsertBlock()); - CondBr(ICmpULT(source_multi_index[concat_dim], concat_dim_size), + source_index_phis[operand_id]->addIncoming( + source_index.GetConstantWithIndexType(concat_dim_size), + b_->GetInsertBlock()); + concat_dim_size += operand->shape().dimensions(concat_dim); + CondBr(ICmpULT(source_index[concat_dim], + source_index.GetConstantWithIndexType(concat_dim_size)), emit_operand_blocks[operand_id], false_block); // Subtract the size of the concat dimension of the current operand // from the source index. 
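The rewritten concatenate indexing above amounts to a running prefix sum over operand sizes: a position i along the concat dimension belongs to the first operand whose cumulative size exceeds i, and the PHI node carries the cumulative offset that is subtracted off. A plain-C++ sketch of that mapping (illustration only):

#include <cstdint>
#include <utility>
#include <vector>

// Returns {operand index, index within that operand} for position `i` along
// the concatenated dimension, or {-1, -1} if `i` is out of range.
std::pair<int, int64_t> MapConcatIndex(
    int64_t i, const std::vector<int64_t>& operand_sizes) {
  int64_t cumulative = 0;
  for (int operand = 0; operand < static_cast<int>(operand_sizes.size());
       ++operand) {
    const int64_t offset = cumulative;  // what the PHI node feeds in
    cumulative += operand_sizes[operand];
    if (i < cumulative) {
      return {operand, i - offset};
    }
  }
  return {-1, -1};
}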
b_->SetInsertPoint(false_block); - source_multi_index[concat_dim] = - Sub(source_multi_index[concat_dim], concat_dim_size); } Unreachable(); diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/elemental_ir_emitter.h index 3ba669c5365..99833a5525f 100644 --- a/tensorflow/compiler/xla/service/elemental_ir_emitter.h +++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.h @@ -143,9 +143,26 @@ class ElementalIrEmitter : public IrBuilderMixin { virtual StatusOr EmitReducePrecision(const HloInstruction* hlo, llvm::Value* x); + virtual StatusOr> + EmitComplexAbsHelper(PrimitiveType prim_type, llvm::Value* operand_value, + bool return_sqrt); + virtual StatusOr EmitComplexAbs(PrimitiveType prim_type, llvm::Value* operand_value); + virtual StatusOr EmitSqrtComplexAbs(PrimitiveType prim_type, + llvm::Value* operand_value); + virtual StatusOr EmitRsqrtComplexAbs( + PrimitiveType prim_type, llvm::Value* operand_value); + + virtual StatusOr EmitComplexSqrt(const HloInstruction* op, + PrimitiveType prim_type, + llvm::Value* operand_value); + + virtual StatusOr EmitComplexRsqrt(const HloInstruction* op, + PrimitiveType prim_type, + llvm::Value* operand_value); + virtual llvm::Value* EmitExtractReal(llvm::Value* value); virtual llvm::Value* EmitExtractImag(llvm::Value* value); diff --git a/tensorflow/compiler/xla/service/executable.cc b/tensorflow/compiler/xla/service/executable.cc index 7b60c983b30..c45ecc7c2c4 100644 --- a/tensorflow/compiler/xla/service/executable.cc +++ b/tensorflow/compiler/xla/service/executable.cc @@ -26,9 +26,42 @@ limitations under the License. #include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/lib/strings/proto_serialization.h" #include "tensorflow/core/platform/env.h" +#include "tensorflow/stream_executor/device_description.h" namespace xla { +StatusOr Executable::ExecuteOnStream( + const ServiceExecutableRunOptions* run_options, + absl::Span arguments, + HloExecutionProfile* hlo_execution_profile) { + StatusOr result = + ExecuteAsyncOnStream(run_options, arguments, hlo_execution_profile); + Status blocking_status = run_options->stream()->BlockHostUntilDone(); + TF_RETURN_IF_ERROR(result.status()); + TF_RETURN_IF_ERROR(blocking_status); + return result; +} + +StatusOr Executable::ExecuteOnStream( + const ServiceExecutableRunOptions* run_options, + std::vector> arguments, + HloExecutionProfile* hlo_execution_profile) { + StatusOr result = ExecuteAsyncOnStream( + run_options, std::move(arguments), hlo_execution_profile); + Status blocking_status = run_options->stream()->BlockHostUntilDone(); + TF_RETURN_IF_ERROR(result.status()); + TF_RETURN_IF_ERROR(blocking_status); + return result; +} + +StatusOr Executable::ExecuteAsyncOnStream( + const ServiceExecutableRunOptions* /*run_options*/, + std::vector> /*arguments*/, + HloExecutionProfile* /*hlo_execution_profile*/) { + return Unimplemented( + "MaybeOwningDeviceMemory version of overload is not implemented "); +} + StatusOr> Executable::ExecuteOnStreams( absl::Span run_options, absl::Span> arguments) { @@ -49,8 +82,9 @@ StatusOr> Executable::ExecuteOnStreams( // We cannot BlockHostUntilDone() on the already-launched executions in case // of error, since if the executions communicate, the initially launched // executions may never complete if not all executions are running. 
- TF_ASSIGN_OR_RETURN(auto rv, - ExecuteAsyncOnStream(&run_options[i], arguments[i])); + TF_ASSIGN_OR_RETURN( + auto rv, ExecuteAsyncOnStream(&run_options[i], arguments[i], + /*hlo_execution_profile=*/nullptr)); return_values.push_back(std::move(rv)); } for (const auto& options : run_options) { @@ -61,27 +95,39 @@ StatusOr> Executable::ExecuteOnStreams( } StatusOr Executable::ExecuteOnStreamWrapper( - const ServiceExecutableRunOptions* run_options, ExecutionProfile* profile, + const ServiceExecutableRunOptions* run_options, + absl::Span arguments) { + StatusOr result = + ExecuteAsyncOnStreamWrapper(run_options, arguments); + Status block_status = run_options->stream()->BlockHostUntilDone(); + TF_RETURN_IF_ERROR(result.status()); + TF_RETURN_IF_ERROR(block_status); + return result; +} + +StatusOr Executable::ExecuteAsyncOnStreamWrapper( + const ServiceExecutableRunOptions* run_options, absl::Span arguments) { se::Stream* stream = run_options->stream(); - std::unique_ptr timer; + std::shared_ptr timer; + ExecutionProfile* profile = run_options->run_options().execution_profile(); if (profile != nullptr) { - timer.reset(new se::Timer(stream->parent())); + timer = std::make_shared(stream->parent()); stream->InitTimer(timer.get()).ThenStartTimer(timer.get()); } VLOG(1) << "enqueueing executable on stream..."; // If the profiling flag isn't enabled, we pass nullptr as the profile to // indicate profiling is not requested. - std::unique_ptr profile_ptr = + std::shared_ptr profile_ptr = module_config().debug_options().xla_hlo_profile() && hlo_profiling_enabled() - ? absl::make_unique(&hlo_profile_printer_data(), - &hlo_profile_index_map()) + ? std::make_shared(&hlo_profile_printer_data(), + &hlo_profile_index_map()) : nullptr; StatusOr return_value = - ExecuteOnStream(run_options, arguments, profile_ptr.get()); + ExecuteAsyncOnStream(run_options, arguments, profile_ptr.get()); if (!return_value.status().ok()) { if (profile != nullptr) { // Ensure the ThenStartTimer call has completed before we destroy timer. @@ -96,30 +142,19 @@ StatusOr Executable::ExecuteOnStreamWrapper( } if (profile != nullptr) { - VLOG(1) << "enqueueing 'stop timer' and blocking host until done..."; + VLOG(1) << "enqueueing 'stop timer' and profiling callback..."; stream->ThenStopTimer(timer.get()); - TF_RETURN_IF_ERROR(stream->BlockHostUntilDone()); - VLOG(1) << "done with block-host-until-done"; + // We block instead of using an async callback because reading the timer + // value may call back into the driver on GPU, which is not allowed. + TF_RETURN_IF_ERROR(stream->BlockHostUntilDone()); + + const int64 executable_size_in_bytes = SizeOfGeneratedCodeInBytes(); // Merge in run-time profile information from execution_profile. - // - // TODO(b/71713097): This is buggy -- even though the mutex takes care of - // C++ level races, some other concurrent ExecuteOnStreamWrapper call could - // have rewritten the execution_profile before we get to it. - profile->MergeFrom(execution_profile()); // Overall execution time (in nanoseconds) from the executor timer. - if (stream->ok()) { - // Don't read timer->Nanoseconds() if the stream isn't OK -- that's - // illegal. - profile->set_compute_and_transfer_time_ns(timer->Nanoseconds()); - } + profile->set_compute_and_transfer_time_ns(timer->Nanoseconds()); - // TODO(b/28123297): On GPU we end up including transfer time in - // the compute time this way. Instead, we should get the correct - // value by measuring it. 
Setting the field here at least lets - // benchmarks provide *some* value for GPU computations. - // // TODO(b/28447609): The value in compute_and_transfer_time_ns is actually // the compute time without the transfer time, so this way we get the // correct compute time. We should instead have the correct value for @@ -128,21 +163,23 @@ StatusOr Executable::ExecuteOnStreamWrapper( profile->set_compute_time_ns(profile->compute_and_transfer_time_ns()); } - const int64 executable_size_in_bytes = SizeInBytes(); if (executable_size_in_bytes != 0) { profile->set_executable_size_in_bytes(executable_size_in_bytes); } } if (profile_ptr != nullptr) { - XLA_LOG_LINES( - tensorflow::INFO, - profile_ptr->ToString(stream->parent()->GetDeviceDescription())); + const se::DeviceDescription* device_description = + &stream->parent()->GetDeviceDescription(); + stream->ThenDoHostCallback([profile_ptr, device_description]() { + XLA_LOG_LINES(tensorflow::INFO, + profile_ptr->ToString(*device_description)); + }); } return return_value; } -int64 Executable::SizeInBytes() { return -1; } +int64 Executable::SizeOfGeneratedCodeInBytes() { return -1; } } // namespace xla diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h index 492ea72228d..223832271ec 100644 --- a/tensorflow/compiler/xla/service/executable.h +++ b/tensorflow/compiler/xla/service/executable.h @@ -123,16 +123,10 @@ class Executable { // enabled. // // Returns a shaped buffer containing the result of the computation. - virtual StatusOr ExecuteOnStream( + StatusOr ExecuteOnStream( const ServiceExecutableRunOptions* run_options, absl::Span arguments, - HloExecutionProfile* hlo_execution_profile) = 0; - - // Same as ExecuteOnStream(), but this call is non-blocking and returns as - // soon as all of the operations are enqueued for launch on the stream. - virtual StatusOr ExecuteAsyncOnStream( - const ServiceExecutableRunOptions* run_options, - absl::Span arguments) = 0; + HloExecutionProfile* hlo_execution_profile); // Starts the given program executing on the given stream/executor. // @@ -143,20 +137,31 @@ class Executable { // // If an input is donated to XLA but is not reused as output, it is returned // as an leftover buffer for the caller to release. - virtual StatusOr ExecuteOnStream( + // + // This call should be non-blocking and may return as soon as all of the + // operations are enqueued for launch on the stream. Note that some + // implementations may in fact block or may block in some circumstances (e.g., + // when profiling); i.e., asynchronous is a "may" not a "must". + // + // If the hlo_execution_profile is provided as non-nullptr, profiling will be + // enabled. Note that profiling is tricky to use correctly, as the profiling + // objects (when they exist) must out-live the task. + virtual StatusOr ExecuteAsyncOnStream( + const ServiceExecutableRunOptions* run_options, + absl::Span arguments, + HloExecutionProfile* hlo_execution_profile) = 0; + + // Same as ExecuteAsyncOnStream(), but blocks waiting for the computation to + // complete. 
+ StatusOr ExecuteOnStream( const ServiceExecutableRunOptions* run_options, std::vector> arguments, - HloExecutionProfile* hlo_execution_profile) { - return Unimplemented( - "MaybeOwningDeviceMemory version of overload is not implemented "); - } + HloExecutionProfile* hlo_execution_profile); virtual StatusOr ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, - std::vector> arguments) { - return Unimplemented( - "MaybeOwningDeviceMemory version of overload is not implemented "); - } + std::vector> arguments, + HloExecutionProfile* hlo_execution_profile); // Same as ExecuteOnStream(), but runs this executable on multiple // streams. arguments[i] contains the arguments to the execution on @@ -171,6 +176,7 @@ class Executable { // called explicitly for other (async, for example) variants after the stream // has completed. virtual Status PopulateExecutionProfile( + ExecutionProfile* execution_profile, HloExecutionProfile* hlo_execution_profile, se::Stream* stream) { return Status::OK(); } @@ -179,15 +185,12 @@ class Executable { // timer for the execution, sets up HLO profiling if enabled, and fills in the // given ExecutionProfile if non-null. StatusOr ExecuteOnStreamWrapper( - const ServiceExecutableRunOptions* run_options, ExecutionProfile* profile, + const ServiceExecutableRunOptions* run_options, absl::Span arguments); - // Returns the ExecutionProfile from executing on the device. This includes - // the number of cycles taken for the computation or the compilation time. - ExecutionProfile execution_profile() const { - tensorflow::mutex_lock lock(mutex_); - return execution_profile_; - } + StatusOr ExecuteAsyncOnStreamWrapper( + const ServiceExecutableRunOptions* run_options, + absl::Span arguments); const HloProfilePrinterData& hlo_profile_printer_data() const { CHECK(hlo_profiling_enabled()); @@ -219,30 +222,27 @@ class Executable { return hlo_module_->config().entry_computation_layout().result_shape(); } - // Returns the size of the executable in bytes. Returns -1 by default if the - // method is not overridden to support this kind of query. - virtual int64 SizeInBytes(); + // Returns the size of the executable in bytes. Returns -1 if this query is + // not supported by the executable. + // + // Does not include the size of used libraries (e.g. cuDNN, Eigen, etc.). + virtual int64 SizeOfGeneratedCodeInBytes(); // Dumping helpers. - void set_hlo_snapshot(std::unique_ptr hlo_snapshot) { - hlo_snapshot_ = std::move(hlo_snapshot); + void set_hlo_proto(std::unique_ptr hlo_proto) { + hlo_proto_ = std::move(hlo_proto); } - bool dumping_snapshot() const { return hlo_snapshot_ != nullptr; } - HloSnapshot* hlo_snapshot() const { return hlo_snapshot_.get(); } + bool dumping_snapshot() const { return hlo_proto_ != nullptr; } + HloProto const* hlo_proto() const { return hlo_proto_.get(); } protected: - mutable tensorflow::mutex mutex_; - - // Execution profile data on the device. - ExecutionProfile execution_profile_ GUARDED_BY(mutex_); - // HloModule this was compiled from. BufferAssignment keeps pointers to // HloInstructions owned by the HloModule so we need to keep the HloModule // around. const std::shared_ptr hlo_module_; - // HloSnapshot this was compiled from. Null if not dumping executions. - std::unique_ptr hlo_snapshot_; + // The serialized HLO proto. Non-null only if dumping snapshots is enabled. + std::unique_ptr hlo_proto_; // Execution count, used to generate a unique filename for each dumped // execution. 
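The synchronous ExecuteOnStream entry points in this change all reduce to the same wrapper: enqueue via the virtual ExecuteAsyncOnStream, block on the stream, and surface the enqueue failure ahead of the blocking failure. A simplified standalone sketch of that pattern (hypothetical Status and functor types, illustration only):

#include <functional>
#include <string>

// Minimal stand-in for a status type, for illustration.
struct Status {
  bool ok = true;
  std::string message;
};

// `enqueue` launches work asynchronously; `block_host_until_done` waits for
// the stream to drain. The stream is always drained, even if the launch
// failed, and the launch error takes precedence -- mirroring the behaviour
// of the synchronous wrappers in this change.
Status RunSynchronously(const std::function<Status()>& enqueue,
                        const std::function<Status()>& block_host_until_done) {
  const Status launch_status = enqueue();
  const Status blocking_status = block_host_until_done();
  if (!launch_status.ok) return launch_status;
  return blocking_status;
}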
diff --git a/tensorflow/compiler/xla/service/fusion_queue.h b/tensorflow/compiler/xla/service/fusion_queue.h index 4ddb96c5539..3eec47ee205 100644 --- a/tensorflow/compiler/xla/service/fusion_queue.h +++ b/tensorflow/compiler/xla/service/fusion_queue.h @@ -15,8 +15,10 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_FUSION_QUEUE_H_ #define TENSORFLOW_COMPILER_XLA_SERVICE_FUSION_QUEUE_H_ -#include +#include +#include +#include "absl/strings/str_cat.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" namespace xla { @@ -25,15 +27,11 @@ namespace xla { using FusionConfig = std::vector>; // Converts fusion config to string format. -static string FusionConfigToString(const FusionConfig& config) { - string s = ""; - for (auto& edge_list : config) { - for (auto edge : edge_list) { - if (edge) { - s += "1"; - } else { - s += "0"; - } +static std::string FusionConfigToString(const FusionConfig& config) { + std::string s; + for (const auto& edge_list : config) { + for (bool edge : edge_list) { + absl::StrAppend(&s, edge ? "1" : "0"); } } return s; diff --git a/tensorflow/compiler/xla/service/generic_transfer_manager.cc b/tensorflow/compiler/xla/service/generic_transfer_manager.cc index 2eae159861c..d65083d701a 100644 --- a/tensorflow/compiler/xla/service/generic_transfer_manager.cc +++ b/tensorflow/compiler/xla/service/generic_transfer_manager.cc @@ -53,7 +53,7 @@ Status GenericTransferManager::WriteSingleTupleIndexTable( TF_RETURN_IF_ERROR(TransferBufferToDevice( stream, GetByteSizeRequirement(shape), element_pointers->data(), region)); // Ensure the buffer is transferred before we destroy element_pointers. - stream->ThenDoHostCallback([element_pointers]() { + stream->ThenRunAfterNextBlockHostUntilDone([element_pointers]() { /* holds reference to element_pointers in closure */ }); return Status::OK(); diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD old mode 100644 new mode 100755 index a5fc6e80cec..053c3051aea --- a/tensorflow/compiler/xla/service/gpu/BUILD +++ b/tensorflow/compiler/xla/service/gpu/BUILD @@ -3,12 +3,24 @@ load("//tensorflow/compiler/xla:xla.bzl", "xla_proto_library") load( - "//tensorflow/core:platform/default/build_config_root.bzl", - "if_static", + "//tensorflow/core/platform:default/build_config_root.bzl", "tf_cuda_tests_tags", ) -load("//tensorflow:tensorflow.bzl", "tf_cc_test", "tf_copts", "tf_cuda_library") +load( + "//tensorflow:tensorflow.bzl", + "tf_cc_test", + "tf_copts", + "tf_cuda_library", +) load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda") +load( + "//tensorflow/core/platform:default/cuda_build_defs.bzl", + "if_cuda_is_configured", +) +load( + "@local_config_rocm//rocm:build_defs.bzl", + "if_rocm_is_configured", +) package( default_visibility = [":friends"], @@ -186,6 +198,26 @@ cc_library( ], ) +cc_library( + name = "thunk_emitter", + srcs = ["thunk_emitter.cc"], + hdrs = ["thunk_emitter.h"], + deps = [ + ":backend_configs", + ":buffer_allocations", + ":gpu_constants", + ":gpu_executable", + ":ir_emission_utils", + ":nccl_all_reduce_thunk", + ":thunk", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla/service:buffer_assignment", + "//tensorflow/compiler/xla/service:custom_call_target_registry", + "//tensorflow/compiler/xla/service:hlo_casting_utils", + ], +) + cc_library( name = "ir_emitter", srcs = [ @@ -213,6 +245,7 @@ cc_library( ":partition_assignment", ":target_util", ":thunk", + ":thunk_emitter", 
"//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", @@ -222,7 +255,6 @@ cc_library( "//tensorflow/compiler/xla:window_util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/service:buffer_assignment", - "//tensorflow/compiler/xla/service:custom_call_target_registry", "//tensorflow/compiler/xla/service:elemental_ir_emitter", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_casting_utils", @@ -260,6 +292,7 @@ cc_library( hdrs = ["parallel_loop_emitter.h"], deps = [ ":partition_assignment", + ":target_util", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/service/llvm_ir:ir_array", @@ -410,7 +443,6 @@ tf_cc_test( cc_library( name = "gpu_executable", srcs = [ - "cholesky_thunk.cc", "collective_permute_thunk.cc", "conditional_thunk.cc", "convolution_thunk.cc", @@ -431,9 +463,10 @@ cc_library( "triangular_solve_thunk.cc", "tuple_thunk.cc", "while_thunk.cc", - ], + ] + if_cuda_is_configured([ + "cholesky_thunk.cc", + ]), hdrs = [ - "cholesky_thunk.h", "collective_permute_thunk.h", "conditional_thunk.h", "convolution_thunk.h", @@ -454,12 +487,13 @@ cc_library( "triangular_solve_thunk.h", "tuple_thunk.h", "while_thunk.h", - ], + ] + if_cuda_is_configured([ + "cholesky_thunk.h", + ]), deps = [ ":backend_configs", ":buffer_allocations", ":cudnn_conv_runner", - ":cusolver_context", ":gpu_debug_info_manager", ":gpu_types", ":hlo_execution_profiler", @@ -495,17 +529,12 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:stream_executor_no_cuda", - "//tensorflow/core/platform/default/build_config:cublas_plugin", - "//tensorflow/core/platform/default/build_config:cudnn_plugin", - "//tensorflow/core/platform/default/build_config:cufft_plugin", - "//tensorflow/core/platform/default/build_config:stream_executor_cuda", # build_cleaner: keep "//tensorflow/core/profiler/lib:traceme", "//tensorflow/stream_executor", "//tensorflow/stream_executor:blas", "//tensorflow/stream_executor:device_memory", "//tensorflow/stream_executor:device_memory_allocator", "//tensorflow/stream_executor:kernel", - "//tensorflow/stream_executor/cuda:cuda_stream", "//tensorflow/stream_executor/gpu:gpu_stream", "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/base:core_headers", @@ -516,8 +545,18 @@ cc_library( "@com_google_absl//absl/strings:str_format", "@com_google_absl//absl/types:optional", "@com_google_absl//absl/types:span", + ] + if_cuda_is_configured([ + ":cusolver_context", + "//tensorflow/stream_executor/cuda:cuda_stream", + "//tensorflow/core/platform/default/build_config:cublas_plugin", + "//tensorflow/core/platform/default/build_config:cudnn_plugin", + "//tensorflow/core/platform/default/build_config:cufft_plugin", + "//tensorflow/core/platform/default/build_config:stream_executor_cuda", # build_cleaner: keep "@local_config_cuda//cuda:cuda_headers", - ], + ]) + if_rocm_is_configured([ + "//tensorflow/core/platform/default/build_config:stream_executor_rocm", + "@local_config_rocm//rocm:rocm_headers", + ]), ) cc_library( @@ -596,6 +635,7 @@ cc_library( ":cudnn_conv_runner", ":gpu_autotuning_proto", ":gpu_executable", + ":hlo_algorithm_blacklist", ":ir_emission_utils", ":stream_executor_util", "//tensorflow/compiler/xla:literal_util", @@ -620,18 +660,6 @@ cc_library( ], ) -cc_library( - name = "scratch_allocator", - srcs = ["scratch_allocator.cc"], - hdrs = ["scratch_allocator.h"], - 
deps = [ - "//tensorflow/compiler/xla:status_macros", - "//tensorflow/compiler/xla:util", - "//tensorflow/core:stream_executor_no_cuda", - "//tensorflow/stream_executor:device_memory_allocator", - ], -) - cc_library( name = "cudnn_conv_runner", srcs = ["cudnn_conv_runner.cc"], @@ -703,10 +731,8 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/stream_executor:blas", - ] + if_static( - ["@local_config_cuda//cuda:cusolver"], - ["//tensorflow/stream_executor/cuda:cusolver_stub"], - ), + "//tensorflow/stream_executor/cuda:cusolver_lib", + ], ) cc_library( @@ -939,6 +965,38 @@ tf_cc_test( ], ) +cc_library( + name = "cublas_gemm_pad_for_tensor_cores", + srcs = ["cublas_gemm_pad_for_tensor_cores.cc"], + hdrs = ["cublas_gemm_pad_for_tensor_cores.h"], + deps = [ + ":ir_emission_utils", + "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla:window_util", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_casting_utils", + "//tensorflow/compiler/xla/service:hlo_pass", + "//tensorflow/core/platform:types", + ], +) + +tf_cc_test( + name = "cublas_gemm_pad_for_tensor_cores_test", + srcs = ["cublas_gemm_pad_for_tensor_cores_test.cc"], + deps = [ + ":cublas_gemm_pad_for_tensor_cores", + ":ir_emission_utils", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla/service:hlo_matchers", + "//tensorflow/compiler/xla/service:hlo_parser", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:test_utils", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", # build_cleaner: keep + ], +) + cc_library( name = "target_constants", hdrs = ["target_constants.h"], @@ -972,20 +1030,19 @@ cc_library( ) cc_library( - name = "nvptx_compiler_impl", - srcs = ["nvptx_compiler.cc"], - hdrs = ["nvptx_compiler.h"], + name = "gpu_compiler", + srcs = [ + "gpu_compiler.cc", + ], + hdrs = [ + "gpu_compiler.h", + ], deps = [ ":cudnn_batchnorm_rewriter", ":cudnn_conv_algorithm_picker", - ":cudnn_conv_pad_for_tensor_cores", ":cudnn_conv_padding_legalization", ":cudnn_conv_rewriter", - ":cudnn_fused_conv_rewriter", - ":cusolver_rewriter", ":fusion_merger", - ":gemm_algorithm_picker", - ":gemm_rewriter", ":gpu_constants", ":gpu_copy_insertion", ":gpu_executable", @@ -1013,7 +1070,7 @@ cc_library( "//tensorflow/compiler/xla/service:buffer_assignment", "//tensorflow/compiler/xla/service:call_inliner", "//tensorflow/compiler/xla/service:conditional_simplifier", - "//tensorflow/compiler/xla/service:convolution_group_converter", + "//tensorflow/compiler/xla/service:depthwise_convolution_converter", "//tensorflow/compiler/xla/service:dot_decomposer", "//tensorflow/compiler/xla/service:dump", "//tensorflow/compiler/xla/service:dynamic_index_splitter", @@ -1038,6 +1095,7 @@ cc_library( "//tensorflow/compiler/xla/service:reshape_mover", "//tensorflow/compiler/xla/service:rng_expander", "//tensorflow/compiler/xla/service:slice_sinker", + "//tensorflow/compiler/xla/service:slow_operation_alarm", "//tensorflow/compiler/xla/service:sort_simplifier", "//tensorflow/compiler/xla/service:stable_sort_expander", "//tensorflow/compiler/xla/service:transpose_folding", @@ -1048,15 +1106,12 @@ cc_library( "//tensorflow/compiler/xla/service:zero_sized_hlo_elimination", "//tensorflow/compiler/xla/service/gpu/llvm_gpu_backend", "//tensorflow/compiler/xla/service/llvm_ir:llvm_util", - "//tensorflow/core:cuda_libdevice_path", 
"//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:regexp_internal", "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/core/profiler/lib:traceme", "//tensorflow/stream_executor:stream_executor_headers", - "//tensorflow/stream_executor/cuda:cuda_diagnostics", - "//tensorflow/stream_executor/cuda:ptxas_utils", "@com_google_absl//absl/container:node_hash_map", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", @@ -1068,11 +1123,108 @@ cc_library( cc_library( name = "nvptx_compiler", - srcs = ["nvptx_compiler_registration.cc"], - deps = [":nvptx_compiler_impl"], + srcs = [ + "nvptx_compiler_registration.cc", + ], + deps = [ + ":nvptx_compiler_impl", + ], alwayslink = True, # Contains compiler registration ) +cc_library( + name = "nvptx_compiler_impl", + srcs = [ + "nvptx_compiler.cc", + ], + hdrs = [ + "nvptx_compiler.h", + ], + deps = [ + ":cudnn_conv_algorithm_picker", + ":cudnn_conv_pad_for_tensor_cores", + ":cudnn_conv_padding_legalization", + ":cudnn_conv_rewriter", + ":cudnn_fused_conv_rewriter", + ":cusolver_rewriter", + ":gemm_algorithm_picker", + ":gemm_rewriter", + ":gpu_compiler", + ":gpu_layout_assignment", + ":stream_executor_util", + ":target_constants", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla/service:algebraic_simplifier", + "//tensorflow/compiler/xla/service:dump", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_constant_folding", + "//tensorflow/compiler/xla/service:hlo_cse", + "//tensorflow/compiler/xla/service:hlo_pass", + "//tensorflow/compiler/xla/service:hlo_pass_pipeline", + "//tensorflow/compiler/xla/service:hlo_proto", + "//tensorflow/compiler/xla/service:hlo_verifier", + "//tensorflow/compiler/xla/service:llvm_compiler", + "//tensorflow/compiler/xla/service:tuple_simplifier", + "//tensorflow/compiler/xla/service/gpu/llvm_gpu_backend", + "//tensorflow/compiler/xla/service/llvm_ir:llvm_util", + "//tensorflow/core:cuda_libdevice_path", + "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", + "//tensorflow/core/profiler/lib:traceme", + "//tensorflow/stream_executor:stream_executor_headers", + "//tensorflow/stream_executor/cuda:cuda_diagnostics", + "//tensorflow/stream_executor/cuda:ptxas_utils", + "@com_google_absl//absl/container:node_hash_map", + "@com_google_absl//absl/types:optional", + ], +) + +cc_library( + name = "amdgpu_compiler", + srcs = [ + "amdgpu_compiler_registration.cc", + ], + deps = [ + ":amdgpu_compiler_impl", + ], + alwayslink = True, # Contains compiler registration +) + +cc_library( + name = "amdgpu_compiler_impl", + srcs = [ + "amdgpu_compiler.cc", + ], + hdrs = [ + "amdgpu_compiler.h", + ], + deps = [ + ":cudnn_conv_padding_legalization", + ":cudnn_conv_rewriter", + ":gpu_compiler", + ":gpu_layout_assignment", + ":target_constants", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla/service:algebraic_simplifier", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_constant_folding", + "//tensorflow/compiler/xla/service:hlo_cse", + "//tensorflow/compiler/xla/service:hlo_pass", + "//tensorflow/compiler/xla/service:hlo_pass_pipeline", + "//tensorflow/compiler/xla/service:hlo_verifier", + "//tensorflow/compiler/xla/service:llvm_compiler", + "//tensorflow/compiler/xla/service:tuple_simplifier", + "//tensorflow/compiler/xla/service/gpu/llvm_gpu_backend", + 
"//tensorflow/compiler/xla/service/llvm_ir:llvm_util", + "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", + "//tensorflow/core/platform:rocm_rocdl_path", + ], +) + cc_library( name = "cudnn_batchnorm_rewriter", srcs = ["cudnn_batchnorm_rewriter.cc"], @@ -1411,3 +1563,30 @@ xla_proto_library( "//tensorflow/core:autotuning_proto_cc", ], ) + +cc_library( + name = "hlo_algorithm_blacklist", + srcs = ["hlo_algorithm_blacklist.cc"], + hdrs = ["hlo_algorithm_blacklist.h"], + deps = [ + ":gpu_autotuning_proto", + "//tensorflow/compiler/xla:debug_options_flags", + "//tensorflow/core:autotuning_proto_cc", + "//tensorflow/core:stream_executor_no_cuda", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/strings", + ], +) + +tf_cc_test( + name = "hlo_algorithm_blacklist_test", + srcs = ["hlo_algorithm_blacklist_test.cc"], + data = ["data/hlo_algorithm_blacklist.pbtxt"], + deps = [ + ":hlo_algorithm_blacklist", + "//tensorflow/core:lib", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/stream_executor:dnn", + ], +) diff --git a/tensorflow/compiler/xla/service/gpu/amdgpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/amdgpu_compiler.cc new file mode 100644 index 00000000000..949707a22e6 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/amdgpu_compiler.cc @@ -0,0 +1,156 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/gpu/amdgpu_compiler.h" + +#include "tensorflow/compiler/xla/service/algebraic_simplifier.h" +#include "tensorflow/compiler/xla/service/gpu/cudnn_conv_padding_legalization.h" +#include "tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.h" +// TODO(whchung@gmail.com): Add gpu_conv_algorithm_picker after its PR merged. +#include "tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.h" +#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h" +#include "tensorflow/compiler/xla/service/gpu/target_constants.h" +#include "tensorflow/compiler/xla/service/hlo_constant_folding.h" +#include "tensorflow/compiler/xla/service/hlo_cse.h" +#include "tensorflow/compiler/xla/service/hlo_pass_fix.h" +#include "tensorflow/compiler/xla/service/hlo_pass_pipeline.h" +#include "tensorflow/compiler/xla/service/hlo_verifier.h" +#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" +#include "tensorflow/compiler/xla/service/tuple_simplifier.h" +#include "tensorflow/core/platform/rocm_rocdl_path.h" + +namespace xla { +namespace gpu { + +namespace { + +// Returns the directory containing ROCm-Device-Libs files. This function is +// called in AMDGPUCompiler's constructor, so can't return an error. But +// AMDGPUCompiler::Compile will return an error when the wanted rocdl file +// doesn't exist in the folder this function returns. 
+string GetROCDLDir(const HloModuleConfig& config) { + std::vector potential_rocdl_dirs; + const string datadir = config.debug_options().xla_gpu_cuda_data_dir(); + if (!datadir.empty()) { + potential_rocdl_dirs.push_back(datadir); + } + potential_rocdl_dirs.push_back(tensorflow::RocdlRoot()); + + // Tries all potential ROCDL directories in the order they are inserted. + // Returns the first directory that exists in the file system. + for (const string& potential_rocdl_dir : potential_rocdl_dirs) { + if (tensorflow::Env::Default()->IsDirectory(potential_rocdl_dir).ok()) { + VLOG(2) << "Found ROCm-Device-Libs dir " << potential_rocdl_dir; + return potential_rocdl_dir; + } + VLOG(2) << "Unable to find potential ROCm-Device-Libs dir " + << potential_rocdl_dir; + } + + // Last resort: maybe in the current folder. + return "."; +} + +} // namespace + +Status AMDGPUCompiler::OptimizeHloConvolutionCanonicalization( + HloModule* hlo_module, se::StreamExecutor* stream_exec, + se::DeviceMemoryAllocator* device_allocator) { + // Convert convolutions into CustomCalls to MIOpen, then canonicalize them + // (PadInsertion). + HloPassPipeline pipeline("conv_canonicalization"); + pipeline.AddInvariantChecker(/*layout_sensitive=*/false, + /*allow_mixed_precision=*/false); + pipeline.AddPass(); + pipeline.AddPass(); + + pipeline.AddPass(); + TF_RETURN_IF_ERROR(pipeline.Run(hlo_module).status()); + + return Status::OK(); +} + +Status AMDGPUCompiler::OptimizeHloPostLayoutAssignment( + HloModule* hlo_module, se::StreamExecutor* stream_exec, + se::DeviceMemoryAllocator* device_allocator) { + HloPassPipeline pipeline("post-layout_assignment"); + pipeline.AddInvariantChecker( + /*layout_sensitive=*/true, + /*allow_mixed_precision=*/false, + LayoutAssignment::InstructionCanChangeLayout); + + // The LayoutAssignment pass may leave behind kCopy instructions which are + // duplicate or NOPs, so remove them with algebraic simplification and CSE. + AlgebraicSimplifierOptions options; + options.set_is_layout_sensitive(true); + pipeline.AddPass>(options); + + // TODO(whchung@gmail.com): Add gpu_conv_algorithm_picker after its PR merged. + + // Clean up new_tuple described above. + pipeline.AddPass(); + + pipeline.AddPass(/*is_layout_sensitive=*/true); + TF_RETURN_IF_ERROR(pipeline.Run(hlo_module).status()); + + return Status::OK(); +} + +AMDGPUCompiler::AMDGPUCompiler() + : GpuCompiler(stream_executor::rocm::kROCmPlatformId, amdgpu::kTargetTriple, + amdgpu::kDataLayout) {} + +GpuVersion AMDGPUCompiler::GetGpuVersion(se::StreamExecutor* stream_exec) { + int isa_version = 0; + if (!stream_exec->GetDeviceDescription().rocm_amdgpu_isa_version( + &isa_version)) { + LOG(WARNING) + << "Couldn't get AMDGPU ISA version for device; assuming gfx803."; + isa_version = 803; + } + + return isa_version; +} + +StatusOr>> +AMDGPUCompiler::CompileTargetBinary(const HloModule* module, + llvm::Module* llvm_module, + GpuVersion gpu_version, + se::StreamExecutor* stream_exec) { + if (rocdl_dir_.empty()) { + // Compute rocdl_dir_ just once and cache it in this member. 
+ rocdl_dir_ = GetROCDLDir(module->config()); + } + + std::vector hsaco; + { + XLA_SCOPED_LOGGING_TIMER( + "AMDGPUCompiler::CompileTargetBinary - CompileToHsaco"); + TF_ASSIGN_OR_RETURN(hsaco, + amdgpu::CompileToHsaco(llvm_module, gpu_version, + module->config(), rocdl_dir_)); + } + + llvm_ir::DumpIrIfEnabled(*module, *llvm_module, /*optimized=*/false); + + if (user_post_optimization_hook_) { + user_post_optimization_hook_(*llvm_module); + } + + return std::pair>("", std::move(hsaco)); +} + +} // namespace gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/amdgpu_compiler.h b/tensorflow/compiler/xla/service/gpu/amdgpu_compiler.h new file mode 100644 index 00000000000..d1a74a7822e --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/amdgpu_compiler.h @@ -0,0 +1,60 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_AMDGPU_COMPILER_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_AMDGPU_COMPILER_H_ + +#include +#include +#include + +#include "tensorflow/compiler/xla/service/gpu/gpu_compiler.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/statusor.h" + +namespace xla { +namespace gpu { + +// AMDGPUCompiler generates efficient GPU executables for AMDGPU target. +class AMDGPUCompiler : public GpuCompiler { + public: + AMDGPUCompiler(); + ~AMDGPUCompiler() override {} + + Status OptimizeHloConvolutionCanonicalization( + HloModule* hlo_module, se::StreamExecutor* stream_exec, + se::DeviceMemoryAllocator* device_allocator) override; + + Status OptimizeHloPostLayoutAssignment( + HloModule* hlo_module, se::StreamExecutor* stream_exec, + se::DeviceMemoryAllocator* device_allocator) override; + + GpuVersion GetGpuVersion(se::StreamExecutor* stream_exec) override; + + StatusOr>> CompileTargetBinary( + const HloModule* hlo_module, llvm::Module* llvm_module, + GpuVersion gpu_version, se::StreamExecutor* stream_exec) override; + + private: + // The parent directory of ROCm-Device-Libs IR libraries. + string rocdl_dir_; + + TF_DISALLOW_COPY_AND_ASSIGN(AMDGPUCompiler); +}; + +} // namespace gpu +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_AMDGPU_COMPILER_H_ diff --git a/tensorflow/compiler/xla/service/gpu/amdgpu_compiler_registration.cc b/tensorflow/compiler/xla/service/gpu/amdgpu_compiler_registration.cc new file mode 100644 index 00000000000..3d6d19fe980 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/amdgpu_compiler_registration.cc @@ -0,0 +1,24 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/gpu/amdgpu_compiler.h" + +static bool InitModule() { + xla::Compiler::RegisterCompilerFactory( + stream_executor::rocm::kROCmPlatformId, + []() { return absl::make_unique(); }); + return true; +} +static bool module_initialized = InitModule(); diff --git a/tensorflow/compiler/xla/service/gpu/buffer_comparator.cc b/tensorflow/compiler/xla/service/gpu/buffer_comparator.cc index 30108315e4d..e9b371e33d8 100644 --- a/tensorflow/compiler/xla/service/gpu/buffer_comparator.cc +++ b/tensorflow/compiler/xla/service/gpu/buffer_comparator.cc @@ -34,7 +34,7 @@ namespace gpu { static constexpr double kTolerance = 0.1f; -// Comparison kernel code: compare two buffers of fp16/fp32/fp64 of length +// Comparison kernel code: compare two buffers of fp16/fp32/fp64/int8 of length // buffer_length where the relative error does not exceed the passed // rel_error_threshold. Write the number of mismatches into out parameter // mismatch_count. @@ -46,12 +46,20 @@ static constexpr double kTolerance = 0.1f; // // #include // extern "C" { // avoid name mangling -// __device__ float canonicalize(float input) { +// __device__ float __xla_buffer_comparator_canonicalize(float input) { // // All fp16 infinities are treated as 65505 or -65505, in order to avoid // // differences due to overflows. // return isnan(input) ? 
input : max(-65505.0f, min(input, 65505.0f)); // } -// + +// __device__ float __xla_buffer_comparator_extract_int8(int pack) { +// // Extract the lower 8 bits from pack and convert it to float +// const unsigned int bit_mask = 0xff; +// unsigned int bits = pack & bit_mask; +// char* int8_ptr = (char*)&bits; +// return __int2float_rn(*int8_ptr); +// } + // __global__ void __xla_fp16_comparison(__half* buffer_a, __half* buffer_b, // float rel_error_threshold, // unsigned long long buffer_length, @@ -60,15 +68,15 @@ static constexpr double kTolerance = 0.1f; // if (idx >= buffer_length) return; // float elem_a = __half2float(buffer_a[idx]); // float elem_b = __half2float(buffer_b[idx]); -// elem_a = canonicalize(elem_a); -// elem_b = canonicalize(elem_b); +// elem_a = __xla_buffer_comparator_canonicalize(elem_a); +// elem_b = __xla_buffer_comparator_canonicalize(elem_b); // if (isnan(elem_a) && isnan(elem_b)) return; // float rel_error = abs(elem_a - elem_b) // / (max(abs(elem_a), abs(elem_b)) + 1); // if (rel_error > rel_error_threshold || isnan(rel_error)) // atomicAdd(mismatch_count, 1); // } -// + // __global__ void __xla_fp32_comparison(float* buffer_a, float* buffer_b, // float rel_error_threshold, // unsigned long long buffer_length, @@ -85,7 +93,7 @@ static constexpr double kTolerance = 0.1f; // if (rel_error > rel_error_threshold || isnan(rel_error)) // atomicAdd(mismatch_count, 1); // } -// + // __global__ void __xla_fp64_comparison(double* buffer_a, double* buffer_b, // float rel_error_threshold, // unsigned long long buffer_length, @@ -102,234 +110,440 @@ static constexpr double kTolerance = 0.1f; // if (rel_error > rel_error_threshold || isnan(rel_error)) // atomicAdd(mismatch_count, 1); // } + +// __global__ void __xla_int8_comparison(int* buffer_a, int* buffer_b, +// float rel_error_threshold, +// unsigned long long buffer_length, +// int* mismatch_count) { +// int idx = threadIdx.x + blockIdx.x * blockDim.x; +// if (idx >= buffer_length) return; +// int pack_a = buffer_a[idx]; +// int pack_b = buffer_b[idx]; +// for(int i = 0; i < 4; ++i) { +// float elem_a = __xla_buffer_comparator_extract_int8(pack_a); +// float elem_b = __xla_buffer_comparator_extract_int8(pack_b); +// float rel_error = abs(elem_a - elem_b) +// / (max(abs(elem_a), abs(elem_b)) + 1); +// if (rel_error > rel_error_threshold || isnan(rel_error)) +// atomicAdd(mismatch_count, 1); +// pack_a >>= 8; +// pack_b >>= 8; +// } +// } // } // end extern declaration. 
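The comment block above documents the new __xla_int8_comparison kernel that the PTX string below implements: each 32-bit word packs four int8 values, which are unpacked, widened to float, and checked with the same relative-error rule used by the fp16/fp32/fp64 kernels. For illustration only, a host-side C++ equivalent of that per-word check (the function name is made up, not part of the diff) looks like this:

#include <algorithm>
#include <cmath>
#include <cstdint>

// Returns true if any of the four int8 lanes packed into the two 32-bit words
// differ by more than rel_error_threshold under the relative-error rule.
static bool Int8PackMismatches(int32_t pack_a, int32_t pack_b,
                               float rel_error_threshold) {
  for (int i = 0; i < 4; ++i) {
    // Extract the low byte as a signed 8-bit value and widen it to float,
    // mirroring __xla_buffer_comparator_extract_int8 in the comment above.
    float elem_a = static_cast<float>(static_cast<int8_t>(pack_a & 0xff));
    float elem_b = static_cast<float>(static_cast<int8_t>(pack_b & 0xff));
    float rel_error = std::abs(elem_a - elem_b) /
                      (std::max(std::abs(elem_a), std::abs(elem_b)) + 1);
    if (rel_error > rel_error_threshold || std::isnan(rel_error)) return true;
    pack_a >>= 8;
    pack_b >>= 8;
  }
  return false;
}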
static const char* buffer_compare_ptx = R"( -.version 4.2 +.version 6.4 .target sm_30 .address_size 64 + // .globl __xla_fp16_comparison + .visible .entry __xla_fp16_comparison( - .param .u64 __xla_fp16_comparison_param_0, - .param .u64 __xla_fp16_comparison_param_1, - .param .f32 __xla_fp16_comparison_param_2, - .param .u64 __xla_fp16_comparison_param_3, - .param .u64 __xla_fp16_comparison_param_4 + .param .u64 __xla_fp16_comparison_param_0, + .param .u64 __xla_fp16_comparison_param_1, + .param .f32 __xla_fp16_comparison_param_2, + .param .u64 __xla_fp16_comparison_param_3, + .param .u64 __xla_fp16_comparison_param_4 ) { - .reg .pred %p<10>; - .reg .b16 %rs<3>; - .reg .f32 %f<20>; - .reg .b32 %r<6>; - .reg .b64 %rd<12>; - ld.param.u64 %rd8, [__xla_fp16_comparison_param_3]; - mov.u32 %r1, %tid.x; - mov.u32 %r2, %ctaid.x; - mov.u32 %r3, %ntid.x; - mad.lo.s32 %r4, %r3, %r2, %r1; - cvt.s64.s32 %rd4, %r4; - setp.ge.u64 %p1, %rd4, %rd8; - @%p1 bra LBB7_4; - ld.param.u64 %rd5, [__xla_fp16_comparison_param_0]; - ld.param.u64 %rd7, [__xla_fp16_comparison_param_1]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd5; - shl.b64 %rd9, %rd4, 1; - add.s64 %rd10, %rd3, %rd9; - ld.global.u16 %rs1, [%rd10]; - // begin inline asm - { cvt.f32.f16 %f6, %rs1;} + .reg .pred %p<9>; + .reg .b16 %rs<3>; + .reg .f32 %f<28>; + .reg .b32 %r<6>; + .reg .b64 %rd<12>; - // end inline asm - add.s64 %rd11, %rd2, %rd9; - ld.global.u16 %rs2, [%rd11]; - // begin inline asm - { cvt.f32.f16 %f7, %rs2;} - // end inline asm - abs.f32 %f8, %f6; - setp.gtu.f32 %p2, %f8, 0f7F800000; - min.f32 %f9, %f6, 0f477FE100; - max.f32 %f10, %f9, 0fC77FE100; - selp.f32 %f1, %f6, %f10, %p2; - abs.f32 %f11, %f7; - setp.gtu.f32 %p3, %f11, 0f7F800000; - min.f32 %f12, %f7, 0f477FE100; - max.f32 %f13, %f12, 0fC77FE100; - selp.f32 %f2, %f7, %f13, %p3; - abs.f32 %f3, %f1; - setp.gtu.f32 %p4, %f3, 0f7F800000; - abs.f32 %f4, %f2; - setp.gtu.f32 %p5, %f4, 0f7F800000; - and.pred %p6, %p4, %p5; - @%p6 bra LBB7_4; - ld.param.f32 %f5, [__xla_fp16_comparison_param_2]; - sub.f32 %f14, %f1, %f2; - abs.f32 %f15, %f14; - max.f32 %f16, %f3, %f4; - add.f32 %f17, %f16, 0f3F800000; - div.rn.f32 %f18, %f15, %f17; - setp.leu.f32 %p7, %f18, %f5; - abs.f32 %f19, %f18; - setp.le.f32 %p8, %f19, 0f7F800000; - and.pred %p9, %p7, %p8; - @%p9 bra LBB7_4; - ld.param.u64 %rd6, [__xla_fp16_comparison_param_4]; - cvta.to.global.u64 %rd1, %rd6; - atom.global.add.u32 %r5, [%rd1], 1; -LBB7_4: - ret; + ld.param.u64 %rd1, [__xla_fp16_comparison_param_0]; + ld.param.u64 %rd2, [__xla_fp16_comparison_param_1]; + ld.param.f32 %f10, [__xla_fp16_comparison_param_2]; + ld.param.u64 %rd4, [__xla_fp16_comparison_param_3]; + ld.param.u64 %rd3, [__xla_fp16_comparison_param_4]; + mov.u32 %r2, %ntid.x; + mov.u32 %r3, %ctaid.x; + mov.u32 %r4, %tid.x; + mad.lo.s32 %r1, %r2, %r3, %r4; + cvt.s64.s32 %rd5, %r1; + setp.ge.u64 %p1, %rd5, %rd4; + @%p1 bra BB0_9; + cvta.to.global.u64 %rd6, %rd1; + mul.wide.s32 %rd7, %r1, 2; + add.s64 %rd8, %rd6, %rd7; + ld.global.u16 %rs1, [%rd8]; + // inline asm + { cvt.f32.f16 %f26, %rs1;} + + // inline asm + cvta.to.global.u64 %rd9, %rd2; + add.s64 %rd10, %rd9, %rd7; + ld.global.u16 %rs2, [%rd10]; + // inline asm + { cvt.f32.f16 %f27, %rs2;} + + // inline asm + abs.f32 %f13, %f26; + setp.gtu.f32 %p2, %f13, 0f7F800000; + @%p2 bra BB0_3; + + mov.f32 %f14, 0f477FE100; + min.f32 %f15, %f26, %f14; + mov.f32 %f16, 0fC77FE100; + max.f32 %f26, %f16, %f15; + +BB0_3: + abs.f32 %f17, %f27; + setp.gtu.f32 %p3, %f17, 0f7F800000; + @%p3 bra BB0_5; + + mov.f32 %f18, 
0f477FE100; + min.f32 %f19, %f27, %f18; + mov.f32 %f20, 0fC77FE100; + max.f32 %f27, %f20, %f19; + +BB0_5: + abs.f32 %f7, %f26; + setp.gtu.f32 %p4, %f7, 0f7F800000; + abs.f32 %f8, %f27; + setp.gtu.f32 %p5, %f8, 0f7F800000; + and.pred %p6, %p4, %p5; + @%p6 bra BB0_9; + + sub.f32 %f21, %f26, %f27; + abs.f32 %f22, %f21; + max.f32 %f23, %f7, %f8; + add.f32 %f24, %f23, 0f3F800000; + div.rn.f32 %f9, %f22, %f24; + setp.gt.f32 %p7, %f9, %f10; + @%p7 bra BB0_8; + + abs.f32 %f25, %f9; + setp.le.f32 %p8, %f25, 0f7F800000; + @%p8 bra BB0_9; + +BB0_8: + cvta.to.global.u64 %rd11, %rd3; + atom.global.add.u32 %r5, [%rd11], 1; + +BB0_9: + ret; } - // .globl __xla_fp32_comparison + + // .globl __xla_fp32_comparison .visible .entry __xla_fp32_comparison( - .param .u64 __xla_fp32_comparison_param_0, - .param .u64 __xla_fp32_comparison_param_1, - .param .f32 __xla_fp32_comparison_param_2, - .param .u64 __xla_fp32_comparison_param_3, - .param .u64 __xla_fp32_comparison_param_4 + .param .u64 __xla_fp32_comparison_param_0, + .param .u64 __xla_fp32_comparison_param_1, + .param .f32 __xla_fp32_comparison_param_2, + .param .u64 __xla_fp32_comparison_param_3, + .param .u64 __xla_fp32_comparison_param_4 ) { - .reg .pred %p<12>; - .reg .f32 %f<12>; - .reg .b32 %r<9>; - .reg .b64 %rd<12>; + .reg .pred %p<10>; + .reg .b16 %rs<3>; + .reg .f32 %f<13>; + .reg .b32 %r<10>; + .reg .b64 %rd<12>; - ld.param.u64 %rd8, [__xla_fp32_comparison_param_3]; - mov.u32 %r1, %tid.x; - mov.u32 %r2, %ctaid.x; - mov.u32 %r3, %ntid.x; - mad.lo.s32 %r4, %r3, %r2, %r1; - cvt.s64.s32 %rd4, %r4; - setp.ge.u64 %p1, %rd4, %rd8; - @%p1 bra LBB8_6; - ld.param.u64 %rd5, [__xla_fp32_comparison_param_0]; - ld.param.u64 %rd7, [__xla_fp32_comparison_param_1]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd5; - shl.b64 %rd9, %rd4, 2; - add.s64 %rd10, %rd3, %rd9; - ld.global.f32 %f1, [%rd10]; - add.s64 %rd11, %rd2, %rd9; - ld.global.f32 %f2, [%rd11]; - abs.f32 %f3, %f1; - setp.gtu.f32 %p2, %f3, 0f7F800000; - abs.f32 %f4, %f2; - setp.gtu.f32 %p3, %f4, 0f7F800000; - and.pred %p4, %p2, %p3; - @%p4 bra LBB8_6; - setp.neu.f32 %p5, %f3, 0f7F800000; - setp.neu.f32 %p6, %f4, 0f7F800000; - or.pred %p7, %p5, %p6; - @%p7 bra LBB8_4; - mov.b32 %r5, %f1; - mov.b32 %r6, %f2; - xor.b32 %r7, %r6, %r5; - setp.gt.s32 %p8, %r7, -1; - @%p8 bra LBB8_6; -LBB8_4: - ld.param.f32 %f5, [__xla_fp32_comparison_param_2]; - sub.f32 %f6, %f1, %f2; - abs.f32 %f7, %f6; - max.f32 %f8, %f3, %f4; - add.f32 %f9, %f8, 0f3F800000; - div.rn.f32 %f10, %f7, %f9; - setp.leu.f32 %p9, %f10, %f5; - abs.f32 %f11, %f10; - setp.le.f32 %p10, %f11, 0f7F800000; - and.pred %p11, %p9, %p10; - @%p11 bra LBB8_6; - ld.param.u64 %rd6, [__xla_fp32_comparison_param_4]; - cvta.to.global.u64 %rd1, %rd6; - atom.global.add.u32 %r8, [%rd1], 1; -LBB8_6: - ret; + ld.param.u64 %rd1, [__xla_fp32_comparison_param_0]; + ld.param.u64 %rd2, [__xla_fp32_comparison_param_1]; + ld.param.f32 %f6, [__xla_fp32_comparison_param_2]; + ld.param.u64 %rd4, [__xla_fp32_comparison_param_3]; + ld.param.u64 %rd3, [__xla_fp32_comparison_param_4]; + mov.u32 %r2, %ntid.x; + mov.u32 %r3, %ctaid.x; + mov.u32 %r4, %tid.x; + mad.lo.s32 %r1, %r2, %r3, %r4; + cvt.s64.s32 %rd5, %r1; + setp.ge.u64 %p1, %rd5, %rd4; + @%p1 bra BB1_8; + + cvta.to.global.u64 %rd6, %rd1; + mul.wide.s32 %rd7, %r1, 4; + add.s64 %rd8, %rd6, %rd7; + cvta.to.global.u64 %rd9, %rd2; + add.s64 %rd10, %rd9, %rd7; + ld.global.f32 %f1, [%rd10]; + ld.global.f32 %f2, [%rd8]; + abs.f32 %f3, %f2; + setp.le.f32 %p2, %f3, 0f7F800000; + @%p2 bra BB1_3; + + abs.f32 %f7, %f1; + 
setp.gtu.f32 %p3, %f7, 0f7F800000; + @%p3 bra BB1_8; + +BB1_3: + setp.neu.f32 %p4, %f3, 0f7F800000; + abs.f32 %f4, %f1; + setp.neu.f32 %p5, %f4, 0f7F800000; + or.pred %p6, %p4, %p5; + @%p6 bra BB1_5; + + mov.b32 %r5, %f2; + shr.u32 %r6, %r5, 31; + cvt.u16.u32 %rs1, %r6; + mov.b32 %r7, %f1; + shr.u32 %r8, %r7, 31; + cvt.u16.u32 %rs2, %r8; + setp.eq.s16 %p7, %rs1, %rs2; + @%p7 bra BB1_8; + +BB1_5: + sub.f32 %f8, %f2, %f1; + abs.f32 %f9, %f8; + max.f32 %f10, %f3, %f4; + add.f32 %f11, %f10, 0f3F800000; + div.rn.f32 %f5, %f9, %f11; + setp.gt.f32 %p8, %f5, %f6; + @%p8 bra BB1_7; + + abs.f32 %f12, %f5; + setp.le.f32 %p9, %f12, 0f7F800000; + @%p9 bra BB1_8; + +BB1_7: + cvta.to.global.u64 %rd11, %rd3; + atom.global.add.u32 %r9, [%rd11], 1; + +BB1_8: + ret; } - // .globl __xla_fp64_comparison + + // .globl __xla_fp64_comparison .visible .entry __xla_fp64_comparison( - .param .u64 __xla_fp64_comparison_param_0, - .param .u64 __xla_fp64_comparison_param_1, - .param .f32 __xla_fp64_comparison_param_2, - .param .u64 __xla_fp64_comparison_param_3, - .param .u64 __xla_fp64_comparison_param_4 + .param .u64 __xla_fp64_comparison_param_0, + .param .u64 __xla_fp64_comparison_param_1, + .param .f32 __xla_fp64_comparison_param_2, + .param .u64 __xla_fp64_comparison_param_3, + .param .u64 __xla_fp64_comparison_param_4 ) { - .reg .pred %p<16>; - .reg .f32 %f<2>; - .reg .b32 %r<13>; - .reg .f64 %fd<12>; - .reg .b64 %rd<12>; + .reg .pred %p<11>; + .reg .b16 %rs<3>; + .reg .f32 %f<2>; + .reg .b32 %r<14>; + .reg .f64 %fd<13>; + .reg .b64 %rd<12>; - ld.param.u64 %rd8, [__xla_fp64_comparison_param_3]; - mov.u32 %r2, %tid.x; - mov.u32 %r3, %ctaid.x; - mov.u32 %r4, %ntid.x; - mad.lo.s32 %r5, %r4, %r3, %r2; - cvt.s64.s32 %rd4, %r5; - setp.ge.u64 %p1, %rd4, %rd8; - @%p1 bra LBB9_6; - ld.param.u64 %rd5, [__xla_fp64_comparison_param_0]; - ld.param.u64 %rd7, [__xla_fp64_comparison_param_1]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd5; - shl.b64 %rd9, %rd4, 3; - add.s64 %rd10, %rd3, %rd9; - ld.global.f64 %fd1, [%rd10]; - add.s64 %rd11, %rd2, %rd9; - ld.global.f64 %fd2, [%rd11]; - abs.f64 %fd3, %fd1; - setp.gtu.f64 %p2, %fd3, 0d7FF0000000000000; - abs.f64 %fd4, %fd2; - setp.gtu.f64 %p3, %fd4, 0d7FF0000000000000; - and.pred %p4, %p2, %p3; - @%p4 bra LBB9_6; - { - .reg .b32 %temp; - mov.b64 {%r6, %temp}, %fd1; - } - { - .reg .b32 %temp; - mov.b64 {%temp, %r1}, %fd1; - } - and.b32 %r7, %r1, 2147483647; - setp.ne.s32 %p5, %r7, 2146435072; - setp.ne.s32 %p6, %r6, 0; - or.pred %p7, %p6, %p5; - @%p7 bra LBB9_4; - { - .reg .b32 %temp; - mov.b64 {%r8, %temp}, %fd2; - } - { - .reg .b32 %temp; - mov.b64 {%temp, %r9}, %fd2; - } - and.b32 %r10, %r9, 2147483647; - setp.eq.s32 %p8, %r10, 2146435072; - setp.eq.s32 %p9, %r8, 0; - and.pred %p10, %p8, %p9; - xor.b32 %r11, %r9, %r1; - setp.gt.s32 %p11, %r11, -1; - and.pred %p12, %p11, %p10; - @%p12 bra LBB9_6; -LBB9_4: - ld.param.f32 %f1, [__xla_fp64_comparison_param_2]; - sub.f64 %fd5, %fd1, %fd2; - abs.f64 %fd6, %fd5; - max.f64 %fd7, %fd3, %fd4; - add.f64 %fd8, %fd7, 0d3FF0000000000000; - div.rn.f64 %fd9, %fd6, %fd8; - cvt.f64.f32 %fd10, %f1; - setp.leu.f64 %p13, %fd9, %fd10; - abs.f64 %fd11, %fd9; - setp.le.f64 %p14, %fd11, 0d7FF0000000000000; - and.pred %p15, %p13, %p14; - @%p15 bra LBB9_6; - ld.param.u64 %rd6, [__xla_fp64_comparison_param_4]; - cvta.to.global.u64 %rd1, %rd6; - atom.global.add.u32 %r12, [%rd1], 1; -LBB9_6: - ret; + + ld.param.u64 %rd1, [__xla_fp64_comparison_param_0]; + ld.param.u64 %rd2, [__xla_fp64_comparison_param_1]; + ld.param.f32 %f1, 
[__xla_fp64_comparison_param_2]; + ld.param.u64 %rd4, [__xla_fp64_comparison_param_3]; + ld.param.u64 %rd3, [__xla_fp64_comparison_param_4]; + mov.u32 %r4, %ntid.x; + mov.u32 %r5, %ctaid.x; + mov.u32 %r6, %tid.x; + mad.lo.s32 %r1, %r4, %r5, %r6; + cvt.s64.s32 %rd5, %r1; + setp.ge.u64 %p1, %rd5, %rd4; + @%p1 bra BB2_11; + + cvta.to.global.u64 %rd6, %rd1; + mul.wide.s32 %rd7, %r1, 8; + add.s64 %rd8, %rd6, %rd7; + cvta.to.global.u64 %rd9, %rd2; + add.s64 %rd10, %rd9, %rd7; + ld.global.f64 %fd1, [%rd10]; + ld.global.f64 %fd2, [%rd8]; + abs.f64 %fd3, %fd2; + setp.le.f64 %p2, %fd3, 0d7FF0000000000000; + @%p2 bra BB2_3; + + abs.f64 %fd5, %fd1; + setp.gtu.f64 %p3, %fd5, 0d7FF0000000000000; + @%p3 bra BB2_11; + +BB2_3: + { + .reg .b32 %temp; + mov.b64 {%temp, %r2}, %fd2; + } + and.b32 %r7, %r2, 2147483647; + setp.ne.s32 %p4, %r7, 2146435072; + @%p4 bra BB2_8; + + { + .reg .b32 %temp; + mov.b64 {%r8, %temp}, %fd2; + } + setp.ne.s32 %p5, %r8, 0; + @%p5 bra BB2_8; + + { + .reg .b32 %temp; + mov.b64 {%temp, %r3}, %fd1; + } + and.b32 %r9, %r3, 2147483647; + setp.ne.s32 %p6, %r9, 2146435072; + @%p6 bra BB2_8; + + { + .reg .b32 %temp; + mov.b64 {%r10, %temp}, %fd1; + } + setp.ne.s32 %p7, %r10, 0; + @%p7 bra BB2_8; + + shr.u32 %r11, %r2, 31; + cvt.u16.u32 %rs1, %r11; + shr.u32 %r12, %r3, 31; + cvt.u16.u32 %rs2, %r12; + setp.eq.s16 %p8, %rs1, %rs2; + @%p8 bra BB2_11; + +BB2_8: + sub.f64 %fd6, %fd2, %fd1; + abs.f64 %fd7, %fd6; + abs.f64 %fd8, %fd1; + max.f64 %fd9, %fd3, %fd8; + add.f64 %fd10, %fd9, 0d3FF0000000000000; + div.rn.f64 %fd4, %fd7, %fd10; + cvt.f64.f32 %fd11, %f1; + setp.gt.f64 %p9, %fd4, %fd11; + @%p9 bra BB2_10; + + abs.f64 %fd12, %fd4; + setp.le.f64 %p10, %fd12, 0d7FF0000000000000; + @%p10 bra BB2_11; + +BB2_10: + cvta.to.global.u64 %rd11, %rd3; + atom.global.add.u32 %r13, [%rd11], 1; + +BB2_11: + ret; +} + + // .globl __xla_int8_comparison +.visible .entry __xla_int8_comparison( + .param .u64 __xla_int8_comparison_param_0, + .param .u64 __xla_int8_comparison_param_1, + .param .f32 __xla_int8_comparison_param_2, + .param .u64 __xla_int8_comparison_param_3, + .param .u64 __xla_int8_comparison_param_4 +) +{ + .reg .pred %p<10>; + .reg .f32 %f<42>; + .reg .b32 %r<23>; + .reg .b64 %rd<12>; + + + ld.param.u64 %rd2, [__xla_int8_comparison_param_0]; + ld.param.u64 %rd3, [__xla_int8_comparison_param_1]; + ld.param.f32 %f5, [__xla_int8_comparison_param_2]; + ld.param.u64 %rd4, [__xla_int8_comparison_param_3]; + ld.param.u64 %rd5, [__xla_int8_comparison_param_4]; + cvta.to.global.u64 %rd1, %rd5; + mov.u32 %r4, %ntid.x; + mov.u32 %r5, %ctaid.x; + mov.u32 %r6, %tid.x; + mad.lo.s32 %r1, %r4, %r5, %r6; + cvt.s64.s32 %rd6, %r1; + setp.ge.u64 %p1, %rd6, %rd4; + @%p1 bra BB3_13; + + cvta.to.global.u64 %rd7, %rd2; + mul.wide.s32 %rd8, %r1, 4; + add.s64 %rd9, %rd7, %rd8; + cvta.to.global.u64 %rd10, %rd3; + add.s64 %rd11, %rd10, %rd8; + ld.global.u32 %r2, [%rd9]; + cvt.s32.s8 %r7, %r2; + cvt.rn.f32.s32 %f6, %r7; + ld.global.u32 %r3, [%rd11]; + cvt.s32.s8 %r8, %r3; + cvt.rn.f32.s32 %f7, %r8; + sub.f32 %f8, %f6, %f7; + abs.f32 %f9, %f8; + abs.f32 %f10, %f6; + abs.f32 %f11, %f7; + max.f32 %f12, %f10, %f11; + add.f32 %f13, %f12, 0f3F800000; + div.rn.f32 %f1, %f9, %f13; + setp.gt.f32 %p2, %f1, %f5; + @%p2 bra BB3_3; + + abs.f32 %f14, %f1; + setp.le.f32 %p3, %f14, 0f7F800000; + @%p3 bra BB3_4; + +BB3_3: + atom.global.add.u32 %r9, [%rd1], 1; + +BB3_4: + shr.u32 %r10, %r3, 8; + shr.u32 %r11, %r2, 8; + cvt.s32.s8 %r12, %r11; + cvt.rn.f32.s32 %f15, %r12; + cvt.s32.s8 %r13, %r10; + cvt.rn.f32.s32 %f16, %r13; + sub.f32 %f17, 
%f15, %f16; + abs.f32 %f18, %f17; + abs.f32 %f19, %f15; + abs.f32 %f20, %f16; + max.f32 %f21, %f19, %f20; + add.f32 %f22, %f21, 0f3F800000; + div.rn.f32 %f2, %f18, %f22; + setp.gt.f32 %p4, %f2, %f5; + @%p4 bra BB3_6; + + abs.f32 %f23, %f2; + setp.le.f32 %p5, %f23, 0f7F800000; + @%p5 bra BB3_7; + +BB3_6: + atom.global.add.u32 %r14, [%rd1], 1; + +BB3_7: + shr.u32 %r15, %r3, 16; + shr.u32 %r16, %r2, 16; + cvt.s32.s8 %r17, %r16; + cvt.rn.f32.s32 %f24, %r17; + cvt.s32.s8 %r18, %r15; + cvt.rn.f32.s32 %f25, %r18; + sub.f32 %f26, %f24, %f25; + abs.f32 %f27, %f26; + abs.f32 %f28, %f24; + abs.f32 %f29, %f25; + max.f32 %f30, %f28, %f29; + add.f32 %f31, %f30, 0f3F800000; + div.rn.f32 %f3, %f27, %f31; + setp.gt.f32 %p6, %f3, %f5; + @%p6 bra BB3_9; + + abs.f32 %f32, %f3; + setp.le.f32 %p7, %f32, 0f7F800000; + @%p7 bra BB3_10; + +BB3_9: + atom.global.add.u32 %r19, [%rd1], 1; + +BB3_10: + shr.s32 %r20, %r2, 24; + cvt.rn.f32.s32 %f33, %r20; + shr.s32 %r21, %r3, 24; + cvt.rn.f32.s32 %f34, %r21; + sub.f32 %f35, %f33, %f34; + abs.f32 %f36, %f35; + abs.f32 %f37, %f33; + abs.f32 %f38, %f34; + max.f32 %f39, %f37, %f38; + add.f32 %f40, %f39, 0f3F800000; + div.rn.f32 %f4, %f36, %f40; + setp.gt.f32 %p8, %f4, %f5; + @%p8 bra BB3_12; + + abs.f32 %f41, %f4; + setp.le.f32 %p9, %f41, 0f7F800000; + @%p9 bra BB3_13; + +BB3_12: + atom.global.add.u32 %r22, [%rd1], 1; + +BB3_13: + ret; } )"; @@ -405,11 +619,13 @@ StatusOr HostCompare(se::Stream* stream, se::DeviceMemoryBase lhs, const auto canonicalize = [](ComparisonType a) -> ComparisonType { if (std::is_same::value && a) { - constexpr ComparisonType kMaxFp16Value = 65505.; + constexpr ComparisonType kMaxFp16Value = + std::is_same::value ? 65505. : 0; if (std::isnan(a)) { return a; } - return std::max(-kMaxFp16Value, std::min(a, kMaxFp16Value)); + return std::max(static_cast(-kMaxFp16Value), + static_cast(std::min(a, kMaxFp16Value))); } return a; }; @@ -472,6 +688,9 @@ StatusOr BufferComparator::CompareEqual(se::Stream* stream, case xla::F64: return CompareEqualParameterized( stream, lhs, rhs, shape_, config_, "__xla_fp64_comparison"); + case xla::S8: + return CompareEqualParameterized( + stream, lhs, rhs, shape_, config_, "__xla_int8_comparison"); default: return Unimplemented("Unimplemented element type"); } diff --git a/tensorflow/compiler/xla/service/gpu/buffer_comparator_test.cc b/tensorflow/compiler/xla/service/gpu/buffer_comparator_test.cc index 139e4204304..0f547111096 100644 --- a/tensorflow/compiler/xla/service/gpu/buffer_comparator_test.cc +++ b/tensorflow/compiler/xla/service/gpu/buffer_comparator_test.cc @@ -178,6 +178,13 @@ TEST_F(BufferComparatorTest, TestNumbers) { EXPECT_TRUE(CompareEqualFloatBuffers({0.9}, {1})); EXPECT_TRUE(CompareEqualFloatBuffers({9}, {10})); EXPECT_TRUE(CompareEqualFloatBuffers({10}, {9})); + + EXPECT_TRUE(CompareEqualFloatBuffers({200}, {201})); + EXPECT_FALSE(CompareEqualFloatBuffers({0}, {10})); + EXPECT_TRUE(CompareEqualFloatBuffers({9}, {10})); + EXPECT_TRUE(CompareEqualFloatBuffers({90}, {100})); + EXPECT_TRUE(CompareEqualFloatBuffers({100}, {90})); + EXPECT_FALSE(CompareEqualFloatBuffers({-128}, {127})); } TEST_F(BufferComparatorTest, TestMultiple) { @@ -231,6 +238,23 @@ TEST_F(BufferComparatorTest, TestMultiple) { rhs[i] = 0; } } + + { + EXPECT_TRUE(CompareEqualFloatBuffers({20, 30, 40, 50, 60}, + {21, 31, 41, 51, 61})); + std::vector lhs(200); + std::vector rhs(200); + for (int i = 0; i < 200; i++) { + EXPECT_TRUE(CompareEqualFloatBuffers(lhs, rhs)) + << "should be the same at index " << i; + lhs[i] = 3; + rhs[i] = 5; + 
EXPECT_FALSE(CompareEqualFloatBuffers(lhs, rhs)) + << "should be the different at index " << i; + lhs[i] = 0; + rhs[i] = 0; + } + } } } // namespace diff --git a/tensorflow/compiler/xla/service/gpu/cublas_gemm_pad_for_tensor_cores.cc b/tensorflow/compiler/xla/service/gpu/cublas_gemm_pad_for_tensor_cores.cc new file mode 100644 index 00000000000..f2885e243e2 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/cublas_gemm_pad_for_tensor_cores.cc @@ -0,0 +1,133 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/gpu/cublas_gemm_pad_for_tensor_cores.h" + +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" +#include "tensorflow/compiler/xla/service/hlo_casting_utils.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/util.h" +#include "tensorflow/compiler/xla/window_util.h" +#include "tensorflow/core/platform/types.h" + +namespace xla { +namespace gpu { + +static StatusOr PadForTensorCores(HloDotInstruction* dot) { + auto* lhs = dot->mutable_operand(0); + auto* rhs = dot->mutable_operand(1); + + Shape lshape = lhs->shape(); + Shape rshape = rhs->shape(); + Shape result_shape = dot->shape(); + + if (lshape.element_type() != PrimitiveType::F16 || + rshape.element_type() != PrimitiveType::F16) { + return false; + } + + auto pad_dim = [](Shape& s, int64 dim) { + s.set_dimensions(dim, RoundUpToNearest(s.dimensions(dim), 8)); + }; + + auto pad_matrix_dims = [&pad_dim](Shape s) { + pad_dim(s, 0); + pad_dim(s, 1); + return s; + }; + + Shape new_lshape = pad_matrix_dims(lshape); + Shape new_rshape = pad_matrix_dims(rshape); + Shape new_result_shape = pad_matrix_dims(result_shape); + + if (new_lshape == lshape && new_rshape == rshape) { + return false; + } + + VLOG(3) << "old shape: " << lshape << " " << rshape << " " << result_shape; + VLOG(3) << "new shape: " << new_lshape << " " << new_rshape << " " + << new_result_shape; + + auto create_padding_config = [](Shape& shape, Shape& new_shape) { + PaddingConfig padding_config; + for (int i = 0; i < shape.rank(); ++i) { + auto dimension = padding_config.add_dimensions(); + dimension->set_edge_padding_high(new_shape.dimensions()[i] - + shape.dimensions()[i]); + dimension->set_edge_padding_low(0); + dimension->set_interior_padding(0); + } + return padding_config; + }; + + auto l_padding_config = create_padding_config(lshape, new_lshape); + auto r_padding_config = create_padding_config(rshape, new_rshape); + + HloComputation* parent = dot->parent(); + + HloInstruction* zero_float = parent->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0((half)0.0))); + zero_float->set_metadata(dot->metadata()); + + HloInstruction* lpad = parent->AddInstruction( + HloInstruction::CreatePad(new_lshape, lhs, zero_float, l_padding_config)); + lpad->set_metadata(dot->metadata()); 
+ + HloInstruction* rpad = parent->AddInstruction( + HloInstruction::CreatePad(new_rshape, rhs, zero_float, r_padding_config)); + rpad->set_metadata(dot->metadata()); + + HloInstruction* new_dot = parent->AddInstruction( + dot->CloneWithNewOperands(new_result_shape, {lpad, rpad})); + + HloInstruction* slice = parent->AddInstruction(HloInstruction::CreateSlice( + result_shape, new_dot, {0, 0}, result_shape.dimensions(), {1, 1})); + slice->set_metadata(dot->metadata()); + + bool is_root = dot->user_count() == 0; + + TF_CHECK_OK(parent->ReplaceInstruction(dot, slice)); + + if (is_root) { + parent->set_root_instruction(slice); + } + + return true; +} + +static std::vector GetRelevantDots(HloComputation* comp) { + std::vector convs; + for (HloInstruction* instr : comp->instructions()) { + if (IsMatrixMultiplication(*instr)) { + convs.push_back(Cast(instr)); + } + } + return convs; +} + +StatusOr CublasGemmPadForTensorCores::Run(HloModule* module) { + bool changed = false; + for (HloComputation* comp : module->MakeNonfusionComputations()) { + for (HloDotInstruction* dot : GetRelevantDots(comp)) { + TF_ASSIGN_OR_RETURN(bool result, PadForTensorCores(dot)); + changed |= result; + } + } + return changed; +} + +} // namespace gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/cublas_gemm_pad_for_tensor_cores.h b/tensorflow/compiler/xla/service/gpu/cublas_gemm_pad_for_tensor_cores.h new file mode 100644 index 00000000000..339e7e3dce6 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/cublas_gemm_pad_for_tensor_cores.h @@ -0,0 +1,43 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUBLAS_GEMM_PAD_FOR_TENSOR_CORES_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUBLAS_GEMM_PAD_FOR_TENSOR_CORES_H_ + +#include "tensorflow/compiler/xla/service/hlo_pass_interface.h" + +namespace xla { +namespace gpu { + +// Adds padding to dot operations to make them run faster on GPUs with +// tensor cores (https://devblogs.nvidia.com/programming-tensor-cores-cuda-9/). +// +// f16 dots are padded to have input/output shapes with dimensions that +// are multiples of 8, so that we can use tensor cores. +// +// Don't run this pass on GPUs without tensor cores -- it will make them slower! 
+class CublasGemmPadForTensorCores : public HloModulePass { + public: + absl::string_view name() const override { + return "cublas-gemm-pad-for-speed"; + } + + StatusOr Run(HloModule* module) override; +}; + +} // namespace gpu +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUBLAS_GEMM_PAD_FOR_TENSOR_CORES_H_ diff --git a/tensorflow/compiler/xla/service/gpu/cublas_gemm_pad_for_tensor_cores_test.cc b/tensorflow/compiler/xla/service/gpu/cublas_gemm_pad_for_tensor_cores_test.cc new file mode 100644 index 00000000000..df1ba164bef --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/cublas_gemm_pad_for_tensor_cores_test.cc @@ -0,0 +1,223 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/gpu/cublas_gemm_pad_for_tensor_cores.h" + +#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" +#include "tensorflow/compiler/xla/service/hlo_matchers.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/compiler/xla/tests/test_utils.h" +#include "tensorflow/compiler/xla/util.h" + +namespace op = xla::testing::opcode_matchers; + +namespace xla { +namespace gpu { +namespace { + +class CublasGemmPadForTensorCoresTest : public HloTestBase {}; + +TEST_F(CublasGemmPadForTensorCoresTest, OneDotRootComputation) { + auto module = ParseAndReturnVerifiedModule(R"( + HloModule TestModule + + ENTRY TestComputation { + %param1 = f16[2048,1024] parameter(0) + %param2 = f16[1024,33708] parameter(1) + ROOT %dot.2309 = f16[2048,33708]{1,0} dot(f16[2048,1024]{1,0} %param1, + f16[1024,33708]{0,1} %param2), + lhs_contracting_dims={1}, rhs_contracting_dims={0} + })") + .ValueOrDie(); + + EXPECT_TRUE(CublasGemmPadForTensorCores().Run(module.get()).ValueOrDie()); + SCOPED_TRACE(module->ToString()); + + auto* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, + AllOf( + op::Shape("f16[2048, 33708]"), + op::Slice(AllOf( + op::Shape("f16[2048, 33712]"), + op::Dot(AllOf(op::Shape("f16[2048, 1024]"), + op::Pad(AllOf(op::Shape("f16[2048, 1024]"), + op::Parameter()), + AllOf(op::Shape("f16[]"), op::Constant()))), + AllOf(op::Shape("f16[1024, 33712]"), + op::Pad(AllOf(op::Shape("f16[1024, 33708]"), + op::Parameter()), + AllOf(op::Shape("f16[]"), op::Constant()))), + /*lhs_contracting_dim=*/1, + /*rhs_contracting_dim=*/0))))); +} + +TEST_F(CublasGemmPadForTensorCoresTest, TwoDotsComputation) { + auto module = ParseAndReturnVerifiedModule(R"( + HloModule TestModule + + ENTRY TestComputation { + %param1 = f16[2048,1024] parameter(0) + %param2 = f16[1024,33708] parameter(1) + %param3 = f16[33708, 1] parameter(2) + %dot1 = f16[2048,33708]{1,0} dot(f16[2048,1024]{1,0} %param1, + f16[1024,33708]{0,1} %param2), + lhs_contracting_dims={1}, rhs_contracting_dims={0} + ROOT %dot2 = 
f16[2048, 1]{1,0} dot(f16[2048,33708]{1,0} %dot1, + f16[33708, 1]{0,1} %param3), + lhs_contracting_dims={1}, rhs_contracting_dims={0} + })") + .ValueOrDie(); + + EXPECT_TRUE(CublasGemmPadForTensorCores().Run(module.get()).ValueOrDie()); + SCOPED_TRACE(module->ToString()); + + auto* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, + AllOf( + op::Shape("f16[2048, 1]"), + op::Slice(AllOf( + op::Shape("f16[2048, 8]"), + op::Dot( + AllOf( + op::Shape("f16[2048, 33712]"), + AllOf( + op::Shape("f16[2048, 33712]"), + AllOf( + op::Shape("f16[2048, 33712]"), + op::Pad( + AllOf(op::Shape("f16[2048, 33708]"), + op::Slice(AllOf( + op::Shape("f16[2048, 33712]"), + op::Dot( + AllOf(op::Shape( + "f16[2048, 1024]"), + op::Pad()), + AllOf(op::Shape( + "f16[1024, 33712]"), + op::Pad()), + 1, 0)))), + AllOf(op::Shape("f16[]"), op::Constant()))))), + AllOf(op::Shape("f16[33712, 8]"), + AllOf(op::Shape("f16[33712, 8]"), + op::Pad( + AllOf(op::Shape("f16[33708, 1]"), + op::Parameter()), + AllOf(op::Shape("f16[]"), op::Constant())))), + /*lhs_contracting_dim=*/1, /*rhs_contracting_dim=*/0))))); + + auto* dot2 = root->operand(0)->operand(0)->operand(0)->operand(0); + EXPECT_THAT( + dot2, + AllOf(op::Dot( + AllOf(op::Shape("f16[2048, 1024]"), + op::Pad(AllOf(op::Shape("f16[2048, 1024]"), op::Parameter()), + AllOf(op::Shape("f16[]"), op::Constant()))), + AllOf(op::Shape("f16[1024, 33712]"), + op::Pad(AllOf(op::Shape("f16[1024, 33708]"), op::Parameter()), + AllOf(op::Shape("f16[]"), op::Constant()))), + /*lhs_contracting_dim=*/1, /*rhs_contracting_dim=*/0))); +} + +TEST_F(CublasGemmPadForTensorCoresTest, NoDotComputation) { + auto module = ParseAndReturnVerifiedModule(R"( + HloModule TestModule + + ENTRY TestComputation { + %x = f32[] parameter(0) + %y = f32[] parameter(1) + ROOT %maximum = f32[] maximum(f32[] %x, f32[] %y) + })") + .ValueOrDie(); + + EXPECT_FALSE(CublasGemmPadForTensorCores().Run(module.get()).ValueOrDie()); +} + +TEST_F(CublasGemmPadForTensorCoresTest, F32DotComputation) { + auto module = ParseAndReturnVerifiedModule(R"( + HloModule TestModule + + ENTRY TestComputation { + %param1 = f32[2048,1024] parameter(0) + %param2 = f32[1024,33708] parameter(1) + ROOT %dot.2309 = f32[2048,33708]{1,0} dot(f32[2048,1024]{1,0} %param1, + f32[1024,33708]{0,1} %param2), + lhs_contracting_dims={1}, rhs_contracting_dims={0}})") + .ValueOrDie(); + + EXPECT_FALSE(CublasGemmPadForTensorCores().Run(module.get()).ValueOrDie()); +} + +TEST_F(CublasGemmPadForTensorCoresTest, F64DotComputation) { + auto module = ParseAndReturnVerifiedModule(R"( + HloModule TestModule + + ENTRY TestComputation { + %param1 = f64[2048,1024] parameter(0) + %param2 = f64[1024,33708] parameter(1) + ROOT %dot.2309 = f64[2048,33708]{1,0} dot(f64[2048,1024]{1,0} %param1, + f64[1024,33708]{0,1} %param2), + lhs_contracting_dims={1}, rhs_contracting_dims={0}})") + .ValueOrDie(); + + EXPECT_FALSE(CublasGemmPadForTensorCores().Run(module.get()).ValueOrDie()); +} + +TEST_F(CublasGemmPadForTensorCoresTest, MultiplesOf8DotComputation) { + auto module = ParseAndReturnVerifiedModule(R"( + HloModule TestModule + + ENTRY TestComputation { + %param1 = f16[2048,1024] parameter(0) + %param2 = f16[1024,33712] parameter(1) + ROOT %dot.2309 = f16[2048,33712]{1,0} dot(f16[2048,1024]{1,0} %param1, + f16[1024,33712]{0,1} %param2), + lhs_contracting_dims={1}, rhs_contracting_dims={0}})") + .ValueOrDie(); + + EXPECT_FALSE(CublasGemmPadForTensorCores().Run(module.get()).ValueOrDie()); +} + +TEST_F(CublasGemmPadForTensorCoresTest, 
CheckSavingMetadata) { + auto module = ParseAndReturnVerifiedModule(R"( + HloModule TestModule + + ENTRY TestComputation { + %param1 = f16[2048,1024] parameter(0) + %param2 = f16[1024,33708] parameter(1) + ROOT %dot.2309 = f16[2048,33708]{1,0} dot(f16[2048,1024]{1,0} %param1, + f16[1024,33708]{0,1} %param2), + lhs_contracting_dims={1}, rhs_contracting_dims={0}, + metadata={op_type="MatMul" op_name="transformer_v2/Transformer/decode/embedding_shared_weights_1/presoftmax_linear/MatMul"} + })") + .ValueOrDie(); + + SCOPED_TRACE(module->ToString()); + + EXPECT_TRUE(CublasGemmPadForTensorCores().Run(module.get()).ValueOrDie()); + auto metadata = module->entry_computation()->root_instruction()->metadata(); + EXPECT_EQ("MatMul", metadata.op_type()); + EXPECT_EQ( + "transformer_v2/Transformer/decode/embedding_shared_weights_1/" + "presoftmax_linear/MatMul", + metadata.op_name()); +} + +} // anonymous namespace +} // namespace gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc index ce17e0253c9..7a7ab6ba05f 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc +++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc @@ -26,6 +26,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/gpu/buffer_comparator.h" #include "tensorflow/compiler/xla/service/gpu/convolution_thunk.h" #include "tensorflow/compiler/xla/service/gpu/gpu_autotuning.pb.h" +#include "tensorflow/compiler/xla/service/gpu/hlo_algorithm_blacklist.h" #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" #include "tensorflow/compiler/xla/service/gpu/stream_executor_util.h" #include "tensorflow/compiler/xla/service/hlo_casting_utils.h" @@ -142,10 +143,8 @@ StatusOr CheckRedzones(const se::cuda::RedzoneAllocator& allocator, XLA_SCOPED_LOGGING_TIMER_LEVEL("CudnnConvAlgorithmPicker checking redzones", 2); using RedzoneCheckStatus = se::cuda::RedzoneAllocator::RedzoneCheckStatus; - TF_ASSIGN_OR_RETURN(RedzoneCheckStatus redzone_check, - allocator.CheckRedzones(stream)); - + allocator.CheckRedzones()); if (redzone_check.ok()) { return true; } @@ -235,7 +234,6 @@ StatusOr CudnnConvAlgorithmPicker::PickBestAlgorithm( return result_or; } - StatusOr CudnnConvAlgorithmPicker::PickBestAlgorithmNoCache( const HloCustomCallInstruction* instr) { XLA_SCOPED_LOGGING_TIMER( @@ -250,11 +248,6 @@ StatusOr CudnnConvAlgorithmPicker::PickBestAlgorithmNoCache( return InternalError("Failed to synchronize GPU for autotuning."); } - // Create a stream for us to do our work on. - se::Stream stream{stream_exec_}; - stream.Init(); - const auto device_ordinal = stream_exec_->device_ordinal(); - // allocator either points to this->allocator_ or, if that's null, to a // se::StreamExecutorMemoryAllocator for stream_exec_. 
se::DeviceMemoryAllocator* allocator; @@ -266,11 +259,21 @@ StatusOr CudnnConvAlgorithmPicker::PickBestAlgorithmNoCache( allocator = &*se_allocator; } + absl::optional stream_opt; + se::Stream* stream = [&] { + if (allocator->GetStream()) { + return allocator->GetStream(); + } + stream_opt.emplace(stream_exec_); + stream_opt->Init(); + return &stream_opt.value(); + }(); + int64 rng_state = 0; - const auto initialize_buffer = [&stream, &result_shape, + const auto initialize_buffer = [stream, &result_shape, &rng_state](DeviceMemoryBase buffer) { - InitializeFloatBuffer(&stream, result_shape.element_type(), &rng_state, + InitializeFloatBuffer(stream, result_shape.element_type(), &rng_state, buffer); }; @@ -278,18 +281,18 @@ StatusOr CudnnConvAlgorithmPicker::PickBestAlgorithmNoCache( // Allocate space for the input, filter, and output of the convolution. se::cuda::RedzoneAllocator input_output_allocator( - device_ordinal, allocator, PtxOptsFromConfig(hlo_module_config)); + stream, allocator, PtxOptsFromConfig(hlo_module_config)); std::vector operand_buffers; for (const auto* operand : instr->operands()) { TF_ASSIGN_OR_RETURN(auto buffer, input_output_allocator.AllocateBytes( - &stream, ShapeUtil::ByteSizeOf(operand->shape()))); + ShapeUtil::ByteSizeOf(operand->shape()))); initialize_buffer(buffer); operand_buffers.push_back(buffer); } TF_ASSIGN_OR_RETURN(auto result_buffer, input_output_allocator.AllocateBytes( - &stream, ShapeUtil::ByteSizeOf(result_shape))); + ShapeUtil::ByteSizeOf(result_shape))); initialize_buffer(result_buffer); TF_ASSIGN_OR_RETURN(auto backend_config, @@ -311,14 +314,33 @@ StatusOr CudnnConvAlgorithmPicker::PickBestAlgorithmNoCache( const bool crash_on_checking_failure = debug_options.xla_gpu_crash_on_verification_failures(); + const auto canonical_hlo = + std::get<1>(AutotuneCacheKeyfromInstruction(instr, stream_exec_)); + + string blas_version; + if (auto* blas = stream_exec_->AsBlas()) { + (void)blas->GetVersion(&blas_version); + } + + absl::Span blacklisted_algos = + GetBlacklistedConvAlgorithms(GetComputeCapability(stream_exec_), + GetCudnnVersion(stream_exec_), blas_version, + canonical_hlo); + for (const AlgorithmDesc& alg : GetAlgorithms(kind, stream_exec_)) { XLA_SCOPED_LOGGING_TIMER_LEVEL( absl::StrCat("CudnnConvAlgorithmPicker::PickBestAlgorithm algo ", AlgorithmToString(alg)), 2); + if (absl::c_linear_search(blacklisted_algos, alg)) { + LOG(INFO) << "Omitted potentially buggy algorithm " + << AlgorithmToString(alg) << " for conv " << instr->ToString(); + continue; + } + se::cuda::RedzoneAllocator scratch_allocator( - device_ordinal, allocator, PtxOptsFromConfig(hlo_module_config)); + stream, allocator, PtxOptsFromConfig(hlo_module_config)); se::dnn::ProfileResult profile_result; VLOG(3) << "Trying algorithm " << AlgorithmToString(alg) << " for " << instr->ToString(); @@ -329,7 +351,7 @@ StatusOr CudnnConvAlgorithmPicker::PickBestAlgorithmNoCache( options.algo_override = alg; Status launch_status = RunCudnnConv(instr, absl::MakeSpan(operand_buffers), result_buffer, - &scratch_allocator, &stream, options); + &scratch_allocator, stream, options); if (!launch_status.ok()) { continue; @@ -352,22 +374,39 @@ StatusOr CudnnConvAlgorithmPicker::PickBestAlgorithmNoCache( // Check for writes to redzones. 
TF_ASSIGN_OR_RETURN(bool input_output_allocator_redzone_clear, - CheckRedzones(input_output_allocator, &stream, + CheckRedzones(input_output_allocator, stream, "input/output", instr, &result)); TF_ASSIGN_OR_RETURN( bool scratch_allocator_redzone_clear, - CheckRedzones(scratch_allocator, &stream, "scratch", instr, &result)); + CheckRedzones(scratch_allocator, stream, "scratch", instr, &result)); if (!input_output_allocator_redzone_clear || !scratch_allocator_redzone_clear) { + AlgorithmBlacklist proto; + auto entry = proto.add_entries(); + entry->set_hlo(canonical_hlo); + *entry->mutable_cc() = GetComputeCapability(stream_exec_); + *entry->mutable_cudnn_version() = GetCudnnVersion(stream_exec_); + entry->set_blas_version(blas_version); + auto algo = entry->add_algos(); + algo->set_id(alg.algo_id()); + algo->set_tensor_ops(alg.tensor_ops_enabled()); + + LOG(ERROR) + << "To blacklist this algorithm for this convolution, " + "copy-paste the following " + "proto to the blacklist file pointed by XLA_FLAGS " + "--xla_gpu_algorithm_blacklist_path=" + << GetDebugOptionsFromFlags().xla_gpu_algorithm_blacklist_path() + << " : " << proto.ShortDebugString(); continue; } if (comparator.has_value()) { XLA_SCOPED_LOGGING_TIMER_LEVEL("BufferComparator::CompareEqual", 2); StatusOr compare_result = comparator->CompareEqual( - &stream, reference_result_buffer, result_buffer); + stream, reference_result_buffer, result_buffer); if (!compare_result.ok()) { LOG(ERROR) << "Unable to compare " << AlgorithmToString(first_algorithm) << " against " << AlgorithmToString(alg) << " for " @@ -385,7 +424,7 @@ StatusOr CudnnConvAlgorithmPicker::PickBestAlgorithmNoCache( << instr->ToString() << " for " << AlgorithmToString(first_algorithm) << " vs " << AlgorithmToString(alg); - PrintPlatformInfo(&stream); + PrintPlatformInfo(stream); VLOG(1) << "Full module on failure: \n" << instr->GetModule()->ToString(); auto* fail = result.mutable_failure(); @@ -402,9 +441,9 @@ StatusOr CudnnConvAlgorithmPicker::PickBestAlgorithmNoCache( comparator.emplace(result_shape, hlo_module_config); TF_ASSIGN_OR_RETURN( reference_result_buffer, - input_output_allocator.AllocateBytes(&stream, result_buffer.size())); - stream.ThenMemcpy(&reference_result_buffer, result_buffer, - result_buffer.size()); + input_output_allocator.AllocateBytes(result_buffer.size())); + stream->ThenMemcpy(&reference_result_buffer, result_buffer, + result_buffer.size()); first_algorithm = alg; } } @@ -431,6 +470,7 @@ StatusOr CudnnConvAlgorithmPicker::PickBestAlgorithmNoCache( *log.mutable_cudnn_version() = GetCudnnVersion(stream_exec_); log.set_device_pci_bus_id( stream_exec_->GetDeviceDescription().pci_bus_id()); + log.set_blas_version(blas_version); VLOG(1) << "Autotuning result: " << log.ShortDebugString(); // If we crash on checking failure, we are in a testing/benchmark mode, thus // omitting logging through the logger. diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc old mode 100644 new mode 100755 index e81850db69e..fc44a9947b4 --- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc +++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc @@ -89,13 +89,11 @@ bool CanImplementAsCudnnForwardConv(HloInstruction* conv) { // Try to match a backward filter pattern that contains "conv". // Precondition: "conv" is a kConvolution. 
-std::tuple MatchBackwardFilter( - HloInstruction* conv) { +std::tuple +MatchBackwardFilter(HloInstruction* conv) { const auto no_match_result = - std::make_tuple(false, Window(), ConvolutionDimensionNumbers()); - if (conv->feature_group_count() > 1) { - return no_match_result; - } + std::make_tuple(false, Window(), ConvolutionDimensionNumbers(), nullptr); + // Step 1: match the instruction pattern without considering the paddings and // dimension numbers just yet. We may need some generic pattern matcher // similar to third_party/llvm/llvm/include/llvm/IR/PatternMatch.h @@ -155,6 +153,15 @@ std::tuple MatchBackwardFilter( "to fold it to a backward filter convolution."; return no_match_result; } + auto rhs_in = + conv->mutable_operand(1)->shape().dimensions(kernel_input_feature_dim); + if (conv->feature_group_count() > 1 && rhs_in == 1 && + input_batch_dim == output_batch_dim) { + VLOG(1) << conv->ToString() + << " is a depthwise forward convolution. No need to fold to " + "backward filter."; + return no_match_result; + } // Step 3: fuse the matched HLOs into a backward convolution instruction. // @@ -248,7 +255,62 @@ std::tuple MatchBackwardFilter( backward_conv_dnums.add_kernel_spatial_dimensions(output_spatial_dims[i]); } - return std::make_tuple(true, backward_conv_window, backward_conv_dnums); + HloInstruction* lhs = conv->mutable_operand(0); + if (conv->feature_group_count() == 1) { + return std::make_tuple(true, backward_conv_window, backward_conv_dnums, + lhs); + } + + int64 input_batch_dimension = backward_conv_dnums.input_batch_dimension(); + int64 input_feature_dimension = backward_conv_dnums.input_feature_dimension(); + + int64 input_batch = lhs->shape().dimensions(input_batch_dimension); + int64 input_feature = lhs->shape().dimensions(input_feature_dimension); + + // Reshape batch_dim G*N -> [G,N] + std::vector reshape_dims = lhs->shape().dimensions(); + auto num_groups = conv->feature_group_count(); + CHECK_EQ(input_batch % num_groups, 0) + << "Input batch should be an exact multiple of feature group count"; + reshape_dims[input_batch_dimension] = + reshape_dims[input_batch_dimension] / num_groups; + reshape_dims.insert(reshape_dims.begin() + input_batch_dimension, num_groups); + + HloComputation* c = conv->parent(); + HloInstruction* lhs_reshape_1 = + c->AddInstruction(HloInstruction::CreateReshape( + ShapeUtil::MakeShape(lhs->shape().element_type(), reshape_dims), + lhs)); + + // Transpose G to the axis before C/G, For eg: [G, N, C/G, H, W] -> [N, G, + // C/G, H, W] + std::vector transpose_dims(lhs_reshape_1->shape().dimensions_size()); + std::iota(transpose_dims.begin(), transpose_dims.end(), 0); + transpose_dims.erase(transpose_dims.begin() + input_batch_dimension); + transpose_dims.insert(transpose_dims.begin() + input_feature_dimension, + input_batch_dimension); + std::vector transpose_reshape_dims = + lhs_reshape_1->shape().dimensions(); + transpose_reshape_dims.erase(transpose_reshape_dims.begin() + + input_batch_dimension); + transpose_reshape_dims.insert( + transpose_reshape_dims.begin() + input_feature_dimension, num_groups); + + HloInstruction* lhs_transpose = + c->AddInstruction(HloInstruction::CreateTranspose( + ShapeUtil::MakeShape(lhs_reshape_1->shape().element_type(), + transpose_reshape_dims), + lhs_reshape_1, transpose_dims)); + + // Merge [G,C/G] -> [C] + Shape new_shape = lhs_transpose->shape(); + new_shape.DeleteDimension(input_feature_dimension); + new_shape.set_dimensions(input_feature_dimension, + input_feature * conv->feature_group_count()); + 
HloInstruction* lhs_reshape_2 = c->AddInstruction( + HloInstruction::CreateReshape(new_shape, lhs_transpose)); + return std::make_tuple(true, backward_conv_window, backward_conv_dnums, + lhs_reshape_2); } // Try to match a backward input pattern that contains "conv". @@ -258,9 +320,11 @@ MatchBackwardInput(HloInstruction* conv) { const auto no_match_result = std::make_tuple(false, Window(), ConvolutionDimensionNumbers(), nullptr); - // TODO(b/119479517): Theoretically cuDNN supports grouped convolutions also - // for the backward input convolution, but at least for now with version 7.1.4 - // it is slower. This needs to be re-evaluated for future cuDNN versions. + // TODO: Theoretically cuDNN supports grouped convolutions also + // for the backward input convolution, but based on the cudnn's current state + // there is not much performance improvement when using the + // cudnn backward input API for grouped conv. + // This needs to be re-evaluated for future cuDNN versions. // Note that we already have the necessary code down below, the only thing to // enable it is to remove the following early return. if (conv->feature_group_count() > 1) { @@ -272,6 +336,22 @@ MatchBackwardInput(HloInstruction* conv) { HloInstruction* reverse_filter = conv->mutable_operand(1); ConvolutionDimensionNumbers dnums = conv->convolution_dimension_numbers(); + // Match BackwardInput for a depthwise convolution and thunk it to forward + // convolution Output feature dimension and input feature dimension has been + // swapped in the bridge. Hence to get the actual input features we need to + // query the output feature dimension + auto kernel_out_feature_dim = dnums.kernel_output_feature_dimension(); + auto kernel_out_features = + reverse_filter->shape().dimensions(kernel_out_feature_dim); + + // For a depthwise convolution, the input features must be equal to the + // feature_group_count. We can leverage this property to match a depthwise + // convolution and thunk it to forward conv + if (conv->feature_group_count() > 1 && + kernel_out_features == conv->feature_group_count()) { + return no_match_result; + } + // We pattern-match to a backwards input conv if: // // - all spatial dims of the filter are reversed @@ -333,9 +413,8 @@ MatchBackwardInput(HloInstruction* conv) { Window new_window = old_window; for (size_t i = 0; i < input_spatial_dims.size(); ++i) { // Restore backward convolution's padding config from the matched pattern. - // See the comment in tensorflow/core/kernels/conv_grad_tuple_ops.cc - // for how we convert backward input convolution to a variant of forward - // convolution. + // See the comment in tensorflow/core/kernels/conv_grad_ops.h for how we + // convert backward input convolution to a variant of forward convolution. // // The stride of the backward convolution // = the base dilation factor of the forward convolution @@ -429,11 +508,23 @@ MatchBackwardInput(HloInstruction* conv) { } // OK, it's a match! Switch the input feature dimension with the output - // feature dimension. This is the way cuDNN expects it to be. + // feature dimension. Also switch the output with the input. This is the way + // cuDNN expects it to be. 
+ auto conv_dnums = conv->convolution_dimension_numbers(); dnums.set_kernel_input_feature_dimension( - conv->convolution_dimension_numbers().kernel_output_feature_dimension()); + conv_dnums.kernel_output_feature_dimension()); dnums.set_kernel_output_feature_dimension( - conv->convolution_dimension_numbers().kernel_input_feature_dimension()); + conv_dnums.kernel_input_feature_dimension()); + for (int i = 0; i < input_spatial_dims.size(); ++i) { + dnums.set_input_spatial_dimensions(i, + conv_dnums.output_spatial_dimensions(i)); + dnums.set_output_spatial_dimensions(i, + conv_dnums.input_spatial_dimensions(i)); + } + dnums.set_input_feature_dimension(conv_dnums.output_feature_dimension()); + dnums.set_input_batch_dimension(conv_dnums.output_batch_dimension()); + dnums.set_output_feature_dimension(conv_dnums.input_feature_dimension()); + dnums.set_output_batch_dimension(conv_dnums.input_batch_dimension()); // If we matched against a constant, we need to add a reverse op that can be // subsumed by the cuDNN call. algebraic-simplifier will later remove any @@ -469,7 +560,6 @@ MatchBackwardInput(HloInstruction* conv) { // dimensions, we need to divide the new 'kernel_input_feature_dimension' by // 'feature_group_count' and multiply the new // 'kernel_output_feature_dimension' by 'feature_group_count'. - Shape new_shape = rhs->shape(); int64 input_feature_dimension = dnums.kernel_input_feature_dimension(); int64 output_feature_dimension = dnums.kernel_output_feature_dimension(); @@ -477,13 +567,47 @@ MatchBackwardInput(HloInstruction* conv) { // feature dimensions, and we are guaranteed that the spatial dimensions are // adjacent. CHECK_EQ(std::abs(input_feature_dimension - output_feature_dimension), 1LL); - int64 input_features = new_shape.dimensions(input_feature_dimension); - int64 output_features = new_shape.dimensions(output_feature_dimension); - new_shape.set_dimensions(input_feature_dimension, - input_features / conv->feature_group_count()); - new_shape.set_dimensions(output_feature_dimension, - output_features * conv->feature_group_count()); + int64 input_features = rhs->shape().dimensions(input_feature_dimension); + int64 output_features = rhs->shape().dimensions(output_feature_dimension); + + // Reshape [H, W, ..., in_depth, out_depth / G] -> [H, W, ..., G, in_depth/G, + // out_depth / G] + std::vector reshape_dims = rhs->shape().dimensions(); + auto num_groups = conv->feature_group_count(); + CHECK_EQ(input_features % num_groups, 0) + << "Input feature count should be an exact multiple of feature group " + "count"; + reshape_dims[input_feature_dimension] = + reshape_dims[input_feature_dimension] / num_groups; + reshape_dims.insert(reshape_dims.begin() + input_feature_dimension, + num_groups); + HloComputation* c = conv->parent(); + rhs = c->AddInstruction(HloInstruction::CreateReshape( + ShapeUtil::MakeShape(rhs->shape().element_type(), reshape_dims), rhs)); + + // Transpose [H, W, ..., G, in_depth/G, out_depth / G] -> [H, W, ..., + // in_depth/G, G, out_depth / G] + std::vector transpose_dims(rhs->shape().dimensions_size()); + std::iota(transpose_dims.begin(), transpose_dims.end(), 0); + transpose_dims.erase(transpose_dims.begin() + input_feature_dimension); + transpose_dims.insert(transpose_dims.begin() + output_feature_dimension, + input_feature_dimension); + std::vector transpose_reshape_dims = rhs->shape().dimensions(); + transpose_reshape_dims.erase(transpose_reshape_dims.begin() + + input_feature_dimension); + transpose_reshape_dims.insert( + transpose_reshape_dims.begin() + 
output_feature_dimension, num_groups); + rhs = c->AddInstruction(HloInstruction::CreateTranspose( + ShapeUtil::MakeShape(rhs->shape().element_type(), transpose_reshape_dims), + rhs, transpose_dims)); + + // Reshape [H, W, ..., in_depth/G, G, out_depth / G] -> [H, W, ..., + // in_depth/G, out_depth] + Shape new_shape = rhs->shape(); + new_shape.DeleteDimension(output_feature_dimension); + new_shape.set_dimensions(output_feature_dimension, + output_features * num_groups); rhs = c->AddInstruction(HloInstruction::CreateReshape(new_shape, rhs)); return std::make_tuple(true, new_window, dnums, rhs); } @@ -503,14 +627,7 @@ StatusOr RunOnInstruction(HloInstruction* conv) { Window window; ConvolutionDimensionNumbers dnums; HloInstruction* rhs; - - std::tie(match, window, dnums) = MatchBackwardFilter(conv); - if (match) { - return CreateCudnnConv(kCudnnConvBackwardFilterCallTarget, conv->shape(), - conv->mutable_operand(0), conv->mutable_operand(1), - window, dnums, conv->feature_group_count(), - conv->metadata()); - } + HloInstruction* lhs; std::tie(match, window, dnums, rhs) = MatchBackwardInput(conv); if (match) { @@ -519,6 +636,13 @@ StatusOr RunOnInstruction(HloInstruction* conv) { conv->feature_group_count(), conv->metadata()); } + std::tie(match, window, dnums, lhs) = MatchBackwardFilter(conv); + if (match) { + return CreateCudnnConv(kCudnnConvBackwardFilterCallTarget, conv->shape(), + lhs, conv->mutable_operand(1), window, dnums, + conv->feature_group_count(), conv->metadata()); + } + // If all else fails, try a forward convolution. if (CanImplementAsCudnnForwardConv(conv)) { return CreateCudnnConv(kCudnnConvForwardCallTarget, conv->shape(), diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter_test.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter_test.cc index dbcdc2b075b..362d8d13aab 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter_test.cc +++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter_test.cc @@ -135,6 +135,86 @@ TEST_F(CudnnConvRewriterTest, BackwardFilterConvolve) { << md_after_opt.DebugString() << " vs " << metadata.DebugString(); } +TEST_F(CudnnConvRewriterTest, BackwardFilterGroupConvolve) { + // In a nutshell, before pass: + // Input->batch_dim: 3 input_shape(3) = 4 + // Input->feature_dim: 0 input_shape(0) = 32 + // Kernel(gradient)->kernel_input_feature_dim (gradient_batch_dimension): 0 + // Kernel(gradient)->kernel_output_feature_dim (gradient_feature_dimension): 3 + // Output(dkernel)->output_batch_dim (dkernel_input_feature_dim): 2 + // Output(dkernel)->output_feature_dim (dkernel_output_feature_dim): 3 + + // After pass: All shapes and dimension layout is brought + // back to normal as would be acceptable by cudnn + // Input->batch_dim: 0 input_shape(0) = 8 + // Input->feature_dim: 3 input_shape(3) = 16 + // Kernel(gradient)->kernel_input_feature_dim (gradient_batch_dimension): 2 + // Kernel(gradient)->kernel_output_feature_dim (gradient_feature_dimension): 3 + // Output(dkernel)->output_batch_dim (dkernel_input_feature_dim): 0 + // Output(dkernel)->output_feature_dim (dkernel_output_feature_dim): 3 + HloComputation::Builder builder(TestName()); + HloInstruction* activations = + builder.AddInstruction(HloInstruction::CreateParameter( + 0, ShapeUtil::MakeShape(F32, {32, 1, 3, 4}), "activations")); + HloInstruction* gradients = + builder.AddInstruction(HloInstruction::CreateParameter( + 1, ShapeUtil::MakeShape(F32, {8, 1, 2, 16}), "gradients")); + Window conv_window = default_conv_window_; + 
conv_window.mutable_dimensions(1)->set_size(2); + conv_window.mutable_dimensions(1)->set_window_dilation(2); + auto* conv = builder.AddInstruction(HloInstruction::CreateConvolve( + ShapeInference::InferConvolveShape( + activations->shape(), gradients->shape(), /*feature_group_count=*/4, + /*batch_group_count=*/1, conv_window, + tf_default_dnums_for_backward_filter_) + .ConsumeValueOrDie(), + activations, gradients, /*feature_group_count=*/4, + /*batch_group_count=*/1, conv_window, + tf_default_dnums_for_backward_filter_, DefaultPrecisionConfig(2))); + OpMetadata metadata; + metadata.set_op_name("bar"); + conv->set_metadata(metadata); + auto module = CreateNewVerifiedModule(); + HloComputation* entry_computation = + module->AddEntryComputation(builder.Build()); + EXPECT_TRUE(RunPass(module.get())); + ASSERT_THAT(entry_computation->root_instruction(), + op::GetTupleElement( + op::CustomCall(kCudnnConvBackwardFilterCallTarget), 0)); + // Check that metadata was preserved. + const auto& md_after_opt = + entry_computation->root_instruction()->operand(0)->metadata(); + EXPECT_TRUE(protobuf_util::ProtobufEquals(md_after_opt, metadata)) + << md_after_opt.DebugString() << " vs " << metadata.DebugString(); + const HloInstruction* custom_call = + entry_computation->root_instruction()->operand(0); + const ConvolutionDimensionNumbers conv_dim = + custom_call->convolution_dimension_numbers(); + const auto lhs_a = custom_call->operand(0); + const auto input_shape = lhs_a->shape(); + // The input (lhs) batch_dim(dim 0 in the original NHWC layout) gets mapped to + // be the feature_dim(dim 3) with a value of N*g = 32 in tf2xla. As described + // in conv_grad_ops.h, this swap is required to implement backprop using fwd + // conv. After the pass the batch_dim gets remapped to dim 0. The batch_dim + // value gets scaled to N = N*g/g = 32/4 = 8 to be compatible with cudnn + EXPECT_EQ(0, conv_dim.input_batch_dimension()); + EXPECT_EQ(8, input_shape.dimensions(conv_dim.input_batch_dimension())); + // Similarly, the input (lhs) feature_dim(dim 3 in the original NHWC layout) + // gets mapped to be the batch_dim(dim 0) with a value of C/g = 4 in tf2xla. + // After the pass the batch_dim gets remapped to dim 0. The feature_dim value + // gets scaled to C = C/g*g = 4*4 = 16 to be compatible with cudnn + EXPECT_EQ(3, conv_dim.input_feature_dimension()); + EXPECT_EQ(16, input_shape.dimensions(conv_dim.input_feature_dimension())); + // Similarly, the feature and batch dims of the incoming gradients (used as + // rhs) and the in/out dims of the output of convolution i.e, dgrad have been + // been modified in tf2xla (as described in conv_grad_ops.h). This pass remaps + // everything back for the layout to be compatible with cudnn backprop APIs. 
+ EXPECT_EQ(2, conv_dim.kernel_input_feature_dimension()); + EXPECT_EQ(3, conv_dim.kernel_output_feature_dimension()); + EXPECT_EQ(0, conv_dim.output_batch_dimension()); + EXPECT_EQ(3, conv_dim.output_feature_dimension()); +} + TEST_F(CudnnConvRewriterTest, BackwardFilterConvolveEquivalentToForwardConvolution) { HloComputation::Builder builder(TestName()); diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.cc index c2817e36466..2c380c9860e 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.cc +++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.cc @@ -48,12 +48,10 @@ class ScratchBufAllocator : public se::ScratchAllocator { ~ScratchBufAllocator() override = default; - int64 GetMemoryLimitInBytes(se::Stream* /*stream*/) override { - return scratch_.size(); - } + int64 GetMemoryLimitInBytes() override { return scratch_.size(); } se::port::StatusOr> AllocateBytes( - se::Stream* stream, int64 byte_size) override { + int64 byte_size) override { if (allocated_) { return se::port::InternalError( "Can't allocate twice from a ScratchBufAllocator."); @@ -73,31 +71,91 @@ class ScratchBufAllocator : public se::ScratchAllocator { bool allocated_ = false; }; -template -Status RunCudnnConvImpl(const CudnnConvParams& params, - se::ScratchAllocator* scratch_allocator, - se::Stream* stream, RunConvOptions options) { - auto input_buf = se::DeviceMemory(params.input_buf); - auto filter_buf = se::DeviceMemory(params.filter_buf); - auto output_buf = se::DeviceMemory(params.output_buf); - AlgorithmConfig algorithm = params.algorithm; +template +Status RunCudnnConvForward(CudnnConvParams params, + se::ScratchAllocator* scratch_allocator, + se::Stream* stream, RunConvOptions options, + DeviceMemory input_buf, + DeviceMemory filter_buf, + DeviceMemory output_buf, + AlgorithmConfig algorithm) { + if (params.conv_result_scale != 1) { + return InternalError( + "StreamExecutor doesn't support scaled convolution: %lf.", + params.conv_result_scale); + } + stream->ThenConvolveWithAlgorithm( + params.input_descriptor, input_buf, params.filter_descriptor, filter_buf, + params.conv_desc, params.output_descriptor, &output_buf, + scratch_allocator, algorithm, options.profile_result); + return Status::OK(); +} - if (options.algo_override) { - algorithm = AlgorithmConfig(*options.algo_override); +template +Status RunCudnnConvForwardActivation(CudnnConvParams params, + se::ScratchAllocator* scratch_allocator, + se::Stream* stream, RunConvOptions options, + DeviceMemory input_buf, + DeviceMemory filter_buf, + DeviceMemory output_buf, + AlgorithmConfig algorithm) { + BatchDescriptor bias_desc; + bias_desc.set_count(1) + .set_height(1) + .set_width(1) + .set_feature_map_count(params.output_descriptor.feature_map_count()) + .set_layout(params.output_descriptor.layout()); + + se::DeviceMemory side_input(params.fusion->side_input_buf); + // If there is no side input, use output as the side input. + if (side_input.is_null()) { + if (params.fusion->side_input_scale != 0) { + return InternalError( + "Side input scale is not 0, yet no side input buffer is " + "provided"); + } + // Since side-input scale is 0, the values in the side input don't + // matter. The simplest thing to do would be to pass in a null buffer + // for the side input, but cudnn doesn't allow this. cudnn does promise + // that if side-input-scale is 0 the side input won't be read, so we + // just pass in the output buffer, since it's handy and has the correct + // size. 
+ side_input = output_buf; } + stream->ThenFusedConvolveWithAlgorithm( + params.input_descriptor, input_buf, params.conv_result_scale, + params.filter_descriptor, filter_buf, params.conv_desc, side_input, + params.fusion->side_input_scale, bias_desc, + DeviceMemory(params.fusion->bias_buf), params.fusion->mode, + params.output_descriptor, &output_buf, scratch_allocator, algorithm, + options.profile_result); + + return Status::OK(); +} + +// StreamExecutor supports various data types via overloading, and the support +// is maintained on-demand. To avoid calling into non-exist overloads, we have +// to carefully not call into them by using enable_if. +// TODO(timshen): Ideally, to avoid such complication in the runner, we can turn +// StreamExecutor overloadings to template functions, and for unsupported data +// types return runtime errors. +// This is the specialization for double, float, and half types. All kinds of +// convolutions are supported here. +template ::value>::type* = nullptr> +Status RunCudnnConvInternalImpl(CudnnConvParams params, + se::ScratchAllocator* scratch_allocator, + se::Stream* stream, RunConvOptions options, + DeviceMemory input_buf, + DeviceMemory filter_buf, + DeviceMemory output_buf, + AlgorithmConfig algorithm) { switch (params.kind) { case CudnnConvKind::kForward: - if (params.conv_result_scale != 1) { - return InternalError( - "StreamExecutor doesn't support scaled convolution: %lf.", - params.conv_result_scale); - } - stream->ThenConvolveWithAlgorithm( - params.input_descriptor, input_buf, params.filter_descriptor, - filter_buf, params.conv_desc, params.output_descriptor, &output_buf, - scratch_allocator, algorithm, options.profile_result); - break; + return RunCudnnConvForward(params, scratch_allocator, stream, options, + input_buf, filter_buf, output_buf, algorithm); case CudnnConvKind::kBackwardInput: if (params.conv_result_scale != 1) { return InternalError( @@ -121,46 +179,70 @@ Status RunCudnnConvImpl(const CudnnConvParams& params, scratch_allocator, algorithm, options.profile_result); break; case CudnnConvKind::kForwardActivation: { - BatchDescriptor bias_desc; - bias_desc.set_count(1) - .set_height(1) - .set_width(1) - .set_feature_map_count(params.output_descriptor.feature_map_count()) - .set_layout(params.output_descriptor.layout()); - - se::DeviceMemory side_input(params.fusion->side_input_buf); - // If there is no side input, use output as the side input. - if (side_input.is_null()) { - if (params.fusion->side_input_scale != 0) { - return InternalError( - "Side input scale is not 0, yet no side input buffer is " - "provided"); - } - // Since side-input scale is 0, the values in the side input don't - // matter. The simplest thing to do would be to pass in a null buffer - // for the side input, but cudnn doesn't allow this. cudnn does promise - // that if side-input-scale is 0 the side input won't be read, so we - // just pass in the output buffer, since it's handy and has the correct - // size. 
- side_input = output_buf; - } - - stream->ThenFusedConvolveWithAlgorithm( - params.input_descriptor, input_buf, params.conv_result_scale, - params.filter_descriptor, filter_buf, params.conv_desc, side_input, - params.fusion->side_input_scale, bias_desc, - DeviceMemory(params.fusion->bias_buf), params.fusion->mode, - params.output_descriptor, &output_buf, scratch_allocator, algorithm, - options.profile_result); - break; + return RunCudnnConvForwardActivation( + params, scratch_allocator, stream, options, input_buf, filter_buf, + output_buf, algorithm); } } + return Status::OK(); +} + +// Specialization for integer types. Only two forward convolutions are allowed. +template ::value>::type* = + nullptr> +Status RunCudnnConvInternalImpl(CudnnConvParams params, + se::ScratchAllocator* scratch_allocator, + se::Stream* stream, RunConvOptions options, + DeviceMemory input_buf, + DeviceMemory filter_buf, + DeviceMemory output_buf, + AlgorithmConfig algorithm) { + switch (params.kind) { + case CudnnConvKind::kForward: + return RunCudnnConvForward(params, scratch_allocator, stream, options, + input_buf, filter_buf, output_buf, algorithm); + case CudnnConvKind::kForwardActivation: + return RunCudnnConvForwardActivation( + params, scratch_allocator, stream, options, input_buf, filter_buf, + output_buf, algorithm); + default: + return InternalError( + "Only convolution kinds kForward and kForwardActivation are " + "supported for integer types"); + } + return Status::OK(); +} + +template +Status RunCudnnConvImpl(const CudnnConvParams& params, + se::ScratchAllocator* scratch_allocator, + se::Stream* stream, RunConvOptions options) { + auto input_buf = se::DeviceMemory(params.input_buf); + auto filter_buf = se::DeviceMemory(params.filter_buf); + auto output_buf = se::DeviceMemory(params.output_buf); + AlgorithmConfig algorithm = params.algorithm; + + if (options.algo_override) { + algorithm = AlgorithmConfig(*options.algo_override); + } + + Status run_status = + RunCudnnConvInternalImpl( + params, scratch_allocator, stream, options, input_buf, filter_buf, + output_buf, algorithm); + + if (run_status != Status::OK()) { + return run_status; + } if (!stream->ok()) { return InternalError( - "Unable to launch convolution with type %s and algorithm (%d, %d)", + "Unable to launch convolution with type %s and algorithm (%d, %s)", CudnnConvKindToString(params.kind), algorithm.algorithm()->algo_id(), - algorithm.algorithm_no_scratch()->algo_id()); + algorithm.algorithm_no_scratch().has_value() + ? 
absl::StrCat(algorithm.algorithm_no_scratch()->algo_id()) + : "none"); } return Status::OK(); } @@ -372,18 +454,31 @@ Status RunCudnnConv(const HloCustomCallInstruction* conv, TF_ASSIGN_OR_RETURN(CudnnConvParams params, GetCudnnConvParams(conv, operand_buffers, result_buffer)); - PrimitiveType output_primitive_type = - conv->shape().tuple_shapes(0).element_type(); - switch (output_primitive_type) { + PrimitiveType input_primitive_type = conv->operand(0)->shape().element_type(); + switch (input_primitive_type) { case F16: - return RunCudnnConvImpl(params, scratch_allocator, stream, - options); + return RunCudnnConvImpl( + params, scratch_allocator, stream, options); case F32: - return RunCudnnConvImpl(params, scratch_allocator, stream, - options); + return RunCudnnConvImpl(params, scratch_allocator, + stream, options); case F64: - return RunCudnnConvImpl(params, scratch_allocator, stream, - options); + return RunCudnnConvImpl(params, scratch_allocator, + stream, options); + case S8: { + PrimitiveType output_primitive_type = + conv->shape().tuple_shapes(0).element_type(); + switch (output_primitive_type) { + case F32: + return RunCudnnConvImpl(params, scratch_allocator, + stream, options); + case S8: + return RunCudnnConvImpl(params, scratch_allocator, + stream, options); + default: + LOG(FATAL) << conv->ToString(); + } + } default: LOG(FATAL) << conv->ToString(); } diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_fused_conv_rewriter.cc b/tensorflow/compiler/xla/service/gpu/cudnn_fused_conv_rewriter.cc index dee257a5d97..aca7307e0c2 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_fused_conv_rewriter.cc +++ b/tensorflow/compiler/xla/service/gpu/cudnn_fused_conv_rewriter.cc @@ -223,6 +223,7 @@ StatusOr> TryRewriteToCudnnForwardRelu( } auto new_conv = computation->AddInstruction(HloInstruction::CreateCustomCall( conv->shape(), args, kCudnnConvBiasActivationForwardCallTarget)); + new_conv->set_feature_group_count(conv->feature_group_count()); new_conv->set_window(conv->window()); new_conv->set_convolution_dimension_numbers( conv->convolution_dimension_numbers()); diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_fused_conv_rewriter_test.cc b/tensorflow/compiler/xla/service/gpu/cudnn_fused_conv_rewriter_test.cc index 7aa442d3bff..b621880f639 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_fused_conv_rewriter_test.cc +++ b/tensorflow/compiler/xla/service/gpu/cudnn_fused_conv_rewriter_test.cc @@ -163,6 +163,26 @@ TEST_F(CudnnFusedConvRewriterTest, TestScaledConv) { })"); } +TEST_F(CudnnFusedConvRewriterTest, TestNoCrashOnInf) { + EXPECT_TRUE(RunAndCompare(R"( + HloModule Test + + ENTRY Test { + zero = f32[] constant(inf) + zeros = f32[1,32,9,9] broadcast(zero), dimensions={} + alpha_conv_scalar = f32[] constant(0.999994934) + + input = f32[1,17,9,9] parameter(0) + filter = f32[3,3,17,32] parameter(1) + + conv = f32[1,32,9,9] convolution(input, filter), window={size=3x3 pad=1_1x1_1}, dim_labels=bf01_01io->bf01, feature_group_count=1 + alpha_conv = f32[1,32,9,9] broadcast(alpha_conv_scalar), dimensions={} + scaled_conv = f32[1,32,9,9] multiply(conv, alpha_conv) + ROOT relu = f32[1,32,9,9] maximum(zeros, scaled_conv) + })", + ErrorSpec{0.01})); +} + TEST_F(CudnnFusedConvRewriterTest, TestScaledConvAndSideInput) { // max(0, conv(x, w) + 0.899994934 * side_input); TestMatchWithAllTypes(R"( @@ -305,6 +325,30 @@ TEST_F(CudnnFusedConvRewriterTest, PreservesMetadata) { ::testing::ContainsRegex(R"(custom-call.*metadata=\{op_type="foo"\})")); } +TEST_F(CudnnFusedConvRewriterTest, 
TestPreservesFeatureGroupCount) { + // The convolution below would crash if feature_count is not preserved. + const char* kHloString = R"( + HloModule jaxpr_computation__6.19 + + primitive_computation__1.4 { + parameter.5 = f32[] parameter(0) + parameter.6 = f32[] parameter(1) + ROOT add.7 = f32[] add(parameter.5, parameter.6) + } + + ENTRY jaxpr_computation__7.8 { + parameter.11 = f32[2,64,64,53]{3,2,1,0} parameter(1) + parameter.10 = f32[3,3,1,53]{3,2,1,0} parameter(0) + convolution.12 = f32[2,64,64,53]{3,2,1,0} convolution(parameter.11, parameter.10), window={size=3x3 pad=1_1x1_1}, dim_labels=b01f_01io->b01f, feature_group_count=53 + constant.13 = f32[] constant(0) + broadcast.14 = f32[2,64,64,53]{3,2,1,0} broadcast(constant.13), dimensions={} + maximum.15 = f32[2,64,64,53]{3,2,1,0} maximum(convolution.12, broadcast.14) + ROOT reduce.17 = f32[] reduce(maximum.15, constant.13), dimensions={0,1,2,3}, to_apply=primitive_computation__1.4 + } + )"; + EXPECT_TRUE(RunAndCompare(kHloString, ErrorSpec{0.01})); +} + } // namespace } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/custom_call_test.cc b/tensorflow/compiler/xla/service/gpu/custom_call_test.cc index c04f6fb7bf5..53a3ca14400 100644 --- a/tensorflow/compiler/xla/service/gpu/custom_call_test.cc +++ b/tensorflow/compiler/xla/service/gpu/custom_call_test.cc @@ -90,67 +90,25 @@ void Callback_SubBuffers(CUstream stream, void** buffers, const char* /*opaque*/, size_t /*opaque_len*/) { // `buffers` is a flat array containing device pointers to the following. // - // 0: root tuple of param 0 - // 1: param 0 at tuple index {0}, shape f32[128] - // 2: param 0 at tuple index {1}, shape f32[256] - // 3: root tuple of param 1 - // 4: param 1 at tuple index {0}, shape f32[1024] - // 5: param 1 at tuple index {1}, shape f32[8] - // 6: root tuple of custom-call result - // 7: result at tuple index {0}, shape f32[8] - // 8: result at tuple index {1}, shape (f32[128], f32[256]) - // 9: result at tuple index {1, 0}, shape f32[128] - // 10: result at tuple index {1, 1}, shape f32[256] - // 11: result at tuple index {2}, shape f32[1024] + // 0: param 0 at tuple index {0}, shape f32[128] + // 1: param 0 at tuple index {1}, shape f32[256] + // 2: param 1 at tuple index {0}, shape f32[1024] + // 3: param 1 at tuple index {1}, shape f32[8] + // 4: result at tuple index {0}, shape f32[8] + // 5: result at tuple index {1, 0}, shape f32[128] + // 6: result at tuple index {1, 1}, shape f32[256] + // 7: result at tuple index {2}, shape f32[1024] // - // It's the contract of custom-call that the non-root pointers (i.e. - // everything other than indices 0, 3, and 6) may be null, if XLA is unable to - // analyze the program well enough to determine for sure what's in those - // buffers. For this simple example, all of the buffers should be non-null. - // Check the param 0 tuple, namely that - // - // (*buffers[0])[0] == buffers[1] and - // (*buffers[0])[1] == buffers[2]. - // - // because buffers contains pointers to device memory, we have to retrieve - // these values via cudaMemcpy. - void* p0[2]; - cudaMemcpy(p0, buffers[0], 2 * sizeof(void*), cudaMemcpyDeviceToHost); - ASSERT_EQ(p0[0], buffers[1]); - ASSERT_EQ(p0[1], buffers[2]); - - // Check the param 1 tuple, namely that - // - // (*buffers[3])[0] == buffers[4] - // (*buffers[3])[1] == buffers[5]. 
- void* p1[2]; - cudaMemcpy(p1, buffers[3], 2 * sizeof(void*), cudaMemcpyDeviceToHost); - ASSERT_EQ(p1[0], buffers[4]); - ASSERT_EQ(p1[1], buffers[5]); - - // We don't have an equivalent check for the output tuple (i.e. we don't check - // (*buffers[6])[0] == buffers[7]) because it's up to us to set the tuple - // as part of this custom-call. - - // Write the results. First set the root tuple output buffer to {b7, b8, - // b11}. - void* root[3] = {buffers[7], buffers[8], buffers[11]}; - cudaMemcpy(buffers[6], root, 3 * sizeof(void*), cudaMemcpyHostToDevice); - - // Now set the sub-tuple output buffer at index 8 to {b9, b10}. - void* sub_tuple[2] = {buffers[9], buffers[10]}; - cudaMemcpy(buffers[8], sub_tuple, 2 * sizeof(void*), cudaMemcpyDeviceToHost); - - // Now set output leaf buffers 7, 9, 10, and 11, copying data from the - // corresponding same-sized inputs. - cudaMemcpyAsync(buffers[7], buffers[5], 8 * sizeof(float), + // Set output leaf buffers, copying data from the corresponding same-sized + // inputs. + cudaMemcpyAsync(buffers[4], buffers[3], 8 * sizeof(float), cudaMemcpyDeviceToDevice, stream); - cudaMemcpyAsync(buffers[9], buffers[1], 128 * sizeof(float), + cudaMemcpyAsync(buffers[5], buffers[0], 128 * sizeof(float), cudaMemcpyDeviceToDevice, stream); - cudaMemcpyAsync(buffers[10], buffers[2], 256 * sizeof(float), + cudaMemcpyAsync(buffers[6], buffers[1], 256 * sizeof(float), cudaMemcpyDeviceToDevice, stream); - cudaMemcpyAsync(buffers[11], buffers[4], 1024 * sizeof(float), + cudaMemcpyAsync(buffers[7], buffers[2], 1024 * sizeof(float), cudaMemcpyDeviceToDevice, stream); } XLA_REGISTER_CUSTOM_CALL_TARGET(Callback_SubBuffers, "CUDA"); @@ -185,5 +143,45 @@ TEST_F(CustomCallTest, SubBuffers) { EXPECT_THAT(result.data({2}), ::testing::Each(3)); } +void Callback_TupleSelect(CUstream stream, void** buffers, + const char* /*opaque*/, size_t /*opaque_len*/) { + // Set the two output leaf buffers equal to the two input leaf buffers. + cudaMemcpyAsync(buffers[2], buffers[0], 10 * sizeof(float), + cudaMemcpyDeviceToDevice, stream); + cudaMemcpyAsync(buffers[3], buffers[1], 10 * sizeof(float), + cudaMemcpyDeviceToDevice, stream); +} +XLA_REGISTER_CUSTOM_CALL_TARGET(Callback_TupleSelect, "CUDA"); +// Tuple-shaped select is a case where XLA can't know all buffer assignments +// statically ahead of time and has to walk the on-device tuple sub-buffers. +TEST_F(CustomCallTest, TupleSelect) { + XlaBuilder b(TestName()); + auto tuple_shape = ShapeUtil::MakeTupleShape({ + ShapeUtil::MakeShape(F32, {10}), + ShapeUtil::MakeShape(F32, {10}), + }); + auto p0 = AddParam(LiteralUtil::CreateR0(false), &b); + auto p1 = + AddParam(LiteralUtil::MakeTupleOwned( + LiteralUtil::CreateR1(std::vector(10, 1.0f)), + LiteralUtil::CreateR1(std::vector(10, 2.0f))), + &b); + auto p2 = + AddParam(LiteralUtil::MakeTupleOwned( + LiteralUtil::CreateR1(std::vector(10, 10.0f)), + LiteralUtil::CreateR1(std::vector(10, 20.0f))), + &b); + auto cc = CustomCall(&b, "Callback_TupleSelect", + /*operands=*/{Select(p0, p1, p2)}, tuple_shape, + /*opaque=*/""); + + // Do a tuple-select on the custom-call result to ensure that the custom-call + // sets its output tuple index buffers. 
+ Select(p0, p1, cc); + TF_ASSERT_OK_AND_ASSIGN(auto result, ComputeAndTransfer(&b, {})); + EXPECT_THAT(result.data({0}), ::testing::Each(10)); + EXPECT_THAT(result.data({1}), ::testing::Each(20)); +} + } // anonymous namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/custom_call_thunk.cc b/tensorflow/compiler/xla/service/gpu/custom_call_thunk.cc index 5fba64e90ed..65673106391 100644 --- a/tensorflow/compiler/xla/service/gpu/custom_call_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/custom_call_thunk.cc @@ -48,8 +48,83 @@ CustomCallThunk::CustomCallThunk( instr->shape().ToString(), result_slices.shape().ToString()); } +// For each leaf in a preorder traversal of `slices`, appends its device address +// to `buffers`. +// +// In the common case, this is trivial; simply iterate over the ShapeTree and +// add every leaf to `buffers`. But under some circumstances XLA doesn't +// statically know the address of a leaf buffer and has to derive it by walking +// the on-device tuple. +static Status AppendBuffersFor(const ShapeTree& slices, + const BufferAllocations* buffer_allocations, + se::Stream* stream, + std::vector* buffers) { + // Buffer addresses we've retrieved by following device tuples. + ShapeTree retrieved_addrs(slices.shape()); + + // We make this lambda an std::function so it can capture itself. + std::function(const ShapeIndexView&)> get_addr_for = + [&](ShapeIndexView index) -> StatusOr { + auto slice = slices.element(index); + + // If we know the address of this sub-buffer statically, return it. + if (slice.allocation() != nullptr) { + return buffer_allocations->GetDeviceAddress(slice).opaque(); + } + // If we've already pulled the address for this sub-buffer down from the + // GPU, return it. + if (retrieved_addrs.element(index) != nullptr) { + return retrieved_addrs.element(index); + } + + // Recurse to get the address of the parent sub-buffer. + CHECK(!index.empty()) << "Address of tuple root cannot be unknown!"; + TF_ASSIGN_OR_RETURN(void* parent_buffer, get_addr_for(index.ConsumeBack())); + + // Pull down the entirety of parent_buffer from the GPU, getting the address + // we're interested in plus all of its siblings. (Perhaps only some of the + // siblings are unknown and we could get away without retrieving all of + // them. But in practice, getting them all in one fell swoop should be just + // as fast as getting just one.) + // + // TODO(jlebar): This is not as efficient as possible. In particular, at + // the expense of some complexity we could batch up multiple parallel D2H + // copies (say for multiple unrelated sub-buffers, maybe even across + // different parameters) and do just one BlockHostUntilDone. Hopefully the + // case when we have to do any copies at all is uncommon. + int64 num_siblings = + ShapeUtil::GetSubshape(slices.shape(), index.ConsumeBack()) + .tuple_shapes_size(); + std::vector sibling_addrs(num_siblings); + TF_RETURN_IF_ERROR( + stream + ->ThenMemcpy(sibling_addrs.data(), + se::DeviceMemoryBase(parent_buffer, sizeof(void*)), + num_siblings * sizeof(void*)) + .BlockHostUntilDone()); + + // Save the data we retrieved into retrieved_addrs. 
+ for (int64 i = 0; i < num_siblings; ++i) { + ShapeIndex sibling_index(index.ConsumeBack()); + sibling_index.push_back(i); + *retrieved_addrs.mutable_element(sibling_index) = sibling_addrs[i]; + } + return sibling_addrs[index.back()]; + }; + + return slices.ForEachElementWithStatus( + [&](const ShapeIndex& index, const BufferAllocation::Slice&) { + if (slices.IsLeaf(index)) { + TF_ASSIGN_OR_RETURN(void* addr, get_addr_for(index)); + buffers->push_back(addr); + } + return Status::OK(); + }); +} + Status CustomCallThunk::ExecuteOnStream(const ExecuteParams& params) { // gpu_stream is CUstream or e.g. the equivalent type in ROCm. + se::Stream* stream = params.stream; auto gpu_stream = se::gpu::AsGpuStreamValue(params.stream); auto typed_call_target = reinterpret_cast buffers; - auto append_buffers = [&](const ShapeTree& slices) { - slices.ForEachElement([&](const ShapeIndex& /*index*/, - const BufferAllocation::Slice& slice) { - if (slice.allocation() == nullptr) { - buffers.push_back(nullptr); - } - buffers.push_back( - params.buffer_allocations->GetDeviceAddress(slice).opaque()); - }); - }; for (const auto& slices : operand_slices_) { - append_buffers(slices); + TF_RETURN_IF_ERROR( + AppendBuffersFor(slices, params.buffer_allocations, stream, &buffers)); } - append_buffers(result_slices_); + TF_RETURN_IF_ERROR(AppendBuffersFor(result_slices_, params.buffer_allocations, + stream, &buffers)); typed_call_target(gpu_stream, buffers.data(), opaque_.data(), opaque_.size()); - return Status::OK(); + + // If the custom-call returns a tuple, populate the result tuple index + // buffers. + return result_slices_.ForEachElementWithStatus( + [&](const ShapeIndex& index, const BufferAllocation::Slice& slice) { + const Shape& subshape = + ShapeUtil::GetSubshape(result_slices_.shape(), index); + auto n = subshape.tuple_shapes_size(); + if (!subshape.IsTuple() || n == 0) { + return Status::OK(); + } + auto tuple_ptrs = absl::make_unique(n); + ShapeIndex subindex(index); + for (int i = 0; i < n; ++i) { + subindex.push_back(i); + tuple_ptrs[i] = + params.buffer_allocations + ->GetDeviceAddress(result_slices_.element(subindex)) + .opaque(); + subindex.pop_back(); + } + SafeH2DMemcpy(se::DeviceMemory( + params.buffer_allocations->GetDeviceAddress(slice)), + std::move(tuple_ptrs), n, stream); + return Status::OK(); + }); } } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/data/hlo_algorithm_blacklist.pbtxt b/tensorflow/compiler/xla/service/gpu/data/hlo_algorithm_blacklist.pbtxt new file mode 100644 index 00000000000..5f22429962c --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/data/hlo_algorithm_blacklist.pbtxt @@ -0,0 +1,17 @@ +entries { + hlo: '(f16[256,112,112,64]{3,2,1,0}, u8[0]{0}) custom-call(f16[256,224,224,4]{3,2,1,0}, f16[7,7,4,64]{2,1,0,3}), window={size=7x7 stride=2x2 pad=3_3x3_3}, dim_labels=b01f_01io->b01f, custom_call_target="__cudnn$convForward", backend_config="{conv_result_scale:1}"' + cc: {major: 7, minor: 0} + cudnn_version: {major: 7, minor: 6, patch: 0} + blas_version: "9000" + algos: [{}, {tensor_ops: true}, {id: 1}, {id:1, tensor_ops: true}] +} + +entries { + hlo: '(f16[256,112,112,64]{3,2,1,0}, u8[0]{0}) custom-call(f16[256,224,224,4]{3,2,1,0}, f16[7,7,4,64]{2,1,0,3}), window={size=7x7 stride=2x2 pad=3_3x3_3}, dim_labels=b01f_01io->b01f, custom_call_target="__cudnn$convForward", backend_config="{conv_result_scale:1}"' + cc: {major: 7, minor: 0} + cudnn_version: {major: 7, minor: 6, patch: 2} + blas_version: "9000" + algos: [{}, {tensor_ops: true}, {id: 1}, {id:1, 
tensor_ops: true}] +} + + diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc index c0cd4addc7e..c6df786fb51 100644 --- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h" #include + #include #include @@ -144,7 +145,7 @@ StatusOr GpuElementalIrEmitter::EmitMathCall( // Binary math functions transform are of type [T] -> T. for (PrimitiveType input_type : input_types) { if (output_type != input_type) { - return Unimplemented("Input type ≠ output type: %s ≠ %s", + return Unimplemented("Input type != output type: %s != %s", PrimitiveType_Name(input_type), PrimitiveType_Name(output_type)); } @@ -152,7 +153,7 @@ StatusOr GpuElementalIrEmitter::EmitMathCall( return EmitDeviceFunctionCall( callee_name, operands, input_types, output_type, - {llvm::Attribute::ReadNone, llvm::Attribute::NoUnwind}); + {llvm::Attribute::ReadNone, llvm::Attribute::NoUnwind}, b_); } StatusOr GpuElementalIrEmitter::EmitFloatBinaryOp( @@ -269,8 +270,19 @@ StatusOr GpuElementalIrEmitter::EmitTanh(PrimitiveType prim_type, // Upcast F16 to F32 if necessary. llvm::Type* type = prim_type == F16 ? b_->getFloatTy() : value->getType(); llvm::Value* input = FPCast(value, type); + + // If |value| >= kMaxValue, tanh() is set to -1.0 or 1.0. + constexpr double kMaxValue = 20.0; + auto max_value = llvm::ConstantFP::get(type, kMaxValue); + llvm::Value* abs_value = + llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::fabs, {input}, {type}, b_); + llvm::Value* fast_tanh = llvm_ir::EmitFastTanh(b_, input); - return FPCast(fast_tanh, value->getType()); + auto one = llvm::ConstantFP::get(type, 1.0); + auto one_with_sign = llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::copysign, + {one, input}, {type}, b_); + return FPCast(Select(FCmpULT(abs_value, max_value), fast_tanh, one_with_sign), + value->getType()); } StatusOr GpuElementalIrEmitter::EmitComplexAbs( @@ -280,47 +292,16 @@ StatusOr GpuElementalIrEmitter::EmitComplexAbs( {prim_type, prim_type}, prim_type); } -llvm::Value* GpuElementalIrEmitter::EmitDeviceFunctionCall( - const string& callee_name, absl::Span operands, - absl::Span input_types, PrimitiveType output_type, - absl::Span attributes) { - std::vector ir_input_types; - for (PrimitiveType input_type : input_types) { - ir_input_types.push_back( - llvm_ir::PrimitiveTypeToIrType(input_type, module_)); - } - llvm::FunctionType* callee_type = llvm::FunctionType::get( - llvm_ir::PrimitiveTypeToIrType(output_type, module_), // Return type. - ir_input_types, // Parameter types. - false); // No variadic arguments. - - // Declares the callee if it is not declared already. 
- llvm::Function* callee = llvm::dyn_cast( - b_->GetInsertBlock() - ->getModule() - ->getOrInsertFunction(callee_name, callee_type) - .getCallee()); - - for (auto attribute : attributes) { - callee->addFnAttr(attribute); - } - - return Call(callee, llvm_ir::AsArrayRef(operands)); -} - llvm::Value* GpuElementalIrEmitter::EmitThreadId() { - llvm::Value* block_id = - IntCast(llvm_ir::EmitCallToIntrinsic( - llvm::Intrinsic::nvvm_read_ptx_sreg_ctaid_x, {}, {}, b_), - b_->getIntNTy(128), /*isSigned=*/true, "block.id"); - llvm::Value* thread_id_in_block = - IntCast(llvm_ir::EmitCallToIntrinsic( - llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x, {}, {}, b_), - b_->getIntNTy(128), /*isSigned=*/true, "thread.id"); - llvm::Value* threads_per_block = - IntCast(llvm_ir::EmitCallToIntrinsic( - llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x, {}, {}, b_), - b_->getIntNTy(128), /*isSigned=*/true, "threads_per_block"); + llvm::Value* block_id = IntCast( + EmitCallToTargetIntrinsic(TargetIntrinsicID::kBlockIdx, {}, {}, b_), + b_->getIntNTy(128), /*isSigned=*/true, "block.id"); + llvm::Value* thread_id_in_block = IntCast( + EmitCallToTargetIntrinsic(TargetIntrinsicID::kThreadIdx, {}, {}, b_), + b_->getIntNTy(128), /*isSigned=*/true, "thread.id"); + llvm::Value* threads_per_block = IntCast( + EmitCallToTargetIntrinsic(TargetIntrinsicID::kBlockDimx, {}, {}, b_), + b_->getIntNTy(128), /*isSigned=*/true, "threads_per_block"); return NSWAdd(NSWMul(block_id, threads_per_block), thread_id_in_block); } @@ -408,7 +389,7 @@ llvm_ir::ElementGenerator GpuElementalIrEmitter::MakeElementGenerator( SDiv(input_multi_index[i], index_typed_const(window.dimensions(i).base_dilation())); - // We must check whether 0 ≤ input_multi_index[i] < bound, as + // We must check whether 0 <= input_multi_index[i] < bound, as // otherwise we are in the pad and so can skip the computation. This // comparison is equivalent to the unsigned comparison // input_multi_index[i] < bound, as a negative value wraps to a large diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h index db4918c5890..c8a58a21980 100644 --- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h +++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h @@ -100,13 +100,6 @@ class GpuElementalIrEmitter : public ElementalIrEmitter { llvm::Value* lhs_value, llvm::Value* rhs_value); - // Emits IR to call a device function named "callee_name" on the given - // operand. Returns the IR value that represents the return value. - llvm::Value* EmitDeviceFunctionCall( - const string& callee_name, absl::Span operands, - absl::Span input_type, PrimitiveType output_type, - absl::Span attributes); - // Emits IR to call an LLVM intrinsic of type [T] -> T. Adjusts // callee_name according to T. Returns the IR value that represents the // return value of the function. diff --git a/tensorflow/compiler/xla/service/gpu/fft_thunk.cc b/tensorflow/compiler/xla/service/gpu/fft_thunk.cc index da90ba989dc..991a463f2a0 100644 --- a/tensorflow/compiler/xla/service/gpu/fft_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/fft_thunk.cc @@ -32,20 +32,20 @@ FftScratchAllocator::FftScratchAllocator( int device_ordinal, se::DeviceMemoryAllocator* memory_allocator) : device_ordinal_(device_ordinal), memory_allocator_(memory_allocator) {} -int64 FftScratchAllocator::GetMemoryLimitInBytes(se::Stream* stream) { +int64 FftScratchAllocator::GetMemoryLimitInBytes() { constexpr int64 kFftScratchSize = 1LL << 32; // 4GB by default. 
return kFftScratchSize; } StatusOr> FftScratchAllocator::AllocateBytes( - se::Stream* stream, int64 byte_size) { + int64 byte_size) { CHECK_GE(byte_size, 0) << "byte_size must be positive."; - if (byte_size > GetMemoryLimitInBytes(stream)) { + if (byte_size > GetMemoryLimitInBytes()) { return se::port::Status( se::port::error::RESOURCE_EXHAUSTED, absl::StrFormat( "Allocating %d bytes exceeds the memory limit of %d bytes.", - byte_size, GetMemoryLimitInBytes(stream))); + byte_size, GetMemoryLimitInBytes())); } TF_ASSIGN_OR_RETURN(se::OwningDeviceMemory allocated_buffer, diff --git a/tensorflow/compiler/xla/service/gpu/fft_thunk.h b/tensorflow/compiler/xla/service/gpu/fft_thunk.h index be77df1eb77..95186c7f219 100644 --- a/tensorflow/compiler/xla/service/gpu/fft_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/fft_thunk.h @@ -40,12 +40,12 @@ class FftScratchAllocator : public se::ScratchAllocator { FftScratchAllocator(int device_ordinal, se::DeviceMemoryAllocator* memory_allocator); - int64 GetMemoryLimitInBytes(se::Stream* stream) override; + int64 GetMemoryLimitInBytes() override; int64 TotalAllocatedBytes() { return total_allocated_bytes_; } se::port::StatusOr> AllocateBytes( - se::Stream* stream, int64 byte_size) override; + int64 byte_size) override; private: const int device_ordinal_; diff --git a/tensorflow/compiler/xla/service/gpu/gemm_algorithm_picker.cc b/tensorflow/compiler/xla/service/gpu/gemm_algorithm_picker.cc index 626bef76b98..98d8d00b62c 100644 --- a/tensorflow/compiler/xla/service/gpu/gemm_algorithm_picker.cc +++ b/tensorflow/compiler/xla/service/gpu/gemm_algorithm_picker.cc @@ -65,6 +65,9 @@ static StatusOr> DoUncachedGemmAutotune( return InternalError("Failed to synchronize GPU for autotuning."); } + GemmBackendConfig backend_config = + gemm->backend_config().ValueOrDie(); + VLOG(3) << "Starting autotune of GemmThunk " << gemm->ToString(); std::vector algorithms; @@ -76,7 +79,7 @@ static StatusOr> DoUncachedGemmAutotune( for (se::blas::AlgorithmType algorithm : algorithms) { // Make sure the output buffer always has the same value if we use // the bias parameter. - if (gemm->backend_config().ValueOrDie().beta() != 0) { + if (backend_config.beta() != 0) { int64 rng_state = 0; InitializeFloatBuffer(stream, gemm->shape().element_type(), &rng_state, output_buffer); @@ -87,7 +90,8 @@ static StatusOr> DoUncachedGemmAutotune( // for all algorithms if we're targeting < sm_50. But because we pass a // non-null ProfileResult, DoGemmWithAlgorithm should always return true, // and the actual success-ness is returned in ProfileResult::is_valid. 
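The FftScratchAllocator hunks above only drop the unused se::Stream* parameter; the limit check itself is unchanged. A host-side analogue of that check, with standard-library placeholders rather than the StreamExecutor allocator API:

#include <cstddef>
#include <cstdint>
#include <optional>
#include <vector>

class ScratchAllocatorSketch {
 public:
  // Same fixed budget as kFftScratchSize: 4 GB.
  static constexpr int64_t kLimitBytes = int64_t{1} << 32;

  // Returns nullopt where the real allocator returns RESOURCE_EXHAUSTED.
  std::optional<std::vector<uint8_t>> AllocateBytes(int64_t byte_size) {
    if (byte_size < 0 || byte_size > kLimitBytes) {
      return std::nullopt;
    }
    total_allocated_bytes_ += byte_size;
    return std::vector<uint8_t>(static_cast<size_t>(byte_size));
  }

  int64_t TotalAllocatedBytes() const { return total_allocated_bytes_; }

 private:
  int64_t total_allocated_bytes_ = 0;
};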
- CHECK(RunGemm(gemm, lhs_buffer, rhs_buffer, output_buffer, stream, + CHECK(RunGemm(gemm, backend_config, lhs_buffer, rhs_buffer, output_buffer, + stream, /*implements_whole_instruction=*/true, /*profiler=*/nullptr, /*profile_result=*/&profile_result, algorithm) @@ -110,7 +114,7 @@ static StatusOr> DoUncachedGemmAutotune( TF_ASSIGN_OR_RETURN( se::cuda::RedzoneAllocator::RedzoneCheckStatus rz_check_status, - allocator.CheckRedzones(stream)); + allocator.CheckRedzones()); if (!rz_check_status.ok()) { result.mutable_failure()->set_kind(AutotuneResult::REDZONE_MODIFIED); *result.mutable_failure()->mutable_msg() = @@ -235,17 +239,22 @@ static StatusOr> DoGemmAutotune( static StatusOr RunOnInstruction(HloInstruction* instr, se::StreamExecutor* executor, se::DeviceMemoryAllocator* allocator) { - se::Stream stream{executor}; - stream.Init(); - if (allocator == nullptr) { allocator = executor->GetAllocator(); } + absl::optional stream_opt; + se::Stream* stream = [&]() { + if (allocator->GetStream()) { + return allocator->GetStream(); + } + stream_opt.emplace(executor); + stream_opt->Init(); + return &stream_opt.value(); + }(); const HloModuleConfig& hlo_module_config = instr->GetModule()->config(); se::cuda::RedzoneAllocator input_output_allocator( - executor->device_ordinal(), allocator, - PtxOptsFromConfig(hlo_module_config)); + stream, allocator, PtxOptsFromConfig(hlo_module_config)); BufferComparator comparator(instr->shape(), hlo_module_config); @@ -254,8 +263,8 @@ static StatusOr RunOnInstruction(HloInstruction* instr, [&](const HloInstruction* op) -> StatusOr { TF_ASSIGN_OR_RETURN(se::DeviceMemoryBase buffer, input_output_allocator.AllocateBytes( - &stream, ShapeUtil::ByteSizeOf(op->shape()))); - InitializeFloatBuffer(&stream, op->shape().element_type(), &rng_state, + ShapeUtil::ByteSizeOf(op->shape()))); + InitializeFloatBuffer(stream, op->shape().element_type(), &rng_state, buffer); return buffer; }; @@ -280,11 +289,11 @@ static StatusOr RunOnInstruction(HloInstruction* instr, const bool crash_on_checking_failure = debug_options.xla_gpu_crash_on_verification_failures(); - TF_ASSIGN_OR_RETURN(absl::optional gemm_algorithm, - DoGemmAutotune(instr, lhs, rhs, lhs_buffer, rhs_buffer, - output_buffer, reference_result_buffer, - &stream, crash_on_checking_failure, - input_output_allocator, comparator)); + TF_ASSIGN_OR_RETURN( + absl::optional gemm_algorithm, + DoGemmAutotune(instr, lhs, rhs, lhs_buffer, rhs_buffer, output_buffer, + reference_result_buffer, stream, crash_on_checking_failure, + input_output_allocator, comparator)); // We update instruction->backend_config(); if no algorithms are supported, // a different API is used, which does not require specifying an algorithm. 
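The RunOnInstruction hunk above stops constructing a fresh se::Stream unconditionally and instead borrows the allocator's stream when it has one, falling back to a locally owned stream otherwise. The same pattern with placeholder types (Stream and Allocator here are stand-ins, not the StreamExecutor classes):

#include <optional>

struct Stream {
  void Init() {}
};

struct Allocator {
  Stream* stream = nullptr;  // may be null
  Stream* GetStream() { return stream; }
};

// Returns a usable stream: the allocator's own if present, otherwise a stream
// created into `local_stream`, which the caller keeps alive for the duration.
Stream* AcquireStream(Allocator* allocator,
                      std::optional<Stream>& local_stream) {
  if (Stream* borrowed = allocator->GetStream()) {
    return borrowed;
  }
  local_stream.emplace();
  local_stream->Init();
  return &local_stream.value();
}

Keeping the fallback stream in a caller-provided std::optional mirrors the diff's stream_opt: the object only exists when it is actually needed.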
diff --git a/tensorflow/compiler/xla/service/gpu/gemm_rewriter.cc b/tensorflow/compiler/xla/service/gpu/gemm_rewriter.cc index df7ee3cdc69..bdf697acfba 100644 --- a/tensorflow/compiler/xla/service/gpu/gemm_rewriter.cc +++ b/tensorflow/compiler/xla/service/gpu/gemm_rewriter.cc @@ -32,23 +32,6 @@ namespace gpu { namespace m = match; -static complex128 GetScalarConstantAsComplex(const Literal &literal) { - switch (literal.shape().element_type()) { - case F16: - return {static_cast(literal.Get({})), 0}; - case F32: - return {literal.Get({}), 0}; - case F64: - return {literal.Get({}), 0}; - case C64: - return literal.Get({}); - case C128: - return literal.Get({}); - default: - LOG(FATAL) << "Unexpected type: " << literal.shape(); - } -} - // The rewriting proceeds in a bottom-up way: // // (kDot A B) is rewritten into a (kCustomCall:gemm A B) @@ -103,7 +86,7 @@ class GemmRewriterVisitor : public DfsHloRewriteVisitor { if (config.beta() == 0.0 && existing_gemm->user_count() == 1) { complex128 prev_alpha = {config.alpha_real(), config.alpha_imag()}; complex128 new_alpha = - GetScalarConstantAsComplex(alpha->literal()) * prev_alpha; + *alpha->literal().GetAsComplex128({}) * prev_alpha; config.set_alpha_real(new_alpha.real()); config.set_alpha_imag(new_alpha.imag()); TF_RETURN_IF_ERROR(existing_gemm->set_backend_config(config)); diff --git a/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc b/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc index eddc2474830..d52e5410dab 100644 --- a/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc @@ -37,12 +37,14 @@ GemmThunk::GemmThunk(const BufferAllocation::Slice &lhs_buffer, const BufferAllocation::Slice &rhs_buffer, const BufferAllocation::Slice &output_buffer, bool implements_whole_instruction, - const HloInstruction *hlo_instruction) + const HloInstruction *hlo_instruction, + const GemmBackendConfig &backend_config) : Thunk(Kind::kGemm, hlo_instruction), lhs_buffer_(lhs_buffer), rhs_buffer_(rhs_buffer), output_buffer_(output_buffer), - implements_whole_instruction_(implements_whole_instruction) {} + implements_whole_instruction_(implements_whole_instruction), + backend_config_(backend_config) {} Status GemmThunk::ExecuteOnStream(const ExecuteParams ¶ms) { auto get_device_address = [&](const BufferAllocation::Slice &slice) { @@ -53,8 +55,9 @@ Status GemmThunk::ExecuteOnStream(const ExecuteParams ¶ms) { se::DeviceMemoryBase lhs_data = get_device_address(lhs_buffer_); se::DeviceMemoryBase rhs_data = get_device_address(rhs_buffer_); se::DeviceMemoryBase output_data = get_device_address(output_buffer_); - return RunGemm(hlo_instruction(), lhs_data, rhs_data, output_data, - params.stream, implements_whole_instruction_, params.profiler); + return RunGemm(hlo_instruction(), backend_config_, lhs_data, rhs_data, + output_data, params.stream, implements_whole_instruction_, + params.profiler); } // This struct contains the metadata of a matrix, e.g., its base address and @@ -152,8 +155,9 @@ static bool DoGemmWithAlgorithm( .ok(); } -Status RunGemm(const HloInstruction *gemm, se::DeviceMemoryBase lhs_buffer, - se::DeviceMemoryBase rhs_buffer, +Status RunGemm(const HloInstruction *gemm, + const GemmBackendConfig &backend_config, + se::DeviceMemoryBase lhs_buffer, se::DeviceMemoryBase rhs_buffer, se::DeviceMemoryBase output_buffer, se::Stream *stream, bool implements_whole_instruction, HloExecutionProfiler *profiler, @@ -162,8 +166,6 @@ Status RunGemm(const HloInstruction *gemm, se::DeviceMemoryBase lhs_buffer, 
VLOG(2) << "Executing a GemmThunk"; CHECK(IsCublasGemm(*gemm)); - TF_ASSIGN_OR_RETURN(GemmBackendConfig backend_config, - gemm->backend_config()); const Shape &output_shape = gemm->shape(); const HloInstruction *lhs = gemm->operand(0); const HloInstruction *rhs = gemm->operand(1); diff --git a/tensorflow/compiler/xla/service/gpu/gemm_thunk.h b/tensorflow/compiler/xla/service/gpu/gemm_thunk.h index adf2fa853b7..b44cc40d295 100644 --- a/tensorflow/compiler/xla/service/gpu/gemm_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/gemm_thunk.h @@ -17,6 +17,7 @@ limitations under the License. #define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GEMM_THUNK_H_ #include "tensorflow/compiler/xla/service/buffer_assignment.h" +#include "tensorflow/compiler/xla/service/gpu/backend_configs.pb.h" #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h" #include "tensorflow/compiler/xla/service/gpu/gpu_executable.h" #include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h" @@ -42,7 +43,8 @@ class GemmThunk : public Thunk { const BufferAllocation::Slice& rhs_buffer, const BufferAllocation::Slice& output_buffer, bool implements_whole_instruction, - const HloInstruction* hlo_instruction); + const HloInstruction* hlo_instruction, + const GemmBackendConfig& backend_config); GemmThunk(const GemmThunk&) = delete; GemmThunk& operator=(const GemmThunk&) = delete; @@ -54,23 +56,23 @@ class GemmThunk : public Thunk { const BufferAllocation::Slice rhs_buffer_; const BufferAllocation::Slice output_buffer_; bool implements_whole_instruction_; + GemmBackendConfig backend_config_; }; // Run the given GEMM instruction `gemm` subject to the configuration -// stored inside it's backend_config and the passed buffers. +// in `backend_config` and the passed buffers. // // `implements_whole_instruction` is used for the default profiler creation // if the `profiler` is not supplied. False value indicates that the created // profiler will not specifically profile the `gemm` instruction. // -// If `algorithm` is provided, it overrides the one specified in backend_config -// of gemm. -// +// If `algorithm` is provided, it overrides the one specified in +// `backend_config`. 
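The GemmThunk changes above move backend-config parsing out of the execution path: the config is deserialized once when the thunk is constructed and stored as a member, so ExecuteOnStream and RunGemm only read a cached value. A generic sketch of that parse-once choice (all types here are placeholders, not the XLA classes):

struct GemmConfig {
  double beta = 0.0;
};

struct Instruction {
  // Stands in for the backend_config accessor, which re-parses a serialized
  // proto on every call.
  GemmConfig ParseBackendConfig() const { return GemmConfig{}; }
};

class GemmThunkSketch {
 public:
  explicit GemmThunkSketch(const Instruction& instr)
      : config_(instr.ParseBackendConfig()) {}  // parse once, at build time

  double beta() const { return config_.beta; }  // hot path: a member read

 private:
  GemmConfig config_;
};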
Status RunGemm( - const HloInstruction* gemm, se::DeviceMemoryBase lhs_buffer, - se::DeviceMemoryBase rhs_buffer, se::DeviceMemoryBase output_buffer, - se::Stream* stream, bool implements_whole_instruction, - HloExecutionProfiler* profiler = nullptr, + const HloInstruction* gemm, const GemmBackendConfig& backend_config, + se::DeviceMemoryBase lhs_buffer, se::DeviceMemoryBase rhs_buffer, + se::DeviceMemoryBase output_buffer, se::Stream* stream, + bool implements_whole_instruction, HloExecutionProfiler* profiler = nullptr, se::blas::ProfileResult* profile_result = nullptr, absl::optional algorithm = absl::nullopt); diff --git a/tensorflow/compiler/xla/service/gpu/gpu_autotuning.proto b/tensorflow/compiler/xla/service/gpu/gpu_autotuning.proto index 6ed72437bec..35b5cfacb2d 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_autotuning.proto +++ b/tensorflow/compiler/xla/service/gpu/gpu_autotuning.proto @@ -6,6 +6,7 @@ package xla.gpu; import "tensorflow/compiler/xla/service/hlo.proto"; import "tensorflow/compiler/xla/xla_data.proto"; +import "tensorflow/core/protobuf/autotuning.proto"; message ConvInstructionLog { xla.HloInstructionProto instruction = 1; @@ -13,3 +14,20 @@ message ConvInstructionLog { uint64 result_address = 3; repeated uint64 operand_addresses = 4; } + +message BlacklistedAlgorithm { + int64 id = 1; + bool tensor_ops = 2; +} + +message AlgorithmBlacklistEntry { + string hlo = 1; + tensorflow.ComputeCapability cc = 2; + tensorflow.CudnnVersion cudnn_version = 3; + string blas_version = 5; + repeated BlacklistedAlgorithm algos = 4; +} + +message AlgorithmBlacklist { + repeated AlgorithmBlacklistEntry entries = 1; +} diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc new file mode 100755 index 00000000000..de3b1efd03a --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc @@ -0,0 +1,474 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/gpu/gpu_compiler.h" + +#include + +#include +#include +#include // NOLINT(build/c++11): only using std::call_once, not mutex. 
+#include + +#include "absl/memory/memory.h" +#include "absl/strings/numbers.h" +#include "absl/strings/str_cat.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/DiagnosticPrinter.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Verifier.h" +#include "tensorflow/compiler/xla/protobuf_util.h" +#include "tensorflow/compiler/xla/service/algebraic_simplifier.h" +#include "tensorflow/compiler/xla/service/batchnorm_expander.h" +#include "tensorflow/compiler/xla/service/buffer_assignment.h" +#include "tensorflow/compiler/xla/service/call_inliner.h" +#include "tensorflow/compiler/xla/service/conditional_simplifier.h" +#include "tensorflow/compiler/xla/service/depthwise_convolution_converter.h" +#include "tensorflow/compiler/xla/service/dot_decomposer.h" +#include "tensorflow/compiler/xla/service/dump.h" +#include "tensorflow/compiler/xla/service/dynamic_index_splitter.h" +#include "tensorflow/compiler/xla/service/flatten_call_graph.h" +#include "tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_rewriter.h" +#include "tensorflow/compiler/xla/service/gpu/fusion_merger.h" +#include "tensorflow/compiler/xla/service/gpu/gpu_constants.h" +#include "tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.h" +#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h" +#include "tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.h" +#include "tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker.h" +#include "tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.h" +#include "tensorflow/compiler/xla/service/gpu/gpu_sanitize_constant_names.h" +#include "tensorflow/compiler/xla/service/gpu/gpu_scatter_expander.h" +#include "tensorflow/compiler/xla/service/gpu/instruction_fusion.h" +#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" +#include "tensorflow/compiler/xla/service/gpu/ir_emitter_context.h" +#include "tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h" +#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h" +#include "tensorflow/compiler/xla/service/gpu/multi_output_fusion.h" +#include "tensorflow/compiler/xla/service/gpu/partition_assignment.h" +#include "tensorflow/compiler/xla/service/gpu/stream_assignment.h" +#include "tensorflow/compiler/xla/service/gpu/stream_executor_util.h" +#include "tensorflow/compiler/xla/service/gpu/target_constants.h" +#include "tensorflow/compiler/xla/service/gpu/thunk_schedule.h" +#include "tensorflow/compiler/xla/service/gpu/variadic_op_splitter.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_constant_folding.h" +#include "tensorflow/compiler/xla/service/hlo_cse.h" +#include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h" +#include "tensorflow/compiler/xla/service/hlo_dce.h" +#include "tensorflow/compiler/xla/service/hlo_element_type_converter.h" +#include "tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_pass_fix.h" +#include "tensorflow/compiler/xla/service/hlo_pass_pipeline.h" +#include "tensorflow/compiler/xla/service/hlo_proto_util.h" +#include "tensorflow/compiler/xla/service/hlo_subcomputation_unification.h" +#include "tensorflow/compiler/xla/service/hlo_verifier.h" +#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" +#include "tensorflow/compiler/xla/service/mem_wasted_on_passthrough_params.h" +#include 
"tensorflow/compiler/xla/service/reduce_precision_insertion.h" +#include "tensorflow/compiler/xla/service/reshape_mover.h" +#include "tensorflow/compiler/xla/service/rng_expander.h" +#include "tensorflow/compiler/xla/service/slice_sinker.h" +#include "tensorflow/compiler/xla/service/slow_operation_alarm.h" +#include "tensorflow/compiler/xla/service/sort_simplifier.h" +#include "tensorflow/compiler/xla/service/stable_sort_expander.h" +#include "tensorflow/compiler/xla/service/transpose_folding.h" +#include "tensorflow/compiler/xla/service/tuple_simplifier.h" +#include "tensorflow/compiler/xla/service/while_loop_constant_sinking.h" +#include "tensorflow/compiler/xla/service/while_loop_simplifier.h" +#include "tensorflow/compiler/xla/service/while_loop_trip_count_annotator.h" +#include "tensorflow/compiler/xla/service/zero_sized_hlo_elimination.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/compiler/xla/util.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/gtl/cleanup.h" +#include "tensorflow/core/lib/io/path.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/regexp.h" +#include "tensorflow/core/platform/stream_executor_no_cuda.h" +#include "tensorflow/core/platform/subprocess.h" +#include "tensorflow/core/platform/tracing.h" +#include "tensorflow/core/profiler/lib/traceme.h" + +namespace xla { +namespace gpu { + +GpuCompiler::GpuCompiler(se::Platform::Id platform_id, + const char* target_triple, const char* data_layout) + : platform_id_(platform_id), + target_triple_(target_triple), + data_layout_(data_layout), + pointer_size_(llvm::DataLayout(data_layout) + .getPointerSize(0 /* default address space */)) {} + +// Runs optimization passes on the given HLO module. +Status GpuCompiler::OptimizeHloModule( + HloModule* hlo_module, se::StreamExecutor* stream_exec, + se::DeviceMemoryAllocator* device_allocator) { + { + HloPassPipeline pipeline("optimization"); + pipeline.AddInvariantChecker(/*layout_sensitive=*/false, + /*allow_mixed_precision=*/false); + + // Expand random number generation. + pipeline.AddPass(); + + // Remove zero-sized HLO from the input so that other passes don't have to + // handle it. + pipeline.AddPass(); + + pipeline.AddPass(); + + pipeline.AddPass(); + pipeline.AddPass(); + ReducePrecisionInsertion::AddPasses( + &pipeline, hlo_module->config().debug_options(), + ReducePrecisionInsertion::PassTiming::BEFORE_OPTIMIZATION); + + // TODO(b/64094172): make Call work on GPU instead of inlining. + pipeline.AddPass(); + auto cost_model = [](HloInstruction* conv) { + // We need a cost model for GPUs. Currently, do nothing. + return false; + }; + pipeline.AddPass(); + pipeline.AddPass(cost_model); + // Expand the sort op to support stable sorting if required. + pipeline.AddPass(); + // Convert BF16 operations to F32 operations so that the GPU backend can + // support BF16 operations without directly implementing a BF16 lowering for + // most ops. + pipeline.AddPass(BF16, F32); + + { + auto& pass = + pipeline.AddPass>("simplification"); + pass.AddInvariantChecker(/*layout_sensitive=*/false, + /*allow_mixed_precision=*/false); + + // If cudnn batchnorms are enabled, rewrite batchnorm HLOs to cudnn calls + // where possible. Not every batchnorm op can be implemented as a call to + // cudnn, so decompose any remaining batchnorm ops into a soup of HLOs. 
+ if (hlo_module->config().debug_options().xla_gpu_use_cudnn_batchnorm()) { + pass.AddPass(); + } + pass.AddPass( + /*rewrite_training_op=*/true, + /*rewrite_inference_op=*/true, + /*rewrite_grad_op=*/true); + + pipeline.AddPass(); + + // BatchNormExpander can create zero-sized ops, so zero-sized HLO + // elimination has to come after that pass. + pipeline.AddPass(); + + AlgebraicSimplifierOptions options; + pass.AddPass(options); + pass.AddPass(); + pass.AddPass(); + pass.AddPass(); + pass.AddPass(); + + // TODO(b/134075051): Re-enable after b/134075051 is fixed. + // pass.AddPass(); + + pass.AddPass(); + pass.AddPass(); + pass.AddPass(); + pass.AddPass(); + } + + pipeline.AddPass( + [](const HloInstruction& dot, + const TransposeFolding::OperandIndices& candidate_operands) { + return IsMatrixMultiplication(dot) + ? candidate_operands + : TransposeFolding::OperandIndices{}; + }, + TransposeFolding::NeverFoldTranspose); + pipeline.AddPass(/*is_layout_sensitive=*/false); + pipeline.AddPass(); + + // Run WhileLoopTripCountAnnotator at the end of the simplification + // pipeline, before layout assignment and fusion. This pass does some + // pattern-matching on while bodies/conditions, and this is where the HLO is + // "nicest". + // + // It's important that we don't make semantic changes (e.g. unrolling) to + // any `while` loops after this point, because otherwise the trip-count + // annotations added by this pass may not be correct after the + // modifications. + pipeline.AddPass(); + TF_RETURN_IF_ERROR(pipeline.Run(hlo_module).status()); + } + + // Run target-specific HLO optimization passes for convolution + // canonicalization. + TF_RETURN_IF_ERROR(OptimizeHloConvolutionCanonicalization( + hlo_module, stream_exec, device_allocator)); + + { + // Run layout assignment in a separate pipeline from + // "post-layout-assignment" because we want everything after layout + // assignment to have a layout-sensitive invariant-checker, but + // HloPassPipeline also runs its invariant checker before any passes are + // run, meaning, the pipeline that contains layout assignment cannot contain + // a layout-sensitive verifier! + HloPassPipeline pipeline("layout assignment"); + pipeline.AddPass( + hlo_module->mutable_entry_computation_layout(), + LayoutAssignment::InstructionCanChangeLayout, stream_exec); + TF_RETURN_IF_ERROR(pipeline.Run(hlo_module).status()); + } + + // Run target-specific HLO optimization passes after layout assignment. + TF_RETURN_IF_ERROR(OptimizeHloPostLayoutAssignment(hlo_module, stream_exec, + device_allocator)); + + { + HloPassFix fusion("fusion"); + // We try to split variadic ops with many parameters into several such ops + // to avoid exceeding the parameter space. + fusion.AddPass(); + /* TODO(b/117531509): Use LayoutAssignment::InstructionCanChangeLayout after + * fixing the ticket. */ + fusion.AddInvariantChecker( + /*layout_sensitive=*/true, + /*allow_mixed_precision=*/false, + LayoutAssignment::InstructionCanChangeLayout); + fusion.AddPass(/*may_duplicate=*/false); + fusion.AddPass(/*may_duplicate=*/true); + fusion.AddPass(); + fusion.AddPass(); + fusion.AddPass(/*is_layout_sensitive=*/true, + /*only_fusion_computations=*/true); + fusion.AddPass(); + TF_RETURN_IF_ERROR(fusion.Run(hlo_module).status()); + + HloPassPipeline reduce_pipeline("reduce-precision"); + /* TODO(b/117531509): Use LayoutAssignment::InstructionCanChangeLayout after + * fixing the ticket. 
*/ + reduce_pipeline.AddInvariantChecker( + /*is_layout_sensitive=*/true, /*allow_mixed_precision=*/false, + LayoutAssignment::InstructionCanChangeLayout); + ReducePrecisionInsertion::AddPasses( + &reduce_pipeline, hlo_module->config().debug_options(), + ReducePrecisionInsertion::PassTiming::AFTER_FUSION); + StatusOr reduce_result = reduce_pipeline.Run(hlo_module); + TF_RETURN_IF_ERROR(reduce_result.status()); + + if (reduce_result.ValueOrDie()) { + // Do another fusion pass, with the expectation that we may be able to + // fuse the new ReducePrecision operations. + TF_RETURN_IF_ERROR(fusion.Run(hlo_module).status()); + } + } + + return Status::OK(); +} + +// Modifies the given HLO module so that it will be accepted by IrEmitter. +// Unlike optimization passes, the passes are necessary for correctness. +Status GpuCompiler::PrepareHloModuleForIrEmitting(HloModule* hlo_module) { + // In some cases, we have to place the result of an instruction in a temporary + // buffer. For instance, the buffer that holds an external parameter is + // assumed immutable at this point, and should not be reused for output + // (b/27180329). Therefore, in that case, we set the output to be a copy of + // the parameter. + HloPassPipeline pipeline("GPU-ir-emit-prepare"); + /* TODO(b/117531509): Use LayoutAssignment::InstructionCanChangeLayout after + * fixing the ticket. */ + pipeline.AddInvariantChecker( + /*layout_sensitive=*/true, + /*allow_mixed_precision=*/false, + LayoutAssignment::InstructionCanChangeLayout); + + // Copy insertion should be performed immediately before IR emission to avoid + // inserting unnecessary copies (later pass adds an instruction which + // materializes the value) or missing a necessary copy (later pass removes an + // instruction which materializes a value). DCE must be run immediately before + // (and sometime after) copy insertion, to avoid dead code from interfering + // with the rewrites. + pipeline.AddPass(); + pipeline.AddPass(); + // The following pass LOGs memory waste. Add it when VLOGing is enabled only. + if (VLOG_IS_ON(2)) { + pipeline.AddPass(); + } + pipeline.AddPass(GetCanShareBuffer()); + pipeline.AddPass(); + return pipeline.Run(hlo_module).status(); +} + +StatusOr> GpuCompiler::RunHloPasses( + std::unique_ptr module, se::StreamExecutor* stream_exec, + se::DeviceMemoryAllocator* device_allocator) { + // We dump the post-optimization HLO in RunBackend so no need to dump it here. 
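The simplification, fusion, and reduce-precision groups above are wrapped in HloPassFix, i.e. repeated until a full sweep reports no change. The driver loop, reduced to standard C++ (Module and Pass are placeholders for HloModule and the pass interface):

#include <functional>
#include <vector>

struct Module {};                            // stands in for HloModule
using Pass = std::function<bool(Module&)>;   // returns true if it changed IR

// HloPassFix in miniature: rerun the group until nothing changes.
bool RunToFixedPoint(const std::vector<Pass>& passes, Module& module) {
  bool changed_any = false;
  bool changed = true;
  while (changed) {
    changed = false;
    for (const Pass& pass : passes) {
      changed |= pass(module);
    }
    changed_any |= changed;
  }
  return changed_any;
}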
+ XLA_SCOPED_LOGGING_TIMER("GpuCompiler::RunHloPasses"); + tensorflow::profiler::TraceMe activity( + [&] { return absl::StrCat("HLO Transforms:", module->name()); }, + tensorflow::profiler::TraceMeLevel::kInfo); + TF_RETURN_IF_ERROR( + OptimizeHloModule(module.get(), stream_exec, device_allocator)); + + TF_RETURN_IF_ERROR(PrepareHloModuleForIrEmitting(module.get())); + + return std::move(module); +} + +StatusOr> GpuCompiler::RunBackend( + std::unique_ptr module, se::StreamExecutor* stream_exec, + se::DeviceMemoryAllocator* device_allocator) { + XLA_SCOPED_LOGGING_TIMER("GpuCompiler::RunBackend"); + auto slow_compile_alarm = SlowCompilationAlarm(); + + TF_RET_CHECK(stream_exec != nullptr); + + llvm::LLVMContext llvm_context; + std::string buffer; + llvm::raw_string_ostream error(buffer); + llvm::DiagnosticPrinterRawOStream printer(error); + auto DiagnosticHandler = [](const llvm::DiagnosticInfo& diag_info, + void* Context) { + auto printer = static_cast(Context); + diag_info.print(*printer); + }; + llvm_context.setDiagnosticHandlerCallBack(DiagnosticHandler, &printer); + + llvm::Module llvm_module(module->name().c_str(), llvm_context); + // Set the target triple and the data layout. + llvm_module.setTargetTriple(target_triple_); + llvm_module.setDataLayout(data_layout_); + + // Determine the HLO schedule, which is an ordering of HLO instructions. This + // is used by buffer assignment to enable buffer reuse, and the same ordering + // must also be used to determine the thunk launch schedule. + std::unique_ptr stream_assignment = AssignStreams(*module); + TF_ASSIGN_OR_RETURN( + std::unique_ptr hlo_schedule, + GpuHloSchedule::Build(*module, *stream_assignment, pointer_size_)); + + // Run buffer analysis on the HLO graph. This analysis figures out which + // temporary buffers are required to run the computation. + TF_ASSIGN_OR_RETURN( + std::unique_ptr buffer_assignment, + BufferAssigner::Run( + module.get(), hlo_schedule->ConsumeHloOrdering(), + BufferSizeBytesFunction(), + /*color_alignment=*/ + [](LogicalBuffer::Color) { return kXlaAllocatedBufferAlignBytes; }, + /*allocate_buffers_for_constants=*/true, + /*colorer=*/BufferAssigner::DefaultColorer(), + /*must_not_live_out=*/{}, GetCanShareBuffer())); + DumpHloModuleIfEnabled(*module, *buffer_assignment, "after_optimizations"); + + IrEmitterContext ir_emitter_context( + module.get(), buffer_assignment.get(), stream_exec->platform(), + &stream_exec->GetDeviceDescription(), &llvm_module); + + HloComputation* entry_computation = module->entry_computation(); + IrEmitterUnnested ir_emitter(module->config(), entry_computation, + &ir_emitter_context); + + TF_RETURN_IF_ERROR(ir_emitter.EmitConstantGlobals()); + + { + XLA_SCOPED_LOGGING_TIMER("GpuCompiler::RunBackend - IR emission"); + TF_RETURN_IF_ERROR(entry_computation->Accept(&ir_emitter)); + } + + if (user_pre_optimization_hook_) { + user_pre_optimization_hook_(llvm_module); + } + string ir_module_string_before_opt; + const bool embed_ir_in_executable = + module->config().debug_options().xla_embed_ir_in_executable(); + if (embed_ir_in_executable) { + ir_module_string_before_opt = llvm_ir::DumpModuleToString(llvm_module); + } + + llvm_ir::DumpIrIfEnabled(*module, llvm_module, /*optimized=*/false); + + { + XLA_SCOPED_LOGGING_TIMER("GpuCompiler::RunBackend - Running LLVM verifier"); + + std::string err; + llvm::raw_string_ostream err_stream(err); + + // verifyModule() returns true if the module is broken. 
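RunBackend above checks the freshly emitted module with llvm::verifyModule, which follows LLVM's inverted convention of returning true when the module is broken. A standalone helper showing that contract (the LLVM calls are real; the TF_RET_CHECK wrapper is XLA's own):

#include <string>

#include "llvm/IR/Module.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Support/raw_ostream.h"

// Returns true if `module` passes the verifier; on failure the diagnostics
// are copied into *error_out.
bool ModuleIsValid(const llvm::Module& module, std::string* error_out) {
  std::string err;
  llvm::raw_string_ostream err_stream(err);
  const bool broken = llvm::verifyModule(module, &err_stream);
  if (broken && error_out != nullptr) {
    *error_out = err_stream.str();
  }
  return !broken;
}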
+ TF_RET_CHECK(!llvm::verifyModule(llvm_module, &err_stream)) + << "Invalid LLVM IR before optimizations:\n" + << err_stream.str() + << "\nThis probably indicates a bug in the HLO -> LLVM IR lowering. " + "Rerun with --xla_dump_to to get the IR. "; + } + + GpuVersion gpu_version = GetGpuVersion(stream_exec); + + using BackendCompileResult = std::pair>; + TF_ASSIGN_OR_RETURN(BackendCompileResult backend_result, + CompileTargetBinary(module.get(), &llvm_module, + gpu_version, stream_exec)); + + auto thunk_schedule = absl::make_unique( + ir_emitter.ConsumeThunkSequence(), std::move(stream_assignment), + hlo_schedule->ThunkLaunchOrder()); + if (DumpingEnabledForHloModule(*module)) { + DumpToFileInDirOrStdout(*module, "thunk_schedule", + thunk_schedule->ToString()); + } + + std::unique_ptr profile_index_map; + std::unique_ptr profile_printer; + + if (module->config().hlo_profiling_enabled() || VLOG_IS_ON(1)) { + HloCostAnalysis cost_analysis(ShapeSizeBytesFunction()); + cost_analysis.set_bytes_per_second( + stream_exec->GetDeviceDescription().memory_bandwidth()); + TF_RETURN_IF_ERROR(module->entry_computation()->Accept(&cost_analysis)); + VLOG(1) << "HLO memory read+written: " + << tensorflow::strings::HumanReadableNumBytes( + cost_analysis.bytes_accessed()); + if (module->config().hlo_profiling_enabled()) { + profile_index_map = absl::make_unique(*module); + profile_printer = CreateHloProfilePrinterData( + *profile_index_map, cost_analysis, entry_computation->name()); + } + } + + auto* gpu_executable = new GpuExecutable( + backend_result.first, backend_result.second, gpu_version, + std::move(thunk_schedule), std::move(module), + std::move(buffer_assignment), std::move(profile_printer), + std::move(profile_index_map)); + if (embed_ir_in_executable) { + DCHECK_NE("", ir_module_string_before_opt); + gpu_executable->set_ir_module_string(ir_module_string_before_opt); + } + return std::unique_ptr(gpu_executable); +} + +StatusOr>> +GpuCompiler::CompileAheadOfTime(std::unique_ptr module_group, + const AotCompilationOptions& options) { + return Unimplemented("not yet implemented: GpuCompiler::CompileAheadOfTime"); +} + +} // namespace gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.h b/tensorflow/compiler/xla/service/gpu/gpu_compiler.h new file mode 100644 index 00000000000..901d994d4ad --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.h @@ -0,0 +1,120 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_COMPILER_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_COMPILER_H_ + +#include +#include +#include + +#include "tensorflow/compiler/xla/service/executable.h" +#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h" +#include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/llvm_compiler.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/core/lib/hash/hash.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/stream_executor_no_cuda.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/stream_executor/stream_executor_pimpl.h" + +namespace xla { +namespace gpu { + +// The GPU compiler generates efficient GPU executables. +class GpuCompiler : public LLVMCompiler { + public: + GpuCompiler(se::Platform::Id platform_id, const char* target_triple, + const char* data_layout); + ~GpuCompiler() override {} + + // Bring in + // StatusOr>> Compile( + // std::vector> modules, + // std::vector> + // stream_execs) + using LLVMCompiler::Compile; + + StatusOr> RunHloPasses( + std::unique_ptr module, se::StreamExecutor* stream_exec, + se::DeviceMemoryAllocator* device_allocator) override; + + Status OptimizeHloModule(HloModule* hlo_module, + se::StreamExecutor* stream_exec, + se::DeviceMemoryAllocator* device_allocator); + + virtual Status OptimizeHloConvolutionCanonicalization( + HloModule* hlo_module, se::StreamExecutor* stream_exec, + se::DeviceMemoryAllocator* device_allocator) = 0; + + virtual Status OptimizeHloPostLayoutAssignment( + HloModule* hlo_module, se::StreamExecutor* stream_exec, + se::DeviceMemoryAllocator* device_allocator) = 0; + + virtual HloDataflowAnalysis::CanShareBuffer GetCanShareBuffer() { + return + [](const HloInstruction*, const HloInstruction*, + const ShapeIndex&) -> absl::optional { return absl::nullopt; }; + } + + virtual GpuVersion GetGpuVersion(se::StreamExecutor* stream_exec) = 0; + + virtual StatusOr>> + CompileTargetBinary(const HloModule* hlo_module, llvm::Module* llvm_module, + GpuVersion gpu_version, + se::StreamExecutor* stream_exec) = 0; + + Status PrepareHloModuleForIrEmitting(HloModule* hlo_module); + + StatusOr> RunBackend( + std::unique_ptr module, se::StreamExecutor* stream_exec, + se::DeviceMemoryAllocator* device_allocator) override; + + StatusOr>> + CompileAheadOfTime(std::unique_ptr module_group, + AotCompilationOptions const& options) override; + + se::Platform::Id PlatformId() const override { return platform_id_; } + + HloCostAnalysis::ShapeSizeFunction ShapeSizeBytesFunction() const override { + // Capture just the pointer size, not the entire GpuCompiler object. + int64 pointer_size = pointer_size_; + return [pointer_size](const Shape& shape) { + return ShapeUtil::ByteSizeOf(shape, pointer_size); + }; + } + + private: + se::Platform::Id platform_id_; + + // The triple that represents our target. + const char* target_triple_; + + // The data layout of the emitted module. + const char* data_layout_; + + // The size in bytes of a pointer. Used by ShapeSizeBytesFunction. 
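ShapeSizeBytesFunction above copies pointer_size_ into a local before forming the lambda so the returned callable does not capture `this` and stays valid after the compiler object is gone. The same capture choice in isolation (placeholder class and a hypothetical byte-size formula, not the XLA shape-size logic):

#include <cstdint>
#include <functional>

class SizeSourceSketch {
 public:
  std::function<int64_t(int64_t)> ByteSizeFn() const {
    // Capture the one value we need by copy; capturing [this] would tie the
    // lambda's validity to this object's lifetime.
    const int64_t pointer_size = pointer_size_;
    return [pointer_size](int64_t num_elements) {
      return num_elements * pointer_size;
    };
  }

 private:
  int64_t pointer_size_ = 8;
};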
+ const int64 pointer_size_; + + TF_DISALLOW_COPY_AND_ASSIGN(GpuCompiler); +}; + +} // namespace gpu +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_COMPILER_H_ diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc index e4942bd76a6..abf2cd1f23f 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc @@ -81,11 +81,12 @@ void GpuExecutable::ComputeThunkAnnotations() { for (Thunk* thunk : thunk_schedule_->TotalOrder()) { const HloInstruction* hlo = thunk->hlo_instruction(); CHECK(hlo); - thunk_annotations_[thunk] = absl::StrFormat( - "%s:#tf_op=%s,hlo_op=%s,hlo_module=%s#", - hlo->ToStringWithCanonicalNameMap(HloPrintOptions::Canonical(), - &canonical_name_map), - hlo->metadata().op_name(), hlo->name(), hlo->GetModule()->name()); + thunk_annotations_[thunk] = + absl::StrFormat("%s:#tf_op=%s:%s,hlo_op=%s,hlo_module=%s#", + hlo->ToStringWithCanonicalNameMap( + HloPrintOptions::Canonical(), &canonical_name_map), + hlo->metadata().op_name(), hlo->metadata().op_type(), + hlo->name(), hlo->GetModule()->name()); } } @@ -195,10 +196,11 @@ Status GpuExecutable::ExecuteThunks( } main_stream->ThenWaitFor(&sub_streams); - // Make sure kernels are completed before deallocating temporary buffers. + // Make sure kernels are completed before deallocating temporary buffers or + // the profiler state. // TODO(b/30100571): we could potentially postpone deallocating the temp // buffers until a different computation is executed. - if (block_host_until_done) { + if (do_profile || block_host_until_done) { Status block_status = main_stream->BlockHostUntilDone(); if (!block_status.ok()) { return InternalError( @@ -207,17 +209,20 @@ Status GpuExecutable::ExecuteThunks( } } + // FinishExecution() blocks until main_stream has completed if profiling is + // enabled; we therefore do not need to defer profile collection onto a + // stream. profiler.FinishExecution(); uint64 end_micros = tensorflow::Env::Default()->NowMicros(); - { - tensorflow::mutex_lock lock(mutex_); + if (run_options->run_options().execution_profile()) { + ExecutionProfile* profile = run_options->run_options().execution_profile(); const double nanoseconds = (end_micros - start_micros) * 1000.0; - execution_profile_.set_compute_time_ns(std::max(nanoseconds, 1.0)); + profile->set_compute_time_ns(std::max(nanoseconds, 1.0)); // If hlo profiling was disabled then the cycle count is left empty. if (do_profile) { - execution_profile_.set_compute_cycle_count( + profile->set_compute_cycle_count( hlo_execution_profile->total_cycles_executed( *module().entry_computation())); } @@ -241,8 +246,14 @@ GpuExecutable::ResolveConstantGlobals(se::StreamExecutor* executor) { module_spec.AddCudaPtxInMemory(text().c_str()); absl::flat_hash_map globals; + if (executor->platform_kind() == se::PlatformKind::kCuda && + module_spec.cuda_ptx_in_memory() == nullptr) { + // No custom PTX => no globals. 
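The ExecuteThunks profiling hunk above now writes wall time into the caller-provided ExecutionProfile. The conversion it performs, in isolation: elapsed microseconds are scaled to nanoseconds and clamped so a zero-length measurement still reports at least 1 ns.

#include <algorithm>
#include <cstdint>

double ComputeTimeNs(uint64_t start_micros, uint64_t end_micros) {
  const double nanoseconds =
      static_cast<double>(end_micros - start_micros) * 1000.0;
  return std::max(nanoseconds, 1.0);  // never report a zero compute time
}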
+ return &module_globals_.emplace(executor, std::move(globals)).first->second; + } + se::ModuleHandle module_handle; - executor->LoadModule(module_spec, &module_handle); + TF_RETURN_IF_ERROR(executor->LoadModule(module_spec, &module_handle)); for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size(); ++i) { @@ -402,25 +413,16 @@ StatusOr GpuExecutable::Execute( return std::move(shaped_buffer); } -StatusOr GpuExecutable::ExecuteOnStream( +StatusOr GpuExecutable::ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, absl::Span arguments, HloExecutionProfile* hlo_execution_profile) { - // TODO(b/134086343): ExecuteOnStream should not be async according to the - // documentation, instead ExecuteAsyncOnStream should be used. - return Execute(run_options, arguments, hlo_execution_profile, - /*block_host_until_done=*/ - !run_options->allocator()->AllowsAsynchronousDeallocation()); -} - -StatusOr GpuExecutable::ExecuteAsyncOnStream( - const ServiceExecutableRunOptions* run_options, - absl::Span arguments) { se::DeviceMemoryAllocator* memory_allocator = run_options->allocator(); // Force synchronous execution if the allocator requires it. bool block_host_until_done = !memory_allocator->AllowsAsynchronousDeallocation(); - return Execute(run_options, arguments, nullptr, block_host_until_done); + return Execute(run_options, arguments, hlo_execution_profile, + block_host_until_done); } const InstructionValueSet& GpuExecutable::GetRootValueSet() const { @@ -428,5 +430,14 @@ const InstructionValueSet& GpuExecutable::GetRootValueSet() const { module().entry_computation()->root_instruction()); } +int64 GpuExecutable::SizeOfGeneratedCodeInBytes() { + // Non-empty PTX but empty cubin: compilation must have failed, return + // "unknown". + if (binary().empty() && !text_.empty()) { + return -1; + } + return binary().size(); +} + } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.h b/tensorflow/compiler/xla/service/gpu/gpu_executable.h index 5f9fe3e71ef..0175e31568c 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_executable.h +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.h @@ -61,6 +61,8 @@ class GpuExecutable : public Executable { std::unique_ptr hlo_profile_index_map); ~GpuExecutable() override; + int64 SizeOfGeneratedCodeInBytes() override; + // This should be called after set_ir_module_string. const string& ir_module_string() const { return ir_module_string_; } @@ -78,17 +80,13 @@ class GpuExecutable : public Executable { // compilation is left up to the GPU driver. const std::vector& binary() const { return binary_; } - // ExecuteOnStream will fail if the compute capability of the stream doesn't - // match the compute capability passed to this object's constructor. - StatusOr ExecuteOnStream( + // ExecuteAsyncOnStream will fail if the compute capability of the stream + // doesn't match the compute capability passed to this object's constructor. 
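SizeOfGeneratedCodeInBytes above reports -1 ("unknown") when PTX text exists but no cubin was produced, and the cubin size otherwise. The same decision as a free function (std::string and std::vector stand in for text() and binary()):

#include <cstdint>
#include <string>
#include <vector>

int64_t SizeOfGeneratedCode(const std::string& ptx,
                            const std::vector<uint8_t>& cubin) {
  if (cubin.empty() && !ptx.empty()) {
    return -1;  // PTX was emitted but compilation to a binary did not happen
  }
  return static_cast<int64_t>(cubin.size());
}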
+ StatusOr ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, absl::Span arguments, HloExecutionProfile* hlo_execution_profile) override; - StatusOr ExecuteAsyncOnStream( - const ServiceExecutableRunOptions* run_options, - absl::Span arguments) override; - std::shared_ptr GetBufferAssignment() const { return assignment_; } diff --git a/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc b/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc index 2d266b9bc73..c5c79f63e81 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc @@ -16,9 +16,11 @@ limitations under the License. #include "tensorflow/compiler/xla/service/gpu/gpu_fusible.h" #include +#include #include #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/shape.h" @@ -26,8 +28,8 @@ limitations under the License. namespace xla { namespace gpu { - namespace { + void AppendParams(const HloInstruction& instr, std::vector* params) { if (instr.opcode() == HloOpcode::kFusion) { @@ -39,6 +41,25 @@ void AppendParams(const HloInstruction& instr, } } } + +bool CodegensIntoLoop(const HloInstruction& instr) { + CHECK_NE(instr.opcode(), HloOpcode::kFusion) << "`instr` has to be unfused."; + if (instr.opcode() == HloOpcode::kReduce && + !IsReductionFromOrToContiguousDimensions(instr)) { + return true; + } + // Reduce window codegens into loop only when windows overlap, i.e. stride is + // less than window size. + if (instr.opcode() == HloOpcode::kReduceWindow) { + for (const auto& dim : instr.window().dimensions()) { + if (dim.size() > dim.stride()) { + return true; + } + } + } + return false; +} + } // namespace bool LayoutsAreReduceInputFusionFriendly(const HloInstruction& producer, @@ -202,19 +223,16 @@ bool IsProducerConsumerFusible(const HloInstruction& producer, if (!IsLoopFusible(producer) || !IsFusible(consumer)) { return false; } - // Skip multiple output fusion. It's not yet supported. if (producer.IsMultiOutputFusion()) { return false; } - // Do not fuse into reduce input fusions if the resulting kernel would suffer // from poor data locality (due to unfriendly input layouts). if (IsInputFusibleReduction(consumer) && !LayoutsAreReduceInputFusionFriendly(producer, consumer)) { return false; } - // We can't fuse library calls, so if a user of such an op could become a // bitcast, leave it unfused. See `xla::InstructionFusion::ShouldFuse` for // further rationale. @@ -222,7 +240,6 @@ bool IsProducerConsumerFusible(const HloInstruction& producer, ImplementedAsLibraryCall(*producer.operand(0))) { return false; } - // Fuse scalar constants into loop fusion nodes. This reduces the number of // parameters and makes matching scalar broadcasts easier. 
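CodegensIntoLoop above treats a reduce-window as loop-generating only when its windows overlap, i.e. some dimension's stride is smaller than its window size. That test by itself (WindowDim is a stand-in for the window dimension proto):

#include <cstdint>
#include <vector>

struct WindowDim {
  int64_t size;
  int64_t stride;
};

bool WindowsOverlap(const std::vector<WindowDim>& dims) {
  for (const WindowDim& dim : dims) {
    if (dim.size > dim.stride) {  // e.g. size=3, stride=1: neighbours overlap
      return true;
    }
  }
  return false;
}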
// @@ -235,7 +252,6 @@ bool IsProducerConsumerFusible(const HloInstruction& producer, return ShapeUtil::IsEffectiveScalar(producer.shape()) && consumer.opcode() == HloOpcode::kFusion; } - return true; } @@ -249,15 +265,12 @@ bool IsProducerConsumerMultiOutputFusible(const HloInstruction& producer, if (!IsLoopFusible(producer) || !IsFusibleAsMultiOutputFusionRoot(consumer)) { return false; } - if (!ShapesCompatibleForMultiOutputFusion(producer, consumer)) { return false; } - if (!LayoutsAreReduceInputFusionFriendly(producer, consumer)) { return false; } - return true; } @@ -323,6 +336,71 @@ bool FusionWouldBeTooLarge(const HloInstruction& instr1, return operands.size() + num_output_buffers > kMaxOperandsAndOutputsPerFusion; } +bool CreatesNestedLoop(const HloInstruction& producer, + const HloInstruction& consumer) { + // If producer does not have an instruction that codegens a loop then there is + // nothing to do. + auto producer_has_loop_codegen = [&](const HloInstruction& instr) { + if (producer.opcode() != HloOpcode::kFusion) { + return CodegensIntoLoop(producer); + } + for (const auto& instr : producer.fused_instructions()) { + if (CodegensIntoLoop(*instr)) { + return true; + } + } + return false; + }; + if (!producer_has_loop_codegen(producer)) { + return false; + } + + // If consumer is a non-fusion instruction then we have to check if it + // generates a loop. + if (consumer.opcode() != HloOpcode::kFusion) { + return CodegensIntoLoop(consumer); + } + + // If consumer is a fusion then we have to check if the output of producer is + // used directly or indirectly as an input to an HLO instruction that + // generates a loop, i.e. there is a path in the graph from an operand + // corresponding to the producer to an HLO instruction generating a loop in + // the consumer. + for (const HloInstruction* operand : consumer.operands()) { + if (operand != &producer) { + continue; + } + + const HloInstruction* root = + consumer.fused_instructions_computation()->parameter_instruction( + consumer.operand_index(operand)); + + std::stack dfs; + dfs.push(root); + absl::flat_hash_set visited; + while (!dfs.empty()) { + const HloInstruction* cur = dfs.top(); + dfs.pop(); + + if (visited.contains(cur)) { + continue; + } + visited.insert(cur); + + if (CodegensIntoLoop(*cur)) { + return true; + } + for (const auto& user : cur->users()) { + if (visited.contains(user)) { + continue; + } + dfs.push(user); + } + } + } + return false; +} + bool IsFusibleAsMultiOutputFusionRoot(const HloInstruction& instr) { // We can fuse reduces and loop fusions. Elementwise instructions can be fused // with any other instruction. diff --git a/tensorflow/compiler/xla/service/gpu/gpu_fusible.h b/tensorflow/compiler/xla/service/gpu/gpu_fusible.h index 4956bf096a0..145975e6f49 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_fusible.h +++ b/tensorflow/compiler/xla/service/gpu/gpu_fusible.h @@ -67,6 +67,11 @@ bool IsInputFusibleScatter(const HloInstruction& instr); bool FusionWouldBeTooLarge(const HloInstruction& instr1, const HloInstruction& instr2); +// Check if fusing producer and consumer will generate a nested loop, e.g. both +// producer and consumer are `reduce-window` HLO instructions. +bool CreatesNestedLoop(const HloInstruction& producer, + const HloInstruction& consumer); + // Whether instruction shapes are compatible for multi-output fusion, i.e. // whether the emitters support lowering the resulting fusion. 
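The consumer-side half of CreatesNestedLoop above is a reachability walk: starting from the fusion parameter fed by the producer, follow user edges until an instruction that codegens into a loop is found. The walk reduced to standard containers (Node and the predicate are placeholders for HloInstruction and CodegensIntoLoop):

#include <functional>
#include <stack>
#include <unordered_set>
#include <vector>

struct Node {
  std::vector<const Node*> users;
};

// True if some node reachable from `start` (inclusive) satisfies `pred`.
bool ReachesMatchingNode(const Node* start,
                         const std::function<bool(const Node&)>& pred) {
  std::stack<const Node*> dfs;
  std::unordered_set<const Node*> visited;
  dfs.push(start);
  while (!dfs.empty()) {
    const Node* cur = dfs.top();
    dfs.pop();
    if (!visited.insert(cur).second) {
      continue;  // already expanded
    }
    if (pred(*cur)) {
      return true;
    }
    for (const Node* user : cur->users) {
      if (visited.count(user) == 0) {
        dfs.push(user);
      }
    }
  }
  return false;
}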
// This function works for both, sibling and producer-consumer multi-output diff --git a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc index 2879acecbce..550f4662b55 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc @@ -166,7 +166,7 @@ Status GpuLayoutAssignment::AddBackendConstraintsToDnnConvCustomCall( // instr->operand(2), if exists, is the bias buffer. There is no need to // assign layout to it, as it has only one dimension. - // instr->opernad(3), if exists, is the side input buffer. + // instr->operand(3), if exists, is the side input buffer. if (instr->operand_count() == 4) { if (kind != CudnnConvKind::kForwardActivation) { return InternalError( diff --git a/tensorflow/compiler/xla/service/gpu/hlo_algorithm_blacklist.cc b/tensorflow/compiler/xla/service/gpu/hlo_algorithm_blacklist.cc new file mode 100644 index 00000000000..013fffe4fa8 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/hlo_algorithm_blacklist.cc @@ -0,0 +1,88 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/gpu/hlo_algorithm_blacklist.h" + +#include "absl/container/flat_hash_map.h" +#include "absl/strings/string_view.h" +#include "tensorflow/compiler/xla/debug_options_flags.h" +#include "tensorflow/compiler/xla/service/gpu/gpu_autotuning.pb.h" + +namespace xla { +namespace gpu { + +constexpr absl::string_view kDefaultBlacklist = R"pb( + entries { + hlo: "(f16[256,112,112,64]{3,2,1,0}, u8[0]{0}) custom-call(f16[256,224,224,4]{3,2,1,0}, f16[7,7,4,64]{2,1,0,3}), window={size=7x7 stride=2x2 pad=3_3x3_3}, dim_labels=b01f_01io->b01f, custom_call_target=\"__cudnn$convForward\", backend_config=\"{conv_result_scale:1}\"" + cc { major: 7 } + cudnn_version { major: 7 minor: 6 patch: 2 } + blas_version: "10201" + algos { id: 1 tensor_ops: true } + } + entries { + hlo: "(f16[7,7,4,64]{2,1,0,3}, u8[0]{0}) custom-call(f16[256,224,224,4]{3,2,1,0}, f16[256,112,112,64]{3,2,1,0}), window={size=7x7 stride=2x2 pad=3_3x3_3}, dim_labels=b01f_01io->b01f, custom_call_target=\"__cudnn$convBackwardFilter\", backend_config=\"{conv_result_scale:1}\"" + cc { major: 7 } + cudnn_version { major: 7 minor: 6 patch: 2 } + blas_version: "10201" + algos { id: 1 tensor_ops: true } + })pb"; + +absl::Span +GetBlacklistedConvAlgorithms(tensorflow::ComputeCapability cc, + tensorflow::CudnnVersion cudnn_version, + absl::string_view blas_version, + absl::string_view hlo) { + // Key is the tuple of canonicalized hlo, compute capability major/minor, + // cudnn version major/minor/patch, blas version. 
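The lookup described in the key comment above amounts to a map keyed on (hlo text, compute capability, cuDNN version, BLAS version). A shrunken sketch using std::map and a std::tuple key; the real code uses absl::flat_hash_map and stream_executor::dnn::AlgorithmDesc values:

#include <cstdint>
#include <map>
#include <string>
#include <tuple>
#include <vector>

// (hlo, cc_major, cc_minor, cudnn_major, cudnn_minor, cudnn_patch, blas)
using BlacklistKey =
    std::tuple<std::string, int, int, int, int, int, std::string>;

struct AlgoId {
  int64_t id;
  bool tensor_ops;
};

using Blacklist = std::map<BlacklistKey, std::vector<AlgoId>>;

std::vector<AlgoId> LookupBlacklistedAlgos(const Blacklist& blacklist,
                                           const BlacklistKey& key) {
  auto it = blacklist.find(key);
  return it == blacklist.end() ? std::vector<AlgoId>{} : it->second;
}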
+ using MapType = absl::flat_hash_map< + std::tuple, + std::vector>; + + static MapType* blacklist = [] { + MapType* list = new MapType(); + AlgorithmBlacklist proto; + std::string file_path = + GetDebugOptionsFromFlags().xla_gpu_algorithm_blacklist_path(); + if (!file_path.empty()) { + TF_CHECK_OK(tensorflow::ReadTextProto(tensorflow::Env::Default(), + file_path, &proto)); + } else { + CHECK(tensorflow::protobuf::TextFormat::ParseFromString( + std::string(kDefaultBlacklist), &proto)); + } + for (const auto& entry : proto.entries()) { + for (const auto& algo : entry.algos()) { + (*list)[std::make_tuple( + std::string(entry.hlo()), entry.cc().major(), + entry.cc().minor(), entry.cudnn_version().major(), + entry.cudnn_version().minor(), + entry.cudnn_version().patch(), entry.blas_version())] + .push_back({algo.id(), algo.tensor_ops()}); + } + } + return list; + }(); + + auto iter = blacklist->find(std::make_tuple( + std::string(hlo), cc.major(), cc.minor(), cudnn_version.major(), + cudnn_version.minor(), cudnn_version.patch(), std::string(blas_version))); + if (iter != blacklist->end()) { + return iter->second; + } + return {}; +} + +} // namespace gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/hlo_algorithm_blacklist.h b/tensorflow/compiler/xla/service/gpu/hlo_algorithm_blacklist.h new file mode 100644 index 00000000000..0120879e9d7 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/hlo_algorithm_blacklist.h @@ -0,0 +1,37 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_HLO_ALGORITHM_BLACKLIST_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_HLO_ALGORITHM_BLACKLIST_H_ + +#include + +#include "absl/strings/string_view.h" +#include "tensorflow/core/platform/stream_executor_no_cuda.h" +#include "tensorflow/core/protobuf/autotuning.pb.h" + +namespace xla { +namespace gpu { + +absl::Span +GetBlacklistedConvAlgorithms(tensorflow::ComputeCapability cc, + tensorflow::CudnnVersion cudnn_version, + absl::string_view blas_version, + absl::string_view hlo); + +} // namespace gpu +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_HLO_ALGORITHM_BLACKLIST_H_ diff --git a/tensorflow/compiler/xla/service/gpu/hlo_algorithm_blacklist_test.cc b/tensorflow/compiler/xla/service/gpu/hlo_algorithm_blacklist_test.cc new file mode 100644 index 00000000000..2f2782bd4dc --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/hlo_algorithm_blacklist_test.cc @@ -0,0 +1,73 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/gpu/hlo_algorithm_blacklist.h" + +#include "tensorflow/core/lib/io/path.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/stream_executor/dnn.h" + +namespace xla { +namespace gpu { +namespace { + +class BlacklistTest : public testing::Test { + protected: + BlacklistTest() { + setenv("XLA_FLAGS", + absl::StrCat( + "--xla_gpu_algorithm_blacklist_path=", + tensorflow::io::JoinPath( + tensorflow::testing::TensorFlowSrcRoot(), "compiler", "xla", + "service", "gpu", "data", "hlo_algorithm_blacklist.pbtxt")) + .data(), + 0); + } +}; + +TEST_F(BlacklistTest, DefaultTest) { + tensorflow::ComputeCapability cc; + cc.set_major(7); + cc.set_minor(0); + tensorflow::CudnnVersion cudnn_version; + cudnn_version.set_major(7); + cudnn_version.set_minor(6); + cudnn_version.set_patch(2); + auto list = GetBlacklistedConvAlgorithms( + cc, cudnn_version, /*blas_version=*/"9000", + R"((f16[256,112,112,64]{3,2,1,0}, u8[0]{0}) custom-call(f16[256,224,224,4]{3,2,1,0}, f16[7,7,4,64]{2,1,0,3}), window={size=7x7 stride=2x2 pad=3_3x3_3}, dim_labels=b01f_01io->b01f, custom_call_target="__cudnn$convForward", backend_config="{conv_result_scale:1}")"); + ASSERT_EQ(4, list.size()); + EXPECT_EQ(stream_executor::dnn::AlgorithmDesc(0, false), list[0]); + EXPECT_EQ(stream_executor::dnn::AlgorithmDesc(0, true), list[1]); + EXPECT_EQ(stream_executor::dnn::AlgorithmDesc(1, false), list[2]); + EXPECT_EQ(stream_executor::dnn::AlgorithmDesc(1, true), list[3]); +} + +TEST_F(BlacklistTest, NegativeTest) { + tensorflow::ComputeCapability cc; + cc.set_major(7); + cc.set_minor(0); + tensorflow::CudnnVersion cudnn_version; + cudnn_version.set_major(7); + cudnn_version.set_minor(6); + cudnn_version.set_minor(2); + auto list = + GetBlacklistedConvAlgorithms(cc, cudnn_version, "9000", R"(invalid hlo)"); + ASSERT_EQ(0, list.size()); +} + +} // namespace +} // namespace gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc index 404d3347772..78f8e22a857 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc @@ -220,9 +220,9 @@ bool IsReductionFromOrToContiguousDimensions(const HloInstruction& reduce) { } // For column reduction, the tile block is tize_size_y x tile_size_x, and we - // are reducing along tile_size_y. Both tile_size_x and tile_size_y need to be + // are reducing along tile_size_y. Only tile_size_y needs to be // large enough to make the tiling implementation efficient. 
- return dims_in_elem[2] >= kWarpSize && dims_in_elem[1] >= kWarpSize; + return dims_in_elem[1] >= kWarpSize; } std::pair GetReductionKindAndContiguousComponents( diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.h b/tensorflow/compiler/xla/service/gpu/ir_emitter.h index f380aee9d3c..16dc9cd284f 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter.h +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.h @@ -134,14 +134,6 @@ class IrEmitter : public DfsHloVisitorWithDefault, std::vector ConstructIrArrayForOutputs( const HloInstruction& hlo); - // A convenient helper for calling BufferAssignment::GetUniqueSlice. - BufferAllocation::Slice GetAllocationSlice( - const HloInstruction& hlo, const ShapeIndex& index = {}) const { - return ir_emitter_context_->buffer_assignment() - .GetUniqueSlice(&hlo, index) - .ConsumeValueOrDie(); - } - // Emit a singlethreaded or multithreaded loop that computes every element in // the result of the given HLO instruction. This produces a series of nested // loops (e.g. one for each dimension of the `hlo`'s shape). The body of the diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 168156edf8e..0435daee143 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -23,6 +23,7 @@ limitations under the License. #include #include "absl/algorithm/container.h" +#include "absl/container/inlined_vector.h" #include "absl/memory/memory.h" #include "absl/strings/str_cat.h" #include "absl/types/optional.h" @@ -37,37 +38,28 @@ limitations under the License. #include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/buffer_assignment.h" -#include "tensorflow/compiler/xla/service/custom_call_target_registry.h" #include "tensorflow/compiler/xla/service/dfs_hlo_visitor.h" #include "tensorflow/compiler/xla/service/gpu/backend_configs.pb.h" #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h" -#include "tensorflow/compiler/xla/service/gpu/cholesky_thunk.h" #include "tensorflow/compiler/xla/service/gpu/collective_permute_thunk.h" #include "tensorflow/compiler/xla/service/gpu/conditional_thunk.h" -#include "tensorflow/compiler/xla/service/gpu/convolution_thunk.h" #include "tensorflow/compiler/xla/service/gpu/copy_thunk.h" #include "tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.h" #include "tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.h" -#include "tensorflow/compiler/xla/service/gpu/custom_call_thunk.h" -#include "tensorflow/compiler/xla/service/gpu/fft_thunk.h" #include "tensorflow/compiler/xla/service/gpu/for_thunk.h" -#include "tensorflow/compiler/xla/service/gpu/gemm_thunk.h" #include "tensorflow/compiler/xla/service/gpu/gpu_constants.h" #include "tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h" -#include "tensorflow/compiler/xla/service/gpu/infeed_thunk.h" #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" #include "tensorflow/compiler/xla/service/gpu/ir_emitter_context.h" #include "tensorflow/compiler/xla/service/gpu/kernel_thunk.h" #include "tensorflow/compiler/xla/service/gpu/memset_thunk.h" #include "tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.h" -#include "tensorflow/compiler/xla/service/gpu/outfeed_thunk.h" #include "tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h" #include 
"tensorflow/compiler/xla/service/gpu/partition_assignment.h" #include "tensorflow/compiler/xla/service/gpu/replica_id_thunk.h" #include "tensorflow/compiler/xla/service/gpu/sequential_thunk.h" #include "tensorflow/compiler/xla/service/gpu/target_util.h" #include "tensorflow/compiler/xla/service/gpu/thunk.h" -#include "tensorflow/compiler/xla/service/gpu/triangular_solve_thunk.h" #include "tensorflow/compiler/xla/service/gpu/tuple_thunk.h" #include "tensorflow/compiler/xla/service/gpu/while_thunk.h" #include "tensorflow/compiler/xla/service/hlo_casting_utils.h" @@ -98,10 +90,6 @@ namespace xla { namespace gpu { using llvm_ir::KernelMappingScheme; -using EmitElementFunction = - std::function; - namespace { using absl::InlinedVector; @@ -358,238 +346,15 @@ Status IrEmitterUnnested::HandleConvolution(HloInstruction* convolution) { } Status IrEmitterUnnested::HandleCustomCall(HloInstruction* custom_call) { - // A CustomCall on the GPU backend can either be a custom-call to a - // user-supplied kernel, or a call into a library like cudnn. - - // Lower custom-calls to cudnn batchnorm ops to specialized thunks. It's part - // of the contract of these cudnn batchnorm calls that the epsilon and - // feature_index operands be constants. - if (custom_call->custom_call_target() == - kCudnnBatchNormForwardInferenceCallTarget) { - const HloInstruction* epsilon = custom_call->operand(5); - CHECK(epsilon->IsConstant()); - float epsilon_value = epsilon->literal().Get({}); - - const HloInstruction* feature_index = custom_call->operand(6); - CHECK(feature_index->IsConstant()); - int64 feature_index_value = feature_index->literal().Get({}); - - AddThunkToThunkSequence( - absl::make_unique( - /*operand=*/GetAllocationSlice(*custom_call->operand(0)), - /*scale=*/GetAllocationSlice(*custom_call->operand(1)), - /*offset=*/GetAllocationSlice(*custom_call->operand(2)), - /*mean=*/GetAllocationSlice(*custom_call->operand(3)), - /*variance=*/GetAllocationSlice(*custom_call->operand(4)), - /*epsilon=*/epsilon_value, - /*feature_index=*/feature_index_value, - /*output=*/GetAllocationSlice(*custom_call), - /*hlo=*/custom_call)); - return Status::OK(); - } - - if (custom_call->custom_call_target() == - kCudnnBatchNormForwardTrainingCallTarget) { - const HloInstruction* epsilon = custom_call->operand(3); - CHECK(epsilon->IsConstant()); - float epsilon_value = epsilon->literal().Get({}); - - const HloInstruction* feature_index = custom_call->operand(4); - CHECK(feature_index->IsConstant()); - int64 feature_index_value = feature_index->literal().Get({}); - - // BatchNormTraining returns a tuple of three elements: data, calculated - // mean, and calculated 1/sqrt(variance + epsilon). 
- const auto& assn = ir_emitter_context_->buffer_assignment(); - auto output_data = assn.GetUniqueSlice(custom_call, {0}).ValueOrDie(); - auto output_mean = assn.GetUniqueSlice(custom_call, {1}).ValueOrDie(); - auto output_inv_stddev = assn.GetUniqueSlice(custom_call, {2}).ValueOrDie(); - AddThunkToThunkSequence( - absl::make_unique( - /*operand=*/GetAllocationSlice(*custom_call->operand(0)), - /*scale=*/GetAllocationSlice(*custom_call->operand(1)), - /*offset=*/GetAllocationSlice(*custom_call->operand(2)), - /*epsilon=*/epsilon_value, - /*feature_index=*/feature_index_value, - /*output_data=*/output_data, - /*output_mean=*/output_mean, - /*output_inv_stddev=*/output_inv_stddev, - /*output_tuple=*/GetAllocationSlice(*custom_call), - /*hlo=*/custom_call)); - return Status::OK(); - } - - if (custom_call->custom_call_target() == kCudnnBatchNormBackwardCallTarget) { - const HloInstruction* epsilon = custom_call->operand(5); - CHECK(epsilon->IsConstant()); - float epsilon_value = epsilon->literal().Get({}); - - const HloInstruction* feature_index = custom_call->operand(6); - CHECK(feature_index->IsConstant()); - int64 feature_index_value = feature_index->literal().Get({}); - - // BatchNormGrad returns a tuple of three elements: grad_data, grad_scale, - // grad_offset. - const auto& assn = ir_emitter_context_->buffer_assignment(); - auto output_grad_data = assn.GetUniqueSlice(custom_call, {0}).ValueOrDie(); - auto output_grad_scale = assn.GetUniqueSlice(custom_call, {1}).ValueOrDie(); - auto output_grad_offset = - assn.GetUniqueSlice(custom_call, {2}).ValueOrDie(); - AddThunkToThunkSequence(absl::make_unique( - /*operand=*/GetAllocationSlice(*custom_call->operand(0)), - /*scale=*/GetAllocationSlice(*custom_call->operand(1)), - /*mean=*/GetAllocationSlice(*custom_call->operand(2)), - /*inv_stddev=*/GetAllocationSlice(*custom_call->operand(3)), - /*grad_output=*/GetAllocationSlice(*custom_call->operand(4)), - /*epsilon=*/epsilon_value, - /*feature_index=*/feature_index_value, - /*output_grad_data=*/output_grad_data, - /*output_grad_scale=*/output_grad_scale, - /*output_grad_offset=*/output_grad_offset, - /*output_tuple=*/GetAllocationSlice(*custom_call), - /*hlo=*/custom_call)); - return Status::OK(); - } - - if (IsCustomCallToDnnConvolution(*custom_call)) { - const auto& assn = ir_emitter_context_->buffer_assignment(); - std::vector operand_slices; - operand_slices.reserve(custom_call->operand_count()); - for (const auto* operand : custom_call->operands()) { - operand_slices.push_back(GetAllocationSlice(*operand)); - } - auto tuple_result_slice = GetAllocationSlice(*custom_call); - auto conv_result_slice = assn.GetUniqueSlice(custom_call, {0}).ValueOrDie(); - auto scratch_slice = assn.GetUniqueSlice(custom_call, {1}).ValueOrDie(); - - AddThunkToThunkSequence(absl::make_unique( - Cast(custom_call), std::move(operand_slices), - conv_result_slice, scratch_slice, tuple_result_slice)); - return Status::OK(); - } - - if (custom_call->custom_call_target() == kCusolverCholeskyCallTarget) { - TF_ASSIGN_OR_RETURN(CholeskyOptions options, - custom_call->backend_config()); - - const Shape& shape = custom_call->operand(0)->shape(); - int ndim = shape.dimensions_size(); - CHECK_GE(ndim, 2); - int64 n = shape.dimensions(ndim - 1); - - const auto& dims = shape.dimensions(); - int64 batch_size = std::accumulate(dims.begin(), dims.end() - 2, int64{1}, - [](int64 a, int64 b) { return a * b; }); - - auto operand_buffer = GetAllocationSlice(*custom_call->operand(0)); - - const auto& assn = 
ir_emitter_context_->buffer_assignment(); - auto a_buffer = assn.GetUniqueSlice(custom_call, {0}).ValueOrDie(); - auto workspace_buffer = assn.GetUniqueSlice(custom_call, {1}).ValueOrDie(); - auto info_buffer = assn.GetUniqueSlice(custom_call, {2}).ValueOrDie(); - - std::vector> thunks; - - if (operand_buffer != a_buffer) { - thunks.push_back(absl::make_unique( - /*source_address=*/operand_buffer, - /*destination_buffer=*/a_buffer, - /*mem_size=*/ShapeUtil::ByteSizeOf(shape), custom_call)); - } - - thunks.push_back(absl::make_unique( - options, a_buffer, workspace_buffer, info_buffer, - custom_call->operand(0)->shape().element_type(), batch_size, n, - custom_call)); - - // Elide the sequential thunk if there's no copy. - if (thunks.size() == 1) { - AddThunkToThunkSequence(std::move(thunks[0])); - } else { - AddThunkToThunkSequence( - absl::make_unique(std::move(thunks), custom_call)); - } - - return Status::OK(); - } - - if (IsCublasGemm(*custom_call)) { - AddThunkToThunkSequence(BuildGemmThunk(custom_call)); - return Status::OK(); - } - - if (void* call_target = CustomCallTargetRegistry::Global()->Lookup( - custom_call->custom_call_target(), - ir_emitter_context_->platform()->Name())) { - const auto& assn = ir_emitter_context_->buffer_assignment(); - auto get_slices_for_instr = [&](const HloInstruction* instr) { - ShapeTree slices(instr->shape()); - slices.ForEachMutableElement([&](const ShapeIndex& index, - BufferAllocation::Slice* slice) { - StatusOr s = assn.GetUniqueSlice(instr, index); - if (s.ok()) { - *slice = s.ValueOrDie(); - } - }); - return slices; - }; - std::vector> operand_slices; - for (const auto* operand : custom_call->operands()) { - operand_slices.push_back(get_slices_for_instr(operand)); - } - ShapeTree result_slices = - get_slices_for_instr(custom_call); - AddThunkToThunkSequence(absl::make_unique( - call_target, std::move(operand_slices), std::move(result_slices), - Cast(custom_call)->opaque(), custom_call)); - return Status::OK(); - } - - return Unimplemented("No registered implementation for custom call to \"%s\"", - custom_call->custom_call_target()); + return ThunkEmitter(this).HandleCustomCall(custom_call); } Status IrEmitterUnnested::HandleFft(HloInstruction* fft) { - TF_RET_CHECK( - LayoutUtil::IsMonotonicWithDim0Major(fft->operand(0)->shape().layout())); - TF_RET_CHECK(LayoutUtil::IsMonotonicWithDim0Major(fft->shape().layout())); - AddThunkToThunkSequence(BuildFftThunk(fft)); - return Status::OK(); + return ThunkEmitter(this).HandleFft(fft); } Status IrEmitterUnnested::HandleTriangularSolve(HloInstruction* hlo) { - auto has_fortran_layout = [](const Layout& layout) { - int n = layout.minor_to_major_size(); - return layout.minor_to_major(0) == n - 2 && - layout.minor_to_major(1) == n - 1; - }; - TF_RET_CHECK(has_fortran_layout(hlo->operand(0)->shape().layout())); - TF_RET_CHECK(has_fortran_layout(hlo->operand(1)->shape().layout())); - TF_RET_CHECK(has_fortran_layout(hlo->shape().layout())); - - std::vector> thunks; - - // Triangular solve is in-place on 'b', so copy 'b' to the output if they - // aren't the same buffer. 
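Both the Cholesky lowering above and the triangular-solve lowering treat the operand as a batch of matrices: the trailing two dimensions form the matrix, and everything in front of them is folded into the batch size with std::accumulate. Restated on a plain dimension vector (assuming, as the real code CHECKs, at least two dimensions):

    #include <cstdint>
    #include <numeric>
    #include <vector>

    // For a shape like [b0, b1, ..., m, n], the batch size is b0 * b1 * ...
    int64_t BatchSize(const std::vector<int64_t>& dims) {
      return std::accumulate(dims.begin(), dims.end() - 2, int64_t{1},
                             [](int64_t a, int64_t b) { return a * b; });
    }
    // e.g. BatchSize({2, 3, 128, 128}) == 6: six independent 128x128 matrices.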
- auto operand_buffer = GetAllocationSlice(*hlo->operand(1)); - auto destination_buffer = GetAllocationSlice(*hlo); - if (operand_buffer != destination_buffer) { - thunks.push_back(absl::make_unique( - /*source_address=*/operand_buffer, - /*destination_buffer=*/destination_buffer, - /*mem_size=*/ShapeUtil::ByteSizeOf(hlo->operand(1)->shape()), hlo)); - } - - thunks.push_back(BuildTriangularSolveThunk(hlo)); - - // Elide the sequential thunk if there's no copy. - if (thunks.size() == 1) { - AddThunkToThunkSequence(std::move(thunks[0])); - } else { - AddThunkToThunkSequence( - absl::make_unique(std::move(thunks), hlo)); - } - return Status::OK(); + return ThunkEmitter(this).HandleTriangularSolve(hlo); } Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) { @@ -605,7 +370,6 @@ Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) { int unroll_factor = ComputeMaxUnrollFactor(fusion); thunks.push_back(BuildKernelThunk( fusion, /*implements_whole_instruction=*/false, unroll_factor)); - GpuElementalIrEmitter operand_elemental_emitter( hlo_module_config_, ir_emitter_context_->llvm_module(), &b_, GetNestedComputer()); @@ -710,7 +474,16 @@ Status IrEmitterUnnested::HandleCopy(HloInstruction* copy) { if (LayoutUtil::Equal(copy->operand(0)->shape().layout(), copy->shape().layout()) && buffer_assignment.GetUniqueTopLevelSlice(copy->operand(0)).ok()) { - AddThunkToThunkSequence(BuildDeviceToDeviceCopyThunk(copy)); + // Copy the operand into the output if it's not the same buffer already. + auto operand_buffer = GetAllocationSlice(*copy->operand(0)); + auto destination_buffer = GetAllocationSlice(*copy); + if (operand_buffer != destination_buffer) { + AddThunkToThunkSequence(absl::make_unique( + /*source_address=*/operand_buffer, + /*destination_buffer=*/destination_buffer, + /*mem_size=*/ + ByteSizeOf(copy->operand(0)->shape()), copy)); + } return Status::OK(); } if (CheckAndEmitHloWithTile021(copy)) { @@ -1048,7 +821,8 @@ Status IrEmitterUnnested::HandleScatter(HloInstruction* scatter) { thunks.push_back(absl::make_unique( /*source_address=*/operand_buffer, /*destination_buffer=*/destination_buffer, - /*mem_size=*/ShapeUtil::ByteSizeOf(operand->shape()), scatter)); + /*mem_size=*/ShapeUtil::ByteSizeOf(operand->shape()), + /*hlo_instruction=*/nullptr)); } thunks.push_back( @@ -1486,17 +1260,15 @@ Status IrEmitterUnnested::HandleAllReduce(HloInstruction* crs) { return Status::OK(); } -Status IrEmitterUnnested::HandleAfterAll(HloInstruction* after_all) { - return Status::OK(); -} - -Status IrEmitterUnnested::HandleInfeed(HloInstruction* infeed) { - AddThunkToThunkSequence(BuildInfeedThunk(infeed)); - return Status::OK(); +Status IrEmitterUnnested::HandleInfeed(HloInstruction* xla_infeed) { + return ThunkEmitter(this).HandleInfeed(xla_infeed); } Status IrEmitterUnnested::HandleOutfeed(HloInstruction* outfeed) { - AddThunkToThunkSequence(BuildOutfeedThunk(outfeed)); + return ThunkEmitter(this).HandleOutfeed(outfeed); +} + +Status IrEmitterUnnested::HandleAfterAll(HloInstruction* after_all) { return Status::OK(); } @@ -1720,131 +1492,6 @@ std::unique_ptr IrEmitterUnnested::BuildKernelThunk( implements_whole_instruction ? 
inst : nullptr, unroll_factor); } -std::unique_ptr IrEmitterUnnested::BuildHostToDeviceCopyThunk( - const HloInstruction* inst) { - const HloInstruction* operand = inst->operand(0); - CHECK_EQ(HloOpcode::kConstant, operand->opcode()); - return absl::make_unique( - /*source_address=*/operand->literal().untyped_data(), - /*destination_buffer=*/GetAllocationSlice(*inst), - /*mem_size=*/ - llvm_ir::ByteSizeOf(operand->shape(), - ir_emitter_context_->llvm_module()->getDataLayout()), - inst); -} - -std::unique_ptr IrEmitterUnnested::BuildDeviceToDeviceCopyThunk( - const HloInstruction* inst) { - const HloInstruction* operand = inst->operand(0); - return absl::make_unique( - /*source_address=*/GetAllocationSlice(*operand), - /*destination_buffer=*/GetAllocationSlice(*inst), - /*mem_size=*/ - llvm_ir::ByteSizeOf(operand->shape(), - ir_emitter_context_->llvm_module()->getDataLayout()), - inst); -} - -std::unique_ptr IrEmitterUnnested::BuildInfeedThunk( - const HloInstruction* inst) { - CHECK_EQ(HloOpcode::kInfeed, inst->opcode()); - - ShapeTree slices(inst->shape()); - slices.ForEachMutableElement( - [&](const ShapeIndex& index, BufferAllocation::Slice* slice) { - *slice = ir_emitter_context_->buffer_assignment() - .GetUniqueSlice(inst, index) - .ConsumeValueOrDie(); - }); - return absl::make_unique(slices, inst); -} - -std::unique_ptr IrEmitterUnnested::BuildOutfeedThunk( - const HloInstruction* inst) { - CHECK_EQ(HloOpcode::kOutfeed, inst->opcode()); - - ShapeTree slices(inst->operand(0)->shape()); - slices.ForEachMutableElement( - [&](const ShapeIndex& index, BufferAllocation::Slice* slice) { - auto status_or_slice = - ir_emitter_context_->buffer_assignment().GetUniqueSlice( - inst->operand(0), index); - if (status_or_slice.ok()) { - *slice = status_or_slice.ConsumeValueOrDie(); - } - }); - return absl::make_unique(std::move(slices), inst); -} - -std::unique_ptr IrEmitterUnnested::BuildGemmThunk( - const HloInstruction* inst) { - auto config_or = inst->backend_config(); - GemmBackendConfig gemm_config = std::move(config_or.ValueOrDie()); - const HloInstruction* lhs = inst->operand(0); - const HloInstruction* rhs = inst->operand(1); - - // The bias is passed inside the output buffer. If those buffers are shared - // we can just use it, otherwise copy the bias values into the output buffer - // first. - if (gemm_config.beta() != 0.0) { - const HloInstruction* bias = inst->operand(2); - CHECK_EQ(bias->shape(), inst->shape()); - if (GetAllocationSlice(*bias) != GetAllocationSlice(*inst)) { - std::vector> thunks; - thunks.push_back(absl::make_unique( - /*source_buffer=*/GetAllocationSlice(*bias), - /*destination_buffer=*/GetAllocationSlice(*inst), - /*mem_size=*/ShapeUtil::ByteSizeOf(inst->shape()), nullptr)); - thunks.push_back(absl::make_unique( - GetAllocationSlice(*lhs), // The buffer assigned to LHS. - GetAllocationSlice(*rhs), // The buffer assigned to RHS. - GetAllocationSlice(*inst), // The output buffer. - /*implements_whole_instruction=*/false, inst)); - return absl::make_unique(std::move(thunks), inst); - } - } - - return absl::make_unique( - GetAllocationSlice(*lhs), // The buffer assigned to LHS. - GetAllocationSlice(*rhs), // The buffer assigned to RHS. - GetAllocationSlice(*inst), // The output buffer. 
- /*implements_whole_instruction=*/true, inst); -} - -std::unique_ptr IrEmitterUnnested::BuildFftThunk( - const HloInstruction* inst) { - const HloInstruction* operand = inst->operand(0); - return absl::make_unique( - inst->fft_type(), inst->fft_length(), - /*input_buffer=*/GetAllocationSlice(*operand), - /*output_buffer=*/GetAllocationSlice(*inst), - /*input_shape=*/operand->shape(), - /*output_shape=*/inst->shape(), inst); -} - -std::unique_ptr IrEmitterUnnested::BuildTriangularSolveThunk( - const HloInstruction* inst) { - const HloInstruction* a = inst->operand(0); - const HloInstruction* b = inst->operand(1); - int64 m = b->shape().dimensions(b->shape().rank() - 2); - int64 n = b->shape().dimensions(b->shape().rank() - 1); - int64 batch_size = std::accumulate( - b->shape().dimensions().begin(), b->shape().dimensions().end() - 2, - int64{1}, [](int64 a, int64 b) { return a * b; }); - int64 elem_size = - ShapeUtil::ByteSizeOfPrimitiveType(inst->shape().element_type()); - int64 a_batch_stride = inst->triangular_solve_options().left_side() - ? m * m * elem_size - : n * n * elem_size; - int64 b_batch_stride = m * n * elem_size; - return absl::make_unique( - inst->triangular_solve_options(), - /*a_input_buffer=*/GetAllocationSlice(*a), - /*b_input_buffer=*/GetAllocationSlice(*inst), - inst->shape().element_type(), batch_size, m, n, a_batch_stride, - b_batch_stride, inst); -} - StatusOr> IrEmitterUnnested::BuildInitializerThunk( HloInstruction* hlo, const ShapeIndex& index) { bool fused = HloOpcode::kFusion == hlo->opcode(); @@ -2200,41 +1847,6 @@ Status IrEmitterUnnested::EmitTargetElementLoop( return emit_status; } -std::vector IrEmitterUnnested::ConstructIrArrayForInputs( - const HloInstruction& hlo) { - std::vector param_arrays; - param_arrays.reserve(hlo.operands().size()); - for (const HloInstruction* param : hlo.operands()) { - param_arrays.push_back(GetIrArray(*param, hlo)); - } - return param_arrays; -} - -int IrEmitterUnnested::ConstructInputReducedShapeAndCastInputIrArrayToShape( - const HloInstruction& hlo, const std::vector& param_arrays, - const std::vector& param_buffers, - absl::Span reduced_output_dims, - std::vector* param_reduced_shapes, - std::vector* param_in_reduced_shape_arrays) { - int64 num_params = hlo.operands().size(); - param_in_reduced_shape_arrays->reserve(num_params); - param_reduced_shapes->reserve(num_params); - for (int64 id = 0; id < num_params; ++id) { - if (param_buffers[id] == nullptr) { - param_reduced_shapes->push_back(Shape()); - param_in_reduced_shape_arrays->push_back(IrArray()); - continue; - } - const HloInstruction* param = hlo.operand(id); - param_reduced_shapes->push_back(ShapeUtil::MakeShapeWithDescendingLayout( - param->shape().element_type(), - Permute({0, 2, 1}, reduced_output_dims))); - param_in_reduced_shape_arrays->push_back( - param_arrays[id].CastToShape((*param_reduced_shapes)[id], &b_)); - } - return num_params; -} - namespace { std::tuple GetStartOffsetAndStepForX( @@ -2254,12 +1866,12 @@ std::tuple GetStartOffsetAndStepForX( return std::make_tuple(start_offset_x, step_x); } -void EmitFullElementalTile(const KernelMappingScheme* mapping_scheme, - const IrArray::Index& tile_origin_index, - const string& loop_name, KernelSupportLibrary* ksl, - llvm::IRBuilder<>* builder, llvm::Value* y, - llvm::Value* x, llvm::Type* index_ty, - const EmitElementFunction& emit_elem_function) { +void EmitFullElementalTile( + const KernelMappingScheme* mapping_scheme, + const IrArray::Index& tile_origin_index, const string& loop_name, + 
KernelSupportLibrary* ksl, llvm::IRBuilder<>* builder, llvm::Value* y, + llvm::Value* x, llvm::Type* index_ty, + const IrEmitterUnnested::EmitElementFunction& emit_elem_function) { int64 num_threads_x = mapping_scheme->GetNumberOfThreadsForDimensionX(); int64 num_threads_y = mapping_scheme->GetNumberOfThreadsForDimensionY(); int64 tile_size_x = mapping_scheme->GetTileSizeForDimensionX(); @@ -2292,14 +1904,13 @@ void EmitFullElementalTile(const KernelMappingScheme* mapping_scheme, }); } -void EmitPartialElementalTile(const KernelMappingScheme* mapping_scheme, - const IrArray::Index& tile_origin_index, - const string& loop_name, - KernelSupportLibrary* ksl, - llvm::IRBuilder<>* builder, llvm::Value* y, - llvm::Value* x, llvm::Value* tile_height, - llvm::Value* tile_width, llvm::Type* index_ty, - const EmitElementFunction& emit_elem_function) { +void EmitPartialElementalTile( + const KernelMappingScheme* mapping_scheme, + const IrArray::Index& tile_origin_index, const string& loop_name, + KernelSupportLibrary* ksl, llvm::IRBuilder<>* builder, llvm::Value* y, + llvm::Value* x, llvm::Value* tile_height, llvm::Value* tile_width, + llvm::Type* index_ty, + const IrEmitterUnnested::EmitElementFunction& emit_elem_function) { int64 num_threads_x = mapping_scheme->GetNumberOfThreadsForDimensionX(); int64 num_threads_y = mapping_scheme->GetNumberOfThreadsForDimensionY(); int64 tile_size_x = mapping_scheme->GetTileSizeForDimensionX(); @@ -2361,7 +1972,7 @@ void EmitTiledElementalCodeWithBoundsCheck( const IrArray::Index& tile_origin_index, const string& loop_name, KernelSupportLibrary* ksl, llvm::IRBuilder<>* builder, llvm::Value* y, llvm::Value* x, llvm::Value* tile_height, llvm::Value* tile_width, - const EmitElementFunction& emit_elem_function) { + const IrEmitterUnnested::EmitElementFunction& emit_elem_function) { int64 tile_size_x = mapping_scheme->GetTileSizeForDimensionX(); int64 tile_size_y = mapping_scheme->GetTileSizeForDimensionY(); llvm::Type* index_ty = tile_width->getType(); @@ -2397,13 +2008,11 @@ void EmitTiledElementalCodeWithBoundsCheck( void IrEmitterUnnested::EmitTileElementForCopy( HloInstruction* hlo, const llvm_ir::IrArray::Index& index, const KernelCodegenInfo* kernel_info, llvm::Value* y_loc, - llvm::Value* x_loc, int64 /*x_iter_num*/) { - llvm_ir::TiledParameterInfo* tiled_param_info = - kernel_info->GetTiledParameterInfo(); + llvm::Value* x_loc, int64 /*x_iter_num*/, + absl::Span param_shmem_buffers) { // TODO(jlebar): Add AA metadata to this load. 
llvm::Instruction* load_from_shmem_buffer = - Load(GEP(tiled_param_info->GetBufferForParameter(0), - {b_.getInt64(0), x_loc, y_loc}), + Load(GEP(param_shmem_buffers[0], {b_.getInt64(0), x_loc, y_loc}), "output_element"); llvm_ir::IrArray output_array = GetIrArray(*hlo, *hlo); Shape output_reduced_shape = ShapeUtil::MakeShapeWithDescendingLayout( @@ -2427,17 +2036,15 @@ void IrEmitterUnnested::EmitTileElementForCopy( void IrEmitterUnnested::EmitTileElementForFusion( HloInstruction* hlo, const llvm_ir::IrArray::Index& index, const KernelCodegenInfo* kernel_info, llvm::Value* y_loc, - llvm::Value* x_loc, int64 /*x_iter_num*/) { - llvm_ir::TiledParameterInfo* tiled_param_info = - kernel_info->GetTiledParameterInfo(); + llvm::Value* x_loc, int64 /*x_iter_num*/, + absl::Span param_shmem_buffers) { std::vector output_arrays = ConstructIrArrayForOutputs(*hlo); GpuElementalIrEmitter elem_emitter(hlo_module_config_, module_, &b_, GetNestedComputer()); FusedIrEmitter fused_emitter(GetGeneratorForOperandIrArrays(hlo), - &elem_emitter); - tiled_param_info->set_y(y_loc); - tiled_param_info->set_x(x_loc); - fused_emitter.SetTiledParameterInfo(tiled_param_info); + &elem_emitter, x_loc, y_loc, + param_shmem_buffers); + TF_CHECK_OK(hlo->fused_expression_root()->Accept(&fused_emitter)); IrArray::Index untiled_index = kernel_info->GetKernelMappingScheme()->GetUnnormalizedIndex( @@ -2501,19 +2108,6 @@ class ReductionCodegenInfo : public IrEmitterUnnested::KernelCodegenInfo { return reduction_input_addresses_; } - InlinedVector* GetMutableReducers() { return &reducers_; } - const InlinedVector& GetReducers() const { - return reducers_; - } - int GetNumberOfReduces() const { return reducers_.size(); } - - InlinedVector* GetMutableReductionOutputShapeIndices() { - return &reduction_output_shape_indices_; - } - absl::Span GetReductionOutputShapeIndices() const { - return reduction_output_shape_indices_; - } - bool IsRowReduction() const { return is_row_reduction_; } // Return the dimension that is being reduced between DimX and DimY. @@ -2560,8 +2154,6 @@ class ReductionCodegenInfo : public IrEmitterUnnested::KernelCodegenInfo { private: AddressVector partial_result_addresses_; AddressVector reduction_input_addresses_; - InlinedVector reducers_; - InlinedVector reduction_output_shape_indices_; // The address of the memory that stores the linear index of the current // output, assuming that the output doesn't change the layout of the kept // elements in the reduction input. @@ -2570,48 +2162,10 @@ class ReductionCodegenInfo : public IrEmitterUnnested::KernelCodegenInfo { bool is_row_reduction_; }; -namespace { -// Returns a group of instructions that generate the output for the kernel -// containing the given HLO instruction. The result may be an unnested kReduce -// HLO, a nested kReduce HLO of a kInput fusion, or the operands of the tuple -// for a multiple output fusion. -absl::Span GetOutputInstructions( - HloInstruction* const* reduce_or_tuple_pointer) { - HloOpcode opcode = (*reduce_or_tuple_pointer)->opcode(); - CHECK(opcode == HloOpcode::kReduce || opcode == HloOpcode::kTuple); - return opcode == HloOpcode::kTuple - ? 
(*reduce_or_tuple_pointer)->operands() - : absl::Span(reduce_or_tuple_pointer, 1); -} - -const HloInstruction* GetFirstReduceInstruction( - absl::Span instructions) { - auto first_reduce_iter = - absl::c_find_if(instructions, [](const HloInstruction* inst) { - return IsReductionFromOrToContiguousDimensions(*inst); - }); - CHECK_NE(first_reduce_iter, instructions.end()); - return *first_reduce_iter; -} - -}; // namespace - void IrEmitterUnnested::EmitPrologueForOneReduction( HloInstruction* unnested_hlo, HloInstruction* reduce_inst, int reduce_idx, - KernelCodegenInfo* kernel_info, GpuElementalIrEmitter* elemental_emitter, - ShapeIndex output_shape_index) { - ReductionCodegenInfo* reduction_info = - static_cast(kernel_info); - - InlinedVector* reducers = - reduction_info->GetMutableReducers(); - CHECK(IsReductionFromOrToContiguousDimensions(*reduce_inst)); - reducers->push_back(reduce_inst->to_apply()); - - InlinedVector* reduction_output_shape_indices = - reduction_info->GetMutableReductionOutputShapeIndices(); - reduction_output_shape_indices->push_back(std::move(output_shape_index)); - + KernelCodegenInfo* kernel_info, GpuElementalIrEmitter* elemental_emitter) { + auto reduction_info = static_cast(kernel_info); AddressVector* reduction_input_addresses = reduction_info->GetMutableReductionInputAddresses(); llvm::Type* element_type = llvm_ir::PrimitiveTypeToIrType( @@ -2652,38 +2206,23 @@ void IrEmitterUnnested::EmitPrologueForOneReduction( } void IrEmitterUnnested::EmitPrologueForReduction( - HloInstruction* unnested_hlo, KernelCodegenInfo* kernel_info) { + HloInstruction* unnested_hlo, KernelCodegenInfo* kernel_info, + absl::Span reduce_instructions) { VLOG(10) << "Emit prologue for reduction " << unnested_hlo->ToString(); - // Find the unnested kReduce or the tuple that contains a list of kReduce. - HloInstruction* reduce_or_tuple = unnested_hlo->opcode() == HloOpcode::kFusion - ? 
unnested_hlo->fused_expression_root() - : unnested_hlo; - absl::Span output_instructions = - GetOutputInstructions(&reduce_or_tuple); - ReductionCodegenInfo* reduction_info = - static_cast(kernel_info); + auto reduction_info = static_cast(kernel_info); GpuElementalIrEmitter elemental_emitter(hlo_module_config_, ir_emitter_context_->llvm_module(), &b_, GetNestedComputer()); const HloInstruction* first_reduce = nullptr; - for (int i = 0, e = output_instructions.size(); i != e; ++i) { - if (!IsReductionFromOrToContiguousDimensions(*output_instructions[i])) { - continue; - } - HloInstruction* reduce_inst = output_instructions[i]; + for (int i = 0; i < reduce_instructions.size(); i++) { + HloInstruction* reduce_inst = reduce_instructions[i]; if (first_reduce == nullptr) { first_reduce = reduce_inst; } else { CHECK(first_reduce->dimensions() == reduce_inst->dimensions()); } - ShapeIndex output_shape_index; - if (reduce_or_tuple->opcode() == HloOpcode::kTuple) { - output_shape_index = {i}; - } - EmitPrologueForOneReduction(unnested_hlo, reduce_inst, i, kernel_info, - &elemental_emitter, - std::move(output_shape_index)); + &elemental_emitter); } int num_partial_results = reduction_info->GetNumberOfPartialResults(); @@ -2733,17 +2272,14 @@ void IrEmitterUnnested::EmitFullWarpShuffleDownLoopForAllReduces( } void IrEmitterUnnested::EmitEpilogueForReduction( - HloInstruction* unnested_hlo, KernelCodegenInfo* kernel_info) { - ReductionCodegenInfo* reduction_info = - static_cast(kernel_info); - int num_reduces = reduction_info->GetNumberOfReduces(); + HloInstruction* unnested_hlo, KernelCodegenInfo* kernel_info, + absl::Span reduce_instructions, + absl::Span reduction_output_shape_indices, + absl::Span reducers) { + auto reduction_info = static_cast(kernel_info); + int num_reduces = reducers.size(); absl::Span partial_result_addresses = reduction_info->GetPartialResultAddresses(); - const InlinedVector& reducers = - reduction_info->GetReducers(); - absl::Span reduction_output_shape_indices = - reduction_info->GetReductionOutputShapeIndices(); - if (reduction_info->IsRowReduction()) { EmitFullWarpShuffleDownLoopForAllReduces(reducers, partial_result_addresses); @@ -2763,16 +2299,6 @@ void IrEmitterUnnested::EmitEpilogueForReduction( llvm_ir::SetToFirstInsertPoint(if_output_inbound_data.true_block, &b_); } - HloInstruction* reduce_or_tuple = unnested_hlo->opcode() == HloOpcode::kFusion - ? unnested_hlo->fused_expression_root() - : unnested_hlo; - std::vector reduce_instructions; - absl::c_for_each(GetOutputInstructions(&reduce_or_tuple), - [&](const HloInstruction* instr) { - if (IsReductionFromOrToContiguousDimensions(*instr)) { - reduce_instructions.push_back(instr); - } - }); int num_partial_results = reduction_info->GetNumberOfPartialResults(); // Emit an atomic operation that accumulates the partial reduction to the @@ -2837,21 +2363,16 @@ void IrEmitterUnnested::EmitEpilogueForReduction( } void IrEmitterUnnested::EmitTileElementForReduction( - HloInstruction* unnested_hlo, const llvm_ir::IrArray::Index& index, - const KernelCodegenInfo* kernel_info, llvm::Value* y_loc, - llvm::Value* x_loc, int64 x_iter_num) { + HloInstruction* unnested_hlo, const Shape& reduction_operand_shape, + absl::Span output_instructions, + const llvm_ir::IrArray::Index& index, const KernelCodegenInfo* kernel_info, + absl::Span reducers, int64 x_iter_num) { VLOG(10) << "Emit tile element for reduce " << unnested_hlo->ToString(); HloInstruction* reduce_or_tuple = unnested_hlo->opcode() == HloOpcode::kFusion ? 
unnested_hlo->fused_expression_root() : unnested_hlo; - llvm_ir::TiledParameterInfo* tiled_param_info = - kernel_info->GetTiledParameterInfo(); - tiled_param_info->set_y(y_loc); - tiled_param_info->set_x(x_loc); - // Record the untransposed output linear address for the reduction. - const ReductionCodegenInfo* reduction_info = - dynamic_cast(kernel_info); + auto reduction_info = dynamic_cast(kernel_info); int partial_result_index = reduction_info->IsRowReduction() ? 0 : x_iter_num; Store(reduction_info->GetUntransposedOutputLinearAddress(&b_, index), InBoundsGEP(reduction_info->GetCurrentOutputLinearIndexAddress(), @@ -2871,12 +2392,9 @@ void IrEmitterUnnested::EmitTileElementForReduction( GetNestedComputer()); FusedIrEmitter fused_emitter(GetGeneratorForOperandIrArrays(unnested_hlo), &elem_emitter); - absl::Span output_instructions = - GetOutputInstructions(&reduce_or_tuple); // Construct the ElementGenerator for each reduction and extra output in the // the group of output instructions. if (unnested_hlo->opcode() == HloOpcode::kFusion) { - fused_emitter.SetTiledParameterInfo(tiled_param_info); TF_CHECK_OK(unnested_hlo->fused_expression_root()->Accept(&fused_emitter)); for (int i = 0, e = output_instructions.size(); i != e; ++i) { @@ -2899,8 +2417,6 @@ void IrEmitterUnnested::EmitTileElementForReduction( }); } - Shape reduction_operand_shape = - GetFirstReduceInstruction(output_instructions)->operand(0)->shape(); IrArray::Index input_index = reduction_info->GetKernelMappingScheme()->GetUnnormalizedIndex( index, reduction_operand_shape); @@ -2915,9 +2431,6 @@ void IrEmitterUnnested::EmitTileElementForReduction( reduction_info->GetPartialResultAddresses(); absl::Span reduction_input_addresses = reduction_info->GetReductionInputAddresses(); - const InlinedVector& reducers = - reduction_info->GetReducers(); - // Emit code to generate the input and perform the reduction computation for // each reduction instruction. for (int i = 0; i != reducers.size(); ++i) { @@ -2942,10 +2455,11 @@ void IrEmitterUnnested::EmitTileElementForReduction( } // Emits a kernel for the hlo instruction using the given tiling scheme. -void IrEmitterUnnested::EmitBlock(const TileGenerator& emit_one_tile, - KernelCodegenInfo* kernel_info, - KernelSupportLibrary* ksl, - llvm::Type* index_ty) { +void IrEmitterUnnested::EmitBlock(KernelCodegenInfo* kernel_info, + KernelSupportLibrary* ksl, llvm::Value* y, + llvm::Value* x, + TileElementGenerator tile_generator) { + llvm::Type* index_ty = kernel_info->GetIndexType(); KernelMappingScheme* mapping_scheme = kernel_info->GetKernelMappingScheme(); absl::Span dims_in_tile = mapping_scheme->GetDimensionsInTiles(); absl::Span dims_in_block = @@ -2990,11 +2504,9 @@ void IrEmitterUnnested::EmitBlock(const TileGenerator& emit_one_tile, absl::Span reduced_dims = mapping_scheme->GetDimensionsInElements(); - const bool block_contains_multi_tiles = - mapping_scheme->GetNumberOfTilesInOneBlock() > 1; // Emit the tile with a given tile_index, by calculating the tight bounds for - // each dimension of the tile and then calling emit_one_tile. + // each dimension of the tile and then calling tile_generator. 
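The comment above describes computing a tight bound per tile dimension before invoking tile_generator, so that the trailing, partial tiles do not step outside the tensor. The bound the emitted IR computes is essentially this, written with plain integers rather than llvm::Value* (an illustration of the arithmetic, not the exact emitted sequence):

    #include <algorithm>
    #include <cstdint>

    // Elements actually covered by tile `tile_index` along one dimension when
    // the dimension has `dim_size` elements split into tiles of `tile_size`.
    int64_t TightTileBound(int64_t tile_index, int64_t dim_size,
                           int64_t tile_size) {
      const int64_t tile_origin = tile_index * tile_size;
      return std::min(tile_size, dim_size - tile_origin);
    }
    // A full interior tile gets tile_size; the last, partial tile gets the
    // remainder.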
auto emit_one_tile_for_tile_index = [&](const IrArray::Index& tile_index) { std::vector output_tile_bounds(3); for (int i = KernelMappingScheme::DimY; i < KernelMappingScheme::DimTot; @@ -3012,7 +2524,8 @@ void IrEmitterUnnested::EmitBlock(const TileGenerator& emit_one_tile, IrArray::Index tile_origin = mapping_scheme->GetElementIndexForTileOrigin(tile_index); - emit_one_tile(tile_origin, output_tile_bounds, block_contains_multi_tiles); + tile_generator(y, x, tile_origin, "output", output_tile_bounds[1], + output_tile_bounds[2], ksl); }; const IrArray::Index starting_block = @@ -3036,79 +2549,34 @@ void IrEmitterUnnested::EmitBlock(const TileGenerator& emit_one_tile, // Emits a kernel for the hlo instruction using the given kernel mapping scheme. // +// The emitted code is written into the member variable b_, which corresponds to +// the kernel thunk currently being constructed (previous call to +// BuildKernelThunk). +// // unnested_hlo: The unnested hlo instruction for which the kernel is generated. // Currently, these hlo instructions are supported: kLoop fusion, kCopy. -// tiled_param_ids: The IDs for the parameters that are 0-2-1 transpose of -// other tensors with the same dimensions and are safe to be tranposed via -// the shared memory tranpose implementation. // mapping_scheme: The tiling scheme to use. // kernel_generator: Contains function objects for code generation, such as // element generator, block prologue and epilogue generators. // kernel_info: Represent other information to support the code generation // of the tiled kernel for the hlo. -LaunchDimensions IrEmitterUnnested::EmitKernel( - HloInstruction* unnested_hlo, absl::Span tiled_param_ids, - const KernelCodeGenerator& kernel_generator, - KernelCodegenInfo* kernel_info) { +void IrEmitterUnnested::EmitKernel( + HloInstruction* unnested_hlo, Thunk* kernel_thunk, + KernelCodegenInfo* kernel_info, TileElementGenerator tile_element_generator, + BlockPrologueGenerator block_prologue_generator, + BlockEpilogueGenerator block_epilogue_generator) { KernelMappingScheme* mapping_scheme = kernel_info->GetKernelMappingScheme(); - - std::vector param_arrays = ConstructIrArrayForInputs(*unnested_hlo); - int64 num_params = param_arrays.size(); - // Allocate shared memory buffers to store the tiled inputs. - std::vector param_shmem_buffers(num_params, nullptr); - for (int64 id : tiled_param_ids) { - const HloInstruction* param = unnested_hlo->operand(id); - param_shmem_buffers[id] = - mapping_scheme->GetSharedMemoryBufferForElementType( - llvm_ir::PrimitiveTypeToIrType(param->shape().element_type(), - module_), - IrName(unnested_hlo, StrCat("tile", id))); - VLOG(3) << "Added shmem buffer for parameter " << id << ": " - << llvm_ir::DumpToString(*param_shmem_buffers[id]); - } - - const ReductionCodegenInfo* reduction_info = - dynamic_cast(kernel_info); - bool is_column_reduction = - (reduction_info && !reduction_info->IsRowReduction()); - - LaunchDimensions launch_dimensions = - LaunchDimensions(mapping_scheme->GetNumberOfBlocks(), - mapping_scheme->GetThreadsPerBlock()); + LaunchDimensions launch_dimensions(mapping_scheme->GetNumberOfBlocks(), + mapping_scheme->GetThreadsPerBlock()); // TODO(b/110211620): Enable int32 index type for column reduction. + auto reduction_info = dynamic_cast(kernel_info); llvm::Type* index_ty = - is_column_reduction + (reduction_info && !reduction_info->IsRowReduction()) ? 
b_.getInt64Ty() : GetIndexTypeForKernel(unnested_hlo, launch_dimensions.launch_bound(), &b_); - - auto index_typed_constant = [&](uint64 c) -> llvm::Constant* { - return llvm::ConstantInt::get(index_ty, c); - }; - - // For multioutput fusion, one thread needs to output a tuple with pointers to - // all the individual outputs. We could do this at any point in the kernel, - // but we do it at the beginning in the hopes of reducing register pressure, - // since we touch threadIdx.x and blockIdx.x at the beginning of the kernel - // *anyway*. - if (!reduction_info && unnested_hlo->IsMultiOutputFusion()) { - KernelSupportLibrary{&b_}.If("emit_mof_tuple", IsBlock0Thread0(&b_), [&] { - llvm_ir::EmitTuple(GetIrArray(*unnested_hlo, *unnested_hlo), - ConstructIrArrayForOutputs(*unnested_hlo), &b_); - }); - } - - // For each tiled parameter, cast its input IrArray to the corresponding - // reduced shape and keep the reduced shape live during IR emission. - std::vector param_in_reduced_shape_arrays; - std::vector param_reduced_shapes; - absl::Span reduced_dims = - mapping_scheme->GetDimensionsInElements(); - int num_shapes = ConstructInputReducedShapeAndCastInputIrArrayToShape( - *unnested_hlo, param_arrays, param_shmem_buffers, reduced_dims, - ¶m_reduced_shapes, ¶m_in_reduced_shape_arrays); - DCHECK_EQ(num_shapes, num_params); + kernel_info->SetIndexType(index_ty); // Calculate the starting element coordinate within a tile for the current // thread, (y, x) from thread_id. @@ -3119,102 +2587,20 @@ LaunchDimensions IrEmitterUnnested::EmitKernel( kernel_info->SetLaneId( mapping_scheme->GetNumberOfThreadsForDimensionX() == kWarpSize ? x : nullptr); - kernel_info->SetIndexType(index_ty); - KernelSupportLibrary ksl(&b_, llvm_ir::UnrollMode::kDefaultUnroll); - // Curry a few parameters to EmitTiledElementalCodeWithBoundsCheck. - auto emit_tiled_elemental_code_with_bounds_check = - [&](const IrArray::Index& index, const string& loop_name, - llvm::Value* tile_height, llvm::Value* tile_width, - const EmitElementFunction& emit_elem_function) { - EmitTiledElementalCodeWithBoundsCheck(mapping_scheme, index, loop_name, - &ksl, &b_, y, x, tile_height, - tile_width, emit_elem_function); - }; - auto emit_one_tile = [&](const IrArray::Index& output_tile_origin, - absl::Span output_tile_bounds, - bool block_contains_multi_tiles) { - // Calculate the input tile origin from the output tile origin. - const IrArray::Index input_tile_origin( - Permute({0, 2, 1}, output_tile_origin.multidim()), - Permute({0, 2, 1}, output_tile_origin.dims()), - output_tile_origin.GetType()); - - // If shared memory transpose is needed, wait for all threads to reach this - // point, lest we copy a value from tile to output before the other thread - // copies it from input to tile. This is `__syncthreads` in CUDA. - if (!tiled_param_ids.empty()) { - // Copy input parameter values to shared memory buffers: - // tile[y, x] = input[index] - // Note that tile_width and tile_height are flipped here because we are - // reading a transposed tile. - emit_tiled_elemental_code_with_bounds_check( - input_tile_origin, "input", output_tile_bounds[2], - output_tile_bounds[1], - [&](const IrArray::Index& index, llvm::Value* y_loc, - llvm::Value* x_loc, int64 /*x_iter_num*/) { - for (int64 id : tiled_param_ids) { - IrArray& input_in_logical_shape = - param_in_reduced_shape_arrays[id]; - llvm::Value* shmem_buffer = param_shmem_buffers[id]; - // TODO(jlebar): Add AA metadata to this store. 
Tile buffers are - // global variables, so LLVM can't infer much about it. - Store(input_in_logical_shape.EmitReadArrayElement( - index, &b_, "input_element"), - GEP(shmem_buffer, {index_typed_constant(0), y_loc, x_loc})); - } - }); - - // Wait for all threads to reach this point using `__syncthreads` in CUDA. - EmitCallToTargetIntrinsic(TargetIntrinsicID::kBarrierId, {}, {}, &b_); - } - - llvm_ir::TiledParameterInfo tiled_param_info(param_shmem_buffers, y, x); - kernel_info->SetTiledParamInfo(&tiled_param_info); - - // Write to output[index] by emitting code like normal, except that values - // for the tiled parameters are read from the shmem buffers. - emit_tiled_elemental_code_with_bounds_check( - output_tile_origin, "output", output_tile_bounds[1], - output_tile_bounds[2], - [&](const IrArray::Index& index, llvm::Value* y_loc, llvm::Value* x_loc, - int64 x_iter_num) { - kernel_generator.GetTileElementGenerator()( - unnested_hlo, index, kernel_info, y_loc, x_loc, x_iter_num); - }); - - // If a tile block contains multiple tiles and shared memory buffers are - // used, we need to wait for all threads to finish using the shared memory - // buffer for the current tile before we move on to process the next tile - // and overwrite the shared memory buffers. - if (block_contains_multi_tiles && !tiled_param_ids.empty()) { - EmitCallToTargetIntrinsic(TargetIntrinsicID::kBarrierId, {}, {}, &b_); - } - }; - - const BlockPrologueGenerator& block_prologue_generator = - kernel_generator.GetBlockPrologueGenerator(); - if (block_prologue_generator) { - block_prologue_generator(unnested_hlo, kernel_info); - } - - EmitBlock(std::move(emit_one_tile), kernel_info, &ksl, index_ty); - - const BlockEpilogueGenerator& block_epilogue_generator = - kernel_generator.GetBlockEpilogueGenerator(); - if (block_epilogue_generator) { - block_epilogue_generator(unnested_hlo, kernel_info); - } - - return launch_dimensions; + block_prologue_generator(unnested_hlo, kernel_info); + EmitBlock(kernel_info, &ksl, y, x, tile_element_generator); + block_epilogue_generator(unnested_hlo, kernel_info); + UpdateLaunchDimensions(launch_dimensions, kernel_thunk, + ir_emitter_context_->llvm_module()); } // Emits a kernel for the given hlo instruction using a tiled 0-2-1 transpose // algorithm to improve the memory access patterns for the input parameters // with a shape that is a 0-2-1 transpose of the output tensor shape. The caller // is responsible for making sure that it is safe to apply the shared memory -// tranpose on the input parameters. +// transpose on the input parameters. // // // For the purpose of tiling, the output tensors have a logical shape of three @@ -3234,37 +2620,136 @@ LaunchDimensions IrEmitterUnnested::EmitKernel( // // TODO(b/33320379): Here each block transposes 1 tile. It may be more // efficient to launch fewer blocks so each transposes many tiles. 
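Stripped of the GPU specifics (thread blocks, shared-memory staging, and the __syncthreads barriers that EmitHlo021Tile below emits), the per-tile data movement of the 0-2-1 transpose described above amounts to the following host-side sketch over a flattened [Z, Y, X] array (out must be sized Z*Y*X as well):

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // out[z][x][y] = in[z][y][x], processed one (y, x) tile at a time so that
    // both the reads and the writes within a tile stay close together.
    void Transpose021(const std::vector<float>& in, int64_t Z, int64_t Y,
                      int64_t X, int64_t tile, std::vector<float>& out) {
      for (int64_t z = 0; z < Z; ++z) {
        for (int64_t y0 = 0; y0 < Y; y0 += tile) {
          for (int64_t x0 = 0; x0 < X; x0 += tile) {
            for (int64_t y = y0; y < std::min(y0 + tile, Y); ++y) {
              for (int64_t x = x0; x < std::min(x0 + tile, X); ++x) {
                out[(z * X + x) * Y + y] = in[(z * Y + y) * X + x];
              }
            }
          }
        }
      }
    }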
-LaunchDimensions IrEmitterUnnested::EmitHlo021Tile( - HloInstruction* hlo, absl::Span reduced_output_dims, +void IrEmitterUnnested::EmitHlo021Tile( + HloInstruction* hlo, Thunk* kernel_thunk, + absl::Span reduced_output_dims, absl::Span tiled_param_ids) { constexpr int kNumRows = 4; KernelMappingScheme mapping_scheme( reduced_output_dims, /*tile_size_y=*/kWarpSize, - /*tile_size_x=*/kWarpSize, /*req_block_sizes=*/{1, 1, 1}, + /*tile_size_x=*/kWarpSize, /*block_size_z=*/1, /*num_threads_y=*/kNumRows, - /*num_threads_x=*/kWarpSize, &b_); - TileElementGenerator element_generator; - if (hlo->opcode() == HloOpcode::kCopy) { - element_generator = [&](HloInstruction* hlo, - const llvm_ir::IrArray::Index& index, - const KernelCodegenInfo* kernel_info, - llvm::Value* y_loc, llvm::Value* x_loc, - int64 x_iter_num) { - EmitTileElementForCopy(hlo, index, kernel_info, y_loc, x_loc, x_iter_num); - }; - } else { - DCHECK_EQ(hlo->opcode(), HloOpcode::kFusion); - element_generator = - [&](HloInstruction* hlo, const llvm_ir::IrArray::Index& index, - const KernelCodegenInfo* kernel_info, llvm::Value* y_loc, - llvm::Value* x_loc, int64 x_iter_num) { - EmitTileElementForFusion(hlo, index, kernel_info, y_loc, x_loc, - x_iter_num); - }; - } + /*num_threads_x=*/kWarpSize, /*is_dilated_x=*/false, &b_); KernelCodegenInfo kernel_info(&mapping_scheme); - KernelCodeGenerator kernel_generator(std::move(element_generator)); - return EmitKernel(hlo, tiled_param_ids, kernel_generator, &kernel_info); + + std::vector param_arrays; + + // For each tiled parameter, cast its input IrArray to the corresponding + // reduced shape and keep the reduced shape live during IR emission. + std::vector param_in_reduced_shape_arrays; + std::vector param_shmem_buffers(hlo->operand_count(), nullptr); + + for (int64 id = 0; id < hlo->operand_count(); id++) { + const HloInstruction* param = hlo->operand(id); + param_arrays.push_back(GetIrArray(*param, *hlo)); + + if (absl::c_linear_search(tiled_param_ids, id)) { + param_shmem_buffers[id] = + mapping_scheme.GetSharedMemoryBufferForElementType( + llvm_ir::PrimitiveTypeToIrType(param->shape().element_type(), + module_), + IrName(hlo, StrCat("tile", id))); + VLOG(3) << "Added shmem buffer for parameter " << id << ": " + << llvm_ir::DumpToString(*param_shmem_buffers[id]); + Shape reduced_shape = ShapeUtil::MakeShapeWithDescendingLayout( + param->shape().element_type(), + Permute({0, 2, 1}, reduced_output_dims)); + param_in_reduced_shape_arrays.push_back( + param_arrays[id].CastToShape(reduced_shape, &b_)); + } else { + param_in_reduced_shape_arrays.push_back(IrArray()); + } + } + + EmitElementFunction element_generator = + [&](const llvm_ir::IrArray::Index& index, llvm::Value* y_loc, + llvm::Value* x_loc, int64 x_iter_num) { + if (hlo->opcode() == HloOpcode::kCopy) { + EmitTileElementForCopy(hlo, index, &kernel_info, y_loc, x_loc, + x_iter_num, param_shmem_buffers); + } else { + CHECK_EQ(hlo->opcode(), HloOpcode::kFusion); + EmitTileElementForFusion(hlo, index, &kernel_info, y_loc, x_loc, + x_iter_num, param_shmem_buffers); + } + }; + + TileElementGenerator tile_generator = + [&](llvm::Value* y, llvm::Value* x, const IrArray::Index& index, + const string& loop_name, llvm::Value* tile_height, + llvm::Value* tile_width, KernelSupportLibrary* ksl) { + // If shared memory transpose is needed, wait for all threads to reach + // this point, lest we copy a value from tile to output before the other + // thread copies it from input to tile. This is `__syncthreads` in CUDA. 
+ if (!tiled_param_ids.empty()) { + // Calculate the input tile origin from the output tile origin. + const IrArray::Index input_tile_origin( + Permute({0, 2, 1}, index.multidim()), + Permute({0, 2, 1}, index.dims()), index.GetType()); + + // Copy input parameter values to shared memory buffers: + // tile[y, x] = input[index] + // Note that tile_width and tile_height are flipped here because we + // are reading a transposed tile. + EmitTiledElementalCodeWithBoundsCheck( + &mapping_scheme, input_tile_origin, "input", ksl, &b_, y, x, + tile_width, tile_height, + [&](const IrArray::Index& index, llvm::Value* y_loc, + llvm::Value* x_loc, int64 /*x_iter_num*/) { + for (int64 id : tiled_param_ids) { + IrArray& input_in_logical_shape = + param_in_reduced_shape_arrays[id]; + + llvm::Value* shmem_buffer = param_shmem_buffers[id]; + llvm::Value* zero = + llvm::ConstantInt::get(kernel_info.GetIndexType(), 0); + // TODO(jlebar): Add AA metadata to this store. Tile buffers + // are global variables, so LLVM can't infer much about it. + Store(input_in_logical_shape.EmitReadArrayElement( + index, &b_, "input_element"), + GEP(shmem_buffer, {zero, y_loc, x_loc})); + } + }); + + // Wait for all threads to reach this point using `__syncthreads` in + // CUDA. + EmitCallToTargetIntrinsic(TargetIntrinsicID::kBarrierId, {}, {}, &b_); + } + + EmitTiledElementalCodeWithBoundsCheck(&mapping_scheme, index, loop_name, + ksl, &b_, y, x, tile_height, + tile_width, element_generator); + bool block_contains_multi_tiles = + mapping_scheme.GetNumberOfTilesInOneBlock() > 1; + + // If a tile block contains multiple tiles and shared memory buffers are + // used, we need to wait for all threads to finish using the shared + // memory buffer for the current tile before we move on to process the + // next tile and overwrite the shared memory buffers. + if (block_contains_multi_tiles && !tiled_param_ids.empty()) { + EmitCallToTargetIntrinsic(TargetIntrinsicID::kBarrierId, {}, {}, &b_); + } + }; + + BlockPrologueGenerator hlo021_prologue = [&](HloInstruction* hlo, + KernelCodegenInfo* kernel_info) { + // For multioutput fusion, one thread needs to output a tuple + // with pointers to all the individual outputs. We could do this + // at any point in the kernel, but we do it at the beginning in + // the hopes of reducing register pressure, since we touch + // threadIdx.x and blockIdx.x at the beginning of the kernel + // *anyway*. + if (hlo->IsMultiOutputFusion()) { + KernelSupportLibrary{&b_}.If("emit_mof_tuple", IsBlock0Thread0(&b_), [&] { + llvm_ir::EmitTuple(GetIrArray(*hlo, *hlo), + ConstructIrArrayForOutputs(*hlo), &b_); + }); + } + }; + BlockEpilogueGenerator epilogue_generator = [](HloInstruction*, + KernelCodegenInfo*) {}; + EmitKernel(hlo, kernel_thunk, &kernel_info, tile_generator, hlo021_prologue, + epilogue_generator); } namespace { @@ -3282,7 +2767,7 @@ namespace { // the preload tile. If this is not true, we can't use a shmem transpose for P. // // If the computation of output element [z, y, x] only requires the element of -// P with the same indices, the shmem tranpose implementation can be applied +// P with the same indices, the shmem transpose implementation can be applied // to P safely. This is a sufficient but not necessary condition. We check all // the transitive users of P to see if we can find a user that may cause an // exception to the situation. If such a user is not found, we conclude that P @@ -3302,7 +2787,7 @@ namespace { // block. 
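As the comment describes, safety for the shared-memory transpose is decided by walking the transitive users of the parameter and rejecting it if any user may combine elements with different indices. A generic worklist version of that idea over an invented Node type (purely illustrative; the real IsInstructionSafeForShmemTranspose inspects HloInstruction opcodes rather than a boolean flag):

    #include <queue>
    #include <unordered_set>
    #include <vector>

    // Illustrative node type; the real code walks HloInstruction users.
    struct Node {
      std::vector<const Node*> users;
      bool elementwise;  // stand-in for "uses only the element at the same index"
    };

    bool TransitiveUsersKeepIndexing(const Node& param) {
      std::queue<const Node*> worklist;
      std::unordered_set<const Node*> visited;
      for (const Node* u : param.users) {
        if (visited.insert(u).second) worklist.push(u);
      }
      while (!worklist.empty()) {
        const Node* n = worklist.front();
        worklist.pop();
        if (!n->elementwise) return false;  // a user that may mix indices
        for (const Node* u : n->users) {
          if (visited.insert(u).second) worklist.push(u);
        }
      }
      return true;
    }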
// // TODO(bixia): In order to extend this for kInput fusion, that is reduction -// with tranpose, we only need to end the use-chain checking with the input of +// with transpose, we only need to end the use-chain checking with the input of // a reduce operations. In this case, the above description on "output" apply // to the result of such a use-chain, which provides the input to the reduce // operation. @@ -3334,9 +2819,9 @@ bool IsInstructionSafeForShmemTranspose(const HloInstruction* hlo) { } } -// Given a group of input parameters that are 0-2-1 tranpose of the outputs of +// Given a group of input parameters that are 0-2-1 transpose of the outputs of // a fusion kernel, returns the input parameters that are safe for the shared -// memory tranpose implementation. +// memory transpose implementation. // // When a tile based shared memory transpose is used to implement an input with // 0-2-1 transpose, we preload a tile of the input elements @@ -3354,8 +2839,7 @@ std::vector FilterInputsForShmemTranspose(const HloInstruction* fusion, if (IsInstructionSafeForShmemTranspose(input)) { filtered_input_ids.push_back(input_ids[i]); } else { - VLOG(10) << "Input not safe for shmem transpose " << input->ToString() - << "\n"; + VLOG(10) << "Input not safe for shmem transpose " << input->ToString(); } } return filtered_input_ids; @@ -3446,15 +2930,15 @@ bool IrEmitterUnnested::CheckAndEmitHloWithTile021(HloInstruction* hlo) { } } + if (params_012.empty()) { + return false; + } + VLOG(3) << "EmitHlo021Tile Emitting hlo tile 0-2-1" << hlo->ToString(); std::unique_ptr kernel_thunk = BuildKernelThunk(hlo, /*implements_whole_instruction=*/true); - const LaunchDimensions launch_dimensions = - EmitHlo021Tile(hlo, *reduced_dims_021, params_012); - UpdateLaunchDimensions(launch_dimensions, kernel_thunk.get(), - ir_emitter_context_->llvm_module()); + EmitHlo021Tile(hlo, kernel_thunk.get(), *reduced_dims_021, params_012); AddThunkToThunkSequence(std::move(kernel_thunk)); - return true; } @@ -3578,7 +3062,7 @@ bool IsUnrollingColumnReductionBeneficial(const HloInstruction* unnested_hlo, } // namespace -std::tuple +std::pair IrEmitterUnnested::ComputeMappingSchemeAndReductionKind( const HloInstruction* unnested_hlo, const HloInstruction* first_reduce) { const Shape& input_shape = first_reduce->operand(0)->shape(); @@ -3637,12 +3121,10 @@ IrEmitterUnnested::ComputeMappingSchemeAndReductionKind( tile_size_y = kNumElementsPerPartialSum; } - DimensionVector req_block_sizes{block_size_z, 1, 1}; llvm_ir::KernelMappingScheme mapping_scheme( - dims_in_elem, tile_size_y, tile_size_x, req_block_sizes, num_threads_y, - num_threads_x, &b_); - mapping_scheme.SetDilatedX(dilated_x); - return std::make_tuple(mapping_scheme, is_row_reduction); + dims_in_elem, tile_size_y, tile_size_x, block_size_z, num_threads_y, + num_threads_x, dilated_x, &b_); + return std::make_pair(mapping_scheme, is_row_reduction); } Status IrEmitterUnnested::EmitReductionFromOrToContiguousDimensions( @@ -3652,11 +3134,36 @@ Status IrEmitterUnnested::EmitReductionFromOrToContiguousDimensions( HloInstruction* reduce_or_tuple = unnested_hlo->opcode() == HloOpcode::kFusion ? unnested_hlo->fused_expression_root() : unnested_hlo; - absl::Span output_instructions = - GetOutputInstructions(&reduce_or_tuple); - const HloInstruction* first_reduce = - GetFirstReduceInstruction(output_instructions); + // A group of instructions that generate the output for the kernel + // containing the given HLO instruction. 
The result may be an unnested kReduce + // HLO, a nested kReduce HLO of a kInput fusion, or the operands of the tuple + // for a multiple output fusion. + bool returns_tuple = false; + auto output_instructions = ([&]() -> absl::Span { + if (reduce_or_tuple->opcode() == HloOpcode::kReduce) { + return absl::Span(&reduce_or_tuple, 1); + } + CHECK(reduce_or_tuple->opcode() == HloOpcode::kTuple); + returns_tuple = true; + return reduce_or_tuple->operands(); + })(); + std::vector reduce_instructions; + InlinedVector reduction_output_shape_indices; + InlinedVector reducers; + for (int i = 0; i < output_instructions.size(); i++) { + HloInstruction* output_instruction = output_instructions[i]; + if (IsReductionFromOrToContiguousDimensions(*output_instruction)) { + reduce_instructions.push_back(output_instruction); + ShapeIndex idx; + if (returns_tuple) { + idx = {i}; + } + reduction_output_shape_indices.push_back(idx); + reducers.push_back(output_instruction->to_apply()); + } + } + const HloInstruction* first_reduce = reduce_instructions.at(0); if (output_instructions.size() > 1) { TF_RETURN_IF_ERROR( AreFusedReductionOutputsConsistent(output_instructions, first_reduce)); @@ -3688,35 +3195,41 @@ Status IrEmitterUnnested::EmitReductionFromOrToContiguousDimensions( "doesn't set the input layout of " << first_reduce->ToString(); - bool is_row_reduction; - llvm_ir::KernelMappingScheme mapping_scheme; - std::tie(mapping_scheme, is_row_reduction) = + auto mapping_scheme_pair = ComputeMappingSchemeAndReductionKind(unnested_hlo, first_reduce); - ReductionCodegenInfo reduction_info(&mapping_scheme, is_row_reduction); - KernelCodeGenerator kernel_generator( - /*tile_element_generator=*/ - [&](HloInstruction* hlo, const llvm_ir::IrArray::Index& index, - const KernelCodegenInfo* kernel_info, llvm::Value* y_loc, + bool is_row_reduction = mapping_scheme_pair.second; + ReductionCodegenInfo reduction_info(&mapping_scheme_pair.first, + is_row_reduction); + EmitElementFunction emit_reduction_tile = + [&](const llvm_ir::IrArray::Index& index, llvm::Value* y_loc, llvm::Value* x_loc, int64 x_iter_num) { - EmitTileElementForReduction(hlo, index, kernel_info, y_loc, x_loc, - x_iter_num); + EmitTileElementForReduction(unnested_hlo, input_shape, + output_instructions, index, &reduction_info, + reducers, x_iter_num); + }; + + EmitKernel( + unnested_hlo, kernel_thunk.get(), &reduction_info, + /*tile_element_generator=*/ + [&](llvm::Value* y, llvm::Value* x, const IrArray::Index& index, + const string& loop_name, llvm::Value* tile_height, + llvm::Value* tile_width, KernelSupportLibrary* ksl) { + EmitTiledElementalCodeWithBoundsCheck( + &mapping_scheme_pair.first, index, loop_name, ksl, &b_, y, x, + tile_height, tile_width, emit_reduction_tile); }, /*block_prologue_generator=*/ [&](HloInstruction* hlo, KernelCodegenInfo* kernel_info) { - EmitPrologueForReduction(hlo, kernel_info); + EmitPrologueForReduction(hlo, kernel_info, reduce_instructions); }, - /*block_epilogue_generator*/ + /*block_epilogue_generator=*/ [&](HloInstruction* hlo, KernelCodegenInfo* kernel_info) { - EmitEpilogueForReduction(hlo, kernel_info); + EmitEpilogueForReduction(hlo, kernel_info, reduce_instructions, + reduction_output_shape_indices, reducers); }); - LaunchDimensions launch_dimensions = - EmitKernel(unnested_hlo, {}, kernel_generator, &reduction_info); - UpdateLaunchDimensions(launch_dimensions, kernel_thunk.get(), - ir_emitter_context_->llvm_module()); - thunks.push_back(std::move(kernel_thunk)); - std::unique_ptr sequential_thunk = + auto 
sequential_thunk = absl::make_unique(std::move(thunks), unnested_hlo); AddThunkToThunkSequence(std::move(sequential_thunk)); diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h index e5177c28484..efc3f8f3ff6 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h @@ -20,6 +20,8 @@ limitations under the License. #include "tensorflow/compiler/xla/service/gpu/ir_emitter.h" #include "tensorflow/compiler/xla/service/gpu/sequential_thunk.h" #include "tensorflow/compiler/xla/service/gpu/thunk.h" +#include "tensorflow/compiler/xla/service/gpu/thunk_emitter.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h" #include "tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h" @@ -47,16 +49,9 @@ namespace gpu { // within a kernel function using FusedIrEmitter. (FusedIrEmitter is not // really an IrEmitter, but is more an "IR generator generator".) // -class IrEmitterUnnested : public IrEmitter { +class IrEmitterUnnested : public IrEmitter, + private ThunkEmitter::EmissionContext { public: - // Parameter block_contains_multi_tiles indicates whether a tile block - // consists of multiple tiles or not. If the tile block contains only one - // tile, there is no need to use atomic operation to accumulate a local result - // to a global result to implement reduction. - using TileGenerator = - std::function output_tile_bounds, - bool block_contains_multi_tiles)>; // KernelCodegenInfo records the common information to support the code // generation for a kernel to process tensor elements by blocks. A block of // tensor elements may contain one or multiple tiles. The code generators that @@ -68,29 +63,21 @@ class IrEmitterUnnested : public IrEmitter { public: explicit KernelCodegenInfo(llvm_ir::KernelMappingScheme* mapping_scheme) : mapping_scheme_(mapping_scheme), - tiled_param_info_(nullptr), lane_id_(nullptr), index_ty_(nullptr) {} virtual ~KernelCodegenInfo() {} void SetLaneId(llvm::Value* v) { lane_id_ = v; } void SetIndexType(llvm::Type* t) { index_ty_ = t; } - void SetTiledParamInfo(llvm_ir::TiledParameterInfo* tiled_param_info) { - tiled_param_info_ = tiled_param_info; - } llvm::Value* GetLaneId() const { return lane_id_; } llvm_ir::KernelMappingScheme* GetKernelMappingScheme() const { return mapping_scheme_; } - llvm_ir::TiledParameterInfo* GetTiledParameterInfo() const { - return tiled_param_info_; - } llvm::Type* GetIndexType() const { return index_ty_; } protected: llvm_ir::KernelMappingScheme* mapping_scheme_; - llvm_ir::TiledParameterInfo* tiled_param_info_; llvm::Value* lane_id_; llvm::Type* index_ty_; }; @@ -101,6 +88,7 @@ class IrEmitterUnnested : public IrEmitter { // A function object to finalize the code generation for a tile block. using BlockEpilogueGenerator = std::function; + // A function object to generate code to process one element in a tile. // // hlo: the instruction for which the code is generated for. @@ -110,38 +98,14 @@ class IrEmitterUnnested : public IrEmitter { // kernel_info: Other information to support the kernel code generation. // x_iter_num: When a thread process N elements in the X dimension, x_iter_num // has a value of 0..N-1 to identify the element being process. 
- using TileElementGenerator = std::function; - // KernelCodeGenerator records the code generator objects that generate code - // for tile elements or tile block prologue/epilogue. - class KernelCodeGenerator { - public: - explicit KernelCodeGenerator( - TileElementGenerator tile_element_generator, - BlockPrologueGenerator block_prologue_generator = {}, - BlockEpilogueGenerator block_epilogue_generator = {}) - : tile_element_generator_(std::move(tile_element_generator)), - block_prologue_generator_(std::move(block_prologue_generator)), - block_epilogue_generator_(std::move(block_epilogue_generator)) {} - - const TileElementGenerator& GetTileElementGenerator() const { - return tile_element_generator_; - } - const BlockPrologueGenerator& GetBlockPrologueGenerator() const { - return block_prologue_generator_; - } - const BlockEpilogueGenerator& GetBlockEpilogueGenerator() const { - return block_epilogue_generator_; - } - - private: - TileElementGenerator tile_element_generator_; - BlockPrologueGenerator block_prologue_generator_; - BlockEpilogueGenerator block_epilogue_generator_; - }; + using TileElementGenerator = std::function; IrEmitterUnnested(const HloModuleConfig& hlo_module_config, const HloComputation* hlo_computation, @@ -157,7 +121,8 @@ class IrEmitterUnnested : public IrEmitter { Status DefaultAction(HloInstruction* hlo) override; // IrEmitterUnnested handles the following instructions differently from - // IrEmitter. + // IrEmitter. It also mixes in some special handling for custom kernels + // via the ThunkEmitter. Status HandleCopy(HloInstruction* copy) override; Status HandleConditional(HloInstruction* conditional) override; Status HandleConvolution(HloInstruction* convolution) override; @@ -199,10 +164,30 @@ class IrEmitterUnnested : public IrEmitter { private: // Add a owning Thunk object to the thunk sequence. - void AddThunkToThunkSequence(std::unique_ptr thunk) { + void AddThunkToThunkSequence(std::unique_ptr thunk) override { thunk_sequence_->emplace_back(std::move(thunk)); } + // A convenient helper for calling BufferAssignment::GetUniqueSlice. + StatusOr MaybeGetAllocationSlice( + const HloInstruction& hlo, const ShapeIndex& index) const override { + return ir_emitter_context_->buffer_assignment().GetUniqueSlice(&hlo, index); + } + + BufferAllocation::Slice GetAllocationSlice( + const HloInstruction& hlo, const ShapeIndex& index = {}) const { + return MaybeGetAllocationSlice(hlo, index).ConsumeValueOrDie(); + } + + int64 ByteSizeOf(const Shape& shape) const override { + return llvm_ir::ByteSizeOf( + shape, ir_emitter_context_->llvm_module()->getDataLayout()); + } + + const se::Platform* platform() const override { + return ir_emitter_context_->platform(); + } + // Builds the prototype of the IR kernel for `inst` and adds it to the module. // This kernel takes as arguments pointers to the given buffer allocations. llvm::Function* BuildKernelPrototype( @@ -227,7 +212,7 @@ class IrEmitterUnnested : public IrEmitter { // and first_reduce are the same instruction. For a kInput fusion, // unnested_hlo is the fusion instruction while first_reduce is the first // reduce op. - std::tuple + std::pair ComputeMappingSchemeAndReductionKind(const HloInstruction* unnested_hlo, const HloInstruction* first_reduce); @@ -242,76 +227,72 @@ class IrEmitterUnnested : public IrEmitter { // Returns true if a 0-2-1 tiling algorithm is already used to emit the kernel // for the hlo instruction. 
bool CheckAndEmitHloWithTile021(HloInstruction* hlo); + // Emits a kernel for the hlo instruction using a 0-2-1 tiling algorithm and - // returns the launch dimensions for the kernel. This is a helper to support + // sets the corresponding launch dimensions. This is a helper to support // the implementation of CheckAndEmitHloWithTile021. - LaunchDimensions EmitHlo021Tile(HloInstruction* hlo, - absl::Span reduced_output_dims, - absl::Span tiled_param_ids); - // Emits a kernel for an unnested HLO instruction. - LaunchDimensions EmitKernel(HloInstruction* unnested_hlo, - absl::Span param_ids, - const KernelCodeGenerator& kernel_generator, - KernelCodegenInfo* kernel_info); - void EmitBlock(const TileGenerator& emit_one_tile, - KernelCodegenInfo* kernel_info, KernelSupportLibrary* ksl, - llvm::Type* index_ty); + void EmitHlo021Tile(HloInstruction* hlo, Thunk* kernel_thunk, + absl::Span reduced_output_dims, + absl::Span tiled_param_ids); + + // Emits a kernel for an unnested HLO instruction, set the `kernel_thunk` + // launch dimensions. + void EmitKernel(HloInstruction* unnested_hlo, Thunk* kernel_thunk, + KernelCodegenInfo* kernel_info, + TileElementGenerator tile_element_generator, + BlockPrologueGenerator block_prologue_generator, + BlockEpilogueGenerator block_epilogue_generator); + + void EmitBlock(KernelCodegenInfo* kernel_info, KernelSupportLibrary* ksl, + llvm::Value* y, llvm::Value* x, + TileElementGenerator tile_generator); + // Emits code to process a tensor element in a tile for the given kCopy HLO // that performs a 0-2-1 transpose. - void EmitTileElementForCopy(HloInstruction* hlo, - const llvm_ir::IrArray::Index& index, - const KernelCodegenInfo* kernel_info, - llvm::Value* y_loc, llvm::Value* x_loc, - int64 x_iter_num); + void EmitTileElementForCopy( + HloInstruction* hlo, const llvm_ir::IrArray::Index& index, + const KernelCodegenInfo* kernel_info, llvm::Value* y_loc, + llvm::Value* x_loc, int64 x_iter_num, + absl::Span param_shmem_buffers); + // Emits code to process a tensor element in a tile for the given kLoop fusion // HLO containing parameters that are 0-2-1 transpose of its outputs. - void EmitTileElementForFusion(HloInstruction* hlo, - const llvm_ir::IrArray::Index& index, - const KernelCodegenInfo* kernel_info, - llvm::Value* y_loc, llvm::Value* x_loc, - int64 x_iter_num); + void EmitTileElementForFusion( + HloInstruction* hlo, const llvm_ir::IrArray::Index& index, + const KernelCodegenInfo* kernel_info, llvm::Value* y_loc, + llvm::Value* x_loc, int64 x_iter_num, + absl::Span param_shmem_buffers); + // Emits code to process a tensor element in a tile for the given input hlo // that is either a unnested kReduce or a kInput fusion. - void EmitTileElementForReduction(HloInstruction* unnested_hlo, - const llvm_ir::IrArray::Index& index, - const KernelCodegenInfo* kernel_info, - llvm::Value* y_loc, llvm::Value* x_loc, - int64 x_iter_num); + void EmitTileElementForReduction( + HloInstruction* unnested_hlo, const Shape& reduction_operand_shape, + absl::Span output_instructions, + const llvm_ir::IrArray::Index& index, + const KernelCodegenInfo* kernel_info, + absl::Span reducers, int64 x_iter_num); + // Prepares for the code generation for a tile block of a reduction kernel. 
- void EmitPrologueForReduction(HloInstruction* unnested_hlo, - KernelCodegenInfo* kernel_info); + void EmitPrologueForReduction( + HloInstruction* unnested_hlo, KernelCodegenInfo* kernel_info, + absl::Span reduce_instructions); + void EmitPrologueForOneReduction(HloInstruction* unnested_hlo, HloInstruction* reduce_inst, int reduce_idx, KernelCodegenInfo* kernel_info, - GpuElementalIrEmitter* elemental_emitter, - ShapeIndex output_shape_index); + GpuElementalIrEmitter* elemental_emitter); // Wraps up the code generation for a tile block of a reduction kernel. - void EmitEpilogueForReduction(HloInstruction* unnested_hlo, - KernelCodegenInfo* kernel_info); + void EmitEpilogueForReduction( + HloInstruction* unnested_hlo, KernelCodegenInfo* kernel_info, + absl::Span reduce_instructions, + absl::Span reduction_output_shape_indices, + absl::Span reducers); // For each reducer, emits the shuffle-down loop to accumulate the partial // result to the global result. void EmitFullWarpShuffleDownLoopForAllReduces( absl::Span reducers, absl::Span partial_result_addresses); - // Generates the IrArray for each input of an hlo and returns a vector that - // constains such IrArrays. - std::vector ConstructIrArrayForInputs( - const HloInstruction& hlo); - - // For each input of the `hlo` instruction, checks its value in - // `param_buffers` to find out whether the input has a reduced shape. If the - // input has a reduced shape, constructs the reduced shape for the input and - // casts the original input IrArray in `param_arrays` to the reduced shape. - // Return the total number of inputs. - int ConstructInputReducedShapeAndCastInputIrArrayToShape( - const HloInstruction& hlo, - const std::vector& param_arrays, - const std::vector& param_buffers, - absl::Span reduced_output_dims, - std::vector* param_reduced_shapes, - std::vector* param_in_reduced_shape_arrays); - // Returns a KernelThunk that invokes the kernel emitted for `inst`. The // caller needs to make sure `inst` outlives the lifetime of the returned // Thunk object. The kernel implementation will be unrolled if unroll_factor @@ -322,39 +303,11 @@ class IrEmitterUnnested : public IrEmitter { const HloInstruction* inst, bool implements_whole_instruction, int unroll_factor = 1); - // Returns a FftThunk that calls cuFFT to implement `inst`. - std::unique_ptr BuildFftThunk(const HloInstruction* inst); - - // Returns a CholeskyThunk that calls cuSolver to implement `inst`. - std::unique_ptr BuildCholeskyThunk(const HloInstruction* inst); - - // Returns a TriangularSolveThunk that calls cuBlas to implement `inst`. - std::unique_ptr BuildTriangularSolveThunk(const HloInstruction* inst); - - // Returns a GemmThunk that calls gemm to implement `inst`. The caller needs - // to make sure `inst` outlives the lifetime of the returned Thunk object. - std::unique_ptr BuildGemmThunk(const HloInstruction* inst); - // Returns a thunk that, given a reduce or select-and-scatter op, initializes // its memory to the appropriate initial value. StatusOr> BuildInitializerThunk( HloInstruction* hlo, const ShapeIndex& index = {}); - // Returns a thunk that calls host-to-device cuMemcpy to implement `inst`. - std::unique_ptr BuildHostToDeviceCopyThunk(const HloInstruction* inst); - - // Returns a thunk that calls device-to-device cuMemcpy to implement `inst`. - std::unique_ptr BuildDeviceToDeviceCopyThunk( - const HloInstruction* inst); - - // Returns an InfeedThunk that performs a host-to-device memcpy to implement - // `inst`. 
- std::unique_ptr BuildInfeedThunk(const HloInstruction* inst); - - // Returns an OutfeedThunk that performs a device-to-host memcpy to implement - // `inst`. - std::unique_ptr BuildOutfeedThunk(const HloInstruction* inst); - // Returns a WhileThunk that invokes thunk sequences for 'condition' and // 'body' sub-computations of while instruction 'hlo'. std::unique_ptr BuildWhileThunk(const HloInstruction* hlo); diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD index 2f73fd0b3d4..db26d36c71a 100644 --- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD +++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD @@ -16,12 +16,12 @@ cc_library( name = "llvm_gpu_backend", srcs = [ "dump_ir_pass.cc", - "nvptx_backend_lib.cc", + "gpu_backend_lib.cc", "utils.cc", ], hdrs = [ "dump_ir_pass.h", - "nvptx_backend_lib.h", + "gpu_backend_lib.h", "utils.h", ], deps = [ @@ -30,6 +30,7 @@ cc_library( "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla/service:hlo_module_config", + "//tensorflow/compiler/xla/service/gpu:gpu_types", "//tensorflow/compiler/xla/service/llvm_ir:llvm_util", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc similarity index 54% rename from tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc rename to tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc index 9f52f09004b..84616f3a37b 100644 --- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc +++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc @@ -13,8 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.h" +#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h" +#include #include #include #include @@ -40,6 +41,7 @@ limitations under the License. #include "llvm/Support/CommandLine.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/FormattedStream.h" +#include "llvm/Support/Program.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/TargetSelect.h" #include "llvm/Support/ToolOutputFile.h" @@ -65,6 +67,9 @@ namespace xla { namespace gpu { namespace { +// Inline threshold value to use in LLVM AMDGPU backend. +const int kAMDGPUInlineThreshold = 0x100000; + // Default inline threshold value to use in llvm. const int kDefaultInlineThreshold = 1100; @@ -124,7 +129,7 @@ void InitializePasses(llvm::PassRegistry* pass_registry) { // Returns the TargetMachine, given a triple. 
std::unique_ptr GetTargetMachine( llvm::Triple triple, absl::string_view cpu_name, - const HloModuleConfig& hlo_module_config) { + const HloModuleConfig& hlo_module_config, absl::string_view feature_str) { std::string error; const llvm::Target* target = TargetRegistry::lookupTarget("", triple, error); if (target == nullptr) { @@ -155,8 +160,9 @@ std::unique_ptr GetTargetMachine( codegen_opt_level = CodeGenOpt::None; } return absl::WrapUnique(target->createTargetMachine( - triple.str(), llvm_ir::AsStringRef(cpu_name), "+ptx60", target_options, - getRelocModel(), getCodeModel(), codegen_opt_level)); + triple.str(), llvm_ir::AsStringRef(cpu_name), + llvm_ir::AsStringRef(feature_str), target_options, getRelocModel(), + getCodeModel(), codegen_opt_level)); } // Adds the standard LLVM optimization passes, based on the speed optimization @@ -166,13 +172,14 @@ std::unique_ptr GetTargetMachine( void AddOptimizationPasses(unsigned opt_level, unsigned size_level, llvm::TargetMachine* target_machine, llvm::legacy::PassManagerBase* module_passes, - llvm::legacy::FunctionPassManager* function_passes) { + llvm::legacy::FunctionPassManager* function_passes, + int inline_threshold) { PassManagerBuilder builder; builder.OptLevel = opt_level; builder.SizeLevel = size_level; if (opt_level > 1) { - builder.Inliner = llvm::createFunctionInliningPass(kDefaultInlineThreshold); + builder.Inliner = llvm::createFunctionInliningPass(inline_threshold); } else { // Only inline functions marked with "alwaysinline". builder.Inliner = llvm::createAlwaysInlinerLegacyPass(); @@ -240,13 +247,13 @@ void FeedLLVMWithFlags(const std::vector& cl_opts) { llvm::cl::ParseCommandLineOptions(fake_argv.size(), &fake_argv[0]); } -// Returns whether the module could use any libdevice functions. This function -// may have false positives -- the module might not use libdevice even if this -// function returns true. -bool CouldNeedLibdevice(const llvm::Module& module) { +// Returns whether the module could use any device bitcode library functions. +// This function may have false positives -- the module might not use libdevice +// on NVPTX or ROCm-Device-Libs on AMDGPU even if this function returns true. +bool CouldNeedDeviceBitcode(const llvm::Module& module) { for (const llvm::Function& function : module.functions()) { // This is a conservative approximation -- not all such functions are in - // libdevice. + // libdevice or ROCm-Device-Libs. if (!function.isIntrinsic() && function.isDeclaration()) { return true; } @@ -254,11 +261,41 @@ bool CouldNeedLibdevice(const llvm::Module& module) { return false; } +// Links the module with a vector of path to bitcode modules. +// The caller must guarantee that the paths exist. 
+Status LinkWithBitcodeVector(llvm::Module* module, + const std::vector& bitcode_path_vector) { + llvm::Linker linker(*module); + + for (auto& bitcode_path : bitcode_path_vector) { + if (!tensorflow::Env::Default()->FileExists(bitcode_path).ok()) { + LOG(ERROR) << "bitcode module is required by this HLO module but was " + "not found at " + << bitcode_path; + return xla::InternalError("bitcode module not found at %s", bitcode_path); + } + + std::unique_ptr bitcode_module = + LoadIRModule(bitcode_path, &module->getContext()); + if (linker.linkInModule( + std::move(bitcode_module), llvm::Linker::Flags::LinkOnlyNeeded, + [](Module& M, const StringSet<>& GVS) { + internalizeModule(M, [&GVS](const GlobalValue& GV) { + return !GV.hasName() || (GVS.count(GV.getName()) == 0); + }); + })) { + return xla::InternalError("Error linking bitcode module from %s", + bitcode_path); + } + } + return Status::OK(); +} + // Links libdevice into the given module if the module needs libdevice. Status LinkLibdeviceIfNecessary(llvm::Module* module, std::pair compute_capability, const string& libdevice_dir_path) { - if (!CouldNeedLibdevice(*module)) { + if (!CouldNeedDeviceBitcode(*module)) { return Status::OK(); } @@ -274,38 +311,20 @@ Status LinkLibdeviceIfNecessary(llvm::Module* module, } VLOG(1) << "Linking with libdevice from: " << libdevice_path; - std::unique_ptr libdevice_module = - LoadIRModule(libdevice_path, &module->getContext()); - - llvm::Linker linker(*module); - if (linker.linkInModule( - std::move(libdevice_module), llvm::Linker::Flags::LinkOnlyNeeded, - [](Module& M, const StringSet<>& GVS) { - internalizeModule(M, [&GVS](const GlobalValue& GV) { - return !GV.hasName() || (GVS.count(GV.getName()) == 0); - }); - })) { - return xla::InternalError("Error linking libdevice from %s", - libdevice_path); - } - return Status::OK(); + return LinkWithBitcodeVector(module, {libdevice_path}); } -StatusOr CompileModuleToPtx(llvm::Module* module, - std::pair compute_capability, - const HloModuleConfig& hlo_module_config, - const string& libdevice_dir_path) { - // If the module has no functions or globals, there's nothing to compile. Just - // return an empty string. - if (module->empty() && module->global_empty()) { - VLOG(2) << "Module '" << module->getName().str() - << "' is empty. Skipping compilation."; - return string(); - } +Status NVPTXTargetModuleLinker(llvm::Module* module, GpuVersion gpu_version, + const HloModuleConfig& hlo_module_config, + const string& device_bitcode_dir_path) { // Link the input module with libdevice, to pull in implementations of some // builtins. - TF_RETURN_IF_ERROR( - LinkLibdeviceIfNecessary(module, compute_capability, libdevice_dir_path)); + auto compute_capability = absl::get_if>(&gpu_version); + if (!compute_capability) { + return xla::InternalError("Incompatible compute capability was specified."); + } + TF_RETURN_IF_ERROR(LinkLibdeviceIfNecessary(module, *compute_capability, + device_bitcode_dir_path)); // Set the flush-denormals-to-zero flag on the module so the NVVM reflect pass // can access it. @@ -319,6 +338,31 @@ StatusOr CompileModuleToPtx(llvm::Module* module, } } + return Status::OK(); +} + +std::unique_ptr NVPTXGetTargetMachine( + llvm::Triple target_triple, std::pair compute_capability, + const HloModuleConfig& hlo_module_config) { + // Figure out the exact name of the processor as known to the NVPTX backend + // from the gpu_architecture flag. 
+ return GetTargetMachine(target_triple, GetSmName(compute_capability), + hlo_module_config, "+ptx60"); +} + +using TargetModuleLinker = std::function; + +Status LinkAndOptimizeModule(llvm::Module* module, GpuVersion gpu_version, + const HloModuleConfig& hlo_module_config, + const string& device_bitcode_dir_path, + TargetModuleLinker module_linker, + llvm::Triple default_target_triple, + llvm::TargetMachine* target_machine, + int inline_threshold) { + TF_RETURN_IF_ERROR(module_linker(module, gpu_version, hlo_module_config, + device_bitcode_dir_path)); + IrDumpingPassManager module_passes(module->getModuleIdentifier(), "", false); // Add an appropriate TargetLibraryInfo pass for the module's triple. @@ -332,13 +376,9 @@ StatusOr CompileModuleToPtx(llvm::Module* module, llvm::Triple target_triple = llvm::Triple(module->getTargetTriple()); if (target_triple.getArch() == llvm::Triple::UnknownArch) { LOG(WARNING) << "target triple not found in the module"; - target_triple = llvm::Triple("nvptx64-unknown-unknown"); + target_triple = default_target_triple; } - // Figure out the exact name of the processor as known to the NVPTX backend - // from the gpu_architecture flag. - std::unique_ptr target_machine = GetTargetMachine( - target_triple, GetSmName(compute_capability), hlo_module_config); module_passes.add(llvm::createTargetTransformInfoWrapperPass( target_machine->getTargetIRAnalysis())); @@ -365,9 +405,10 @@ StatusOr CompileModuleToPtx(llvm::Module* module, LOG(ERROR) << std::string(80, '*'); } + // Add optimization passes, and set inliner threshold. AddOptimizationPasses(opt_level, - /*size_level=*/0, target_machine.get(), &module_passes, - &function_passes); + /*size_level=*/0, target_machine, &module_passes, + &function_passes, inline_threshold); // Loop unrolling exposes more opportunities for SROA. Therefore, we run SROA // again after the standard optimization passes [http://b/13329423]. @@ -394,13 +435,12 @@ StatusOr CompileModuleToPtx(llvm::Module* module, function_passes.doFinalization(); module_passes.run(*module); - // Finally, produce PTX. - return EmitModuleToPTX(module, target_machine.get()); + return Status::OK(); } // One-time module initializer. // Must be called only once -- DO NOT CALL DIRECTLY. -void GPUBackendInit(const HloModuleConfig& hlo_module_config) { +void NVPTXBackendInit(const HloModuleConfig& hlo_module_config) { // Feed all customized flags here, so we can override them with llvm_cl_opts // without redeploy the compiler for development purpose. @@ -446,25 +486,267 @@ void GPUBackendInit(const HloModuleConfig& hlo_module_config) { } // namespace -StatusOr CompileToPtx(llvm::Module* module, - std::pair compute_capability, +namespace nvptx { + +StatusOr CompileToPtx(llvm::Module* module, GpuVersion gpu_version, const HloModuleConfig& hlo_module_config, const string& libdevice_dir_path) { static std::once_flag backend_init_flag; - std::call_once(backend_init_flag, GPUBackendInit, hlo_module_config); + std::call_once(backend_init_flag, NVPTXBackendInit, hlo_module_config); string ptx; + std::unique_ptr target_machine; { tensorflow::profiler::TraceMe activity( [&] { return absl::StrCat("Compiling IR:", module->getName().str()); }, tensorflow::profiler::TraceMeLevel::kInfo); XLA_SCOPED_LOGGING_TIMER("Compile module " + module->getName().str()); - TF_ASSIGN_OR_RETURN( - ptx, CompileModuleToPtx(module, compute_capability, hlo_module_config, - libdevice_dir_path)); + + // If the module has no functions or globals, there's nothing to compile. 
+ // Just return an empty string. + if (module->empty() && module->global_empty()) { + VLOG(2) << "Module '" << module->getName().str() + << "' is empty. Skipping compilation."; + return string(); + } + + auto compute_capability = absl::get_if>(&gpu_version); + if (!compute_capability) { + return xla::InternalError( + "Incompatible compute capability was specified."); + } + + llvm::Triple default_target_triple("nvptx64-unknown-unknown"); + // Construct LLVM TargetMachine for NVPTX. + std::unique_ptr target_machine = NVPTXGetTargetMachine( + default_target_triple, *compute_capability, hlo_module_config); + + // Link with libdeivce, and optimize the LLVM module. + TF_RETURN_IF_ERROR(LinkAndOptimizeModule( + module, gpu_version, hlo_module_config, libdevice_dir_path, + NVPTXTargetModuleLinker, default_target_triple, target_machine.get(), + kDefaultInlineThreshold)); + + // Lower optimized LLVM module to PTX. + ptx = EmitModuleToPTX(module, target_machine.get()); } return ptx; } +} // namespace nvptx + +namespace { + +// Gets the ROCm-Device-Libs filenames for a particular AMDGPU version. +static std::vector GetROCDLPaths(int amdgpu_version, + const string& rocdl_dir_path) { + // AMDGPU version-neutral bitcodes. + static std::vector* rocdl_filenames = new std::vector( + {"hc.amdgcn.bc", "opencl.amdgcn.bc", "ocml.amdgcn.bc", "ockl.amdgcn.bc", + "oclc_finite_only_off.amdgcn.bc", "oclc_daz_opt_off.amdgcn.bc", + "oclc_correctly_rounded_sqrt_on.amdgcn.bc", + "oclc_unsafe_math_off.amdgcn.bc"}); + + // Construct full path to ROCDL bitcode libraries. + std::vector result; + for (auto& filename : *rocdl_filenames) { + result.push_back(tensorflow::io::JoinPath(rocdl_dir_path, filename)); + } + + // Add AMDGPU version-specific bitcodes. + result.push_back(tensorflow::io::JoinPath( + rocdl_dir_path, + absl::StrCat("oclc_isa_version_", amdgpu_version, ".amdgcn.bc"))); + return result; +} + +// Emits the given module to HSA Code Object. target_machine is an initialized +// TargetMachine for the AMDGPU target. +StatusOr> EmitModuleToHsaco( + Module* module, llvm::TargetMachine* target_machine) { + auto* env = tensorflow::Env::Default(); + std::vector tempdir_vector; + env->GetLocalTempDirectories(&tempdir_vector); + if (tempdir_vector.empty()) { + return xla::InternalError( + "Unable to locate a temporary directory for compile-time artifacts."); + } + std::string tempdir_name = tempdir_vector.front(); + VLOG(1) << "Compile-time artifacts located at: " << tempdir_name; + + // Prepare filenames for all stages of compilation: + // IR, binary ISA, and HSACO. + std::string ir_filename = absl::StrCat(module->getModuleIdentifier(), ".ll"); + std::string ir_path = tensorflow::io::JoinPath(tempdir_name, ir_filename); + + std::string isabin_filename = + absl::StrCat(module->getModuleIdentifier(), ".o"); + std::string isabin_path = + tensorflow::io::JoinPath(tempdir_name, isabin_filename); + + std::string hsaco_filename = + absl::StrCat(module->getModuleIdentifier(), ".hsaco"); + std::string hsaco_path = + tensorflow::io::JoinPath(tempdir_name, hsaco_filename); + + std::error_code ec; + + // Dump LLVM IR. + std::unique_ptr ir_fs( + new llvm::raw_fd_ostream(ir_path, ec, llvm::sys::fs::F_None)); + module->print(*ir_fs, nullptr); + ir_fs->flush(); + + // Emit GCN ISA binary. + // The extension is stripped by IrDumpingPassManager, so we need to + // get creative to add a suffix. 
+ std::string module_id = module->getModuleIdentifier(); + IrDumpingPassManager codegen_passes( + ReplaceFilenameExtension(tensorflow::io::Basename(module_id), + "-amdgpu.dummy"), + "", false); + codegen_passes.add(new llvm::TargetLibraryInfoWrapperPass( + llvm::Triple(module->getTargetTriple()))); + llvm::SmallVector stream; + llvm::raw_svector_ostream pstream(stream); + std::unique_ptr isabin_fs( + new llvm::raw_fd_ostream(isabin_path, ec, llvm::sys::fs::F_Text)); + module->setDataLayout(target_machine->createDataLayout()); + target_machine->addPassesToEmitFile(codegen_passes, *isabin_fs, nullptr, + llvm::TargetMachine::CGFT_ObjectFile); + codegen_passes.run(*module); + isabin_fs->flush(); + + // Locate lld. + // TODO(whchung@gmail.com): change to tensorflow::ROCmRoot() after + // ROCm-Device-Libs PR. + std::string lld_path = tensorflow::io::JoinPath("/opt/rocm", "hcc/bin"); + auto lld_program = llvm::sys::findProgramByName("ld.lld", {lld_path}); + if (!lld_program) { + return xla::InternalError("unable to find ld.lld in PATH: %s", + lld_program.getError().message()); + } + std::vector lld_args{ + llvm_ir::AsStringRef("ld.lld"), + llvm_ir::AsStringRef("-flavor"), + llvm_ir::AsStringRef("gnu"), + llvm_ir::AsStringRef("-shared"), + llvm_ir::AsStringRef(isabin_path), + llvm_ir::AsStringRef("-o"), + llvm_ir::AsStringRef(hsaco_path), + }; + + std::string error_message; + int lld_result = + llvm::sys::ExecuteAndWait(*lld_program, llvm_ir::AsArrayRef(lld_args), + llvm::None, {}, 0, 0, &error_message); + + if (lld_result) { + return xla::InternalError("ld.lld execute fail: %s", error_message); + } + + // Read HSACO. + std::ifstream hsaco_file(hsaco_path, std::ios::binary | std::ios::ate); + std::ifstream::pos_type hsaco_file_size = hsaco_file.tellg(); + + std::vector hsaco(hsaco_file_size); + hsaco_file.seekg(0, std::ios::beg); + hsaco_file.read(reinterpret_cast(&hsaco[0]), hsaco_file_size); + return hsaco; +} + +// Links ROCm-Device-Libs into the given module if the module needs it. +Status LinkROCDLIfNecessary(llvm::Module* module, int amdgpu_version, + const string& rocdl_dir_path) { + if (!CouldNeedDeviceBitcode(*module)) { + return Status::OK(); + } + + return LinkWithBitcodeVector(module, + GetROCDLPaths(amdgpu_version, rocdl_dir_path)); +} + +Status AMDGPUTargetModuleLinker(llvm::Module* module, GpuVersion gpu_version, + const HloModuleConfig& hlo_module_config, + const string& device_bitcode_dir_path) { + // Link the input module with ROCDL. + auto amdgpu_version = absl::get_if(&gpu_version); + if (!amdgpu_version) { + return xla::InternalError( + "Incompatible AMD GCN ISA version was specified."); + } + TF_RETURN_IF_ERROR( + LinkROCDLIfNecessary(module, *amdgpu_version, device_bitcode_dir_path)); + + return Status::OK(); +} + +std::unique_ptr AMDGPUGetTargetMachine( + llvm::Triple target_triple, int amdgpu_version, + const HloModuleConfig& hlo_module_config) { + return GetTargetMachine(target_triple, absl::StrCat("gfx", amdgpu_version), + hlo_module_config, "-code-object-v3"); +} + +void AMDGPUBackendInit(const HloModuleConfig& hlo_module_config) { + llvm_ir::InitializeLLVMCommandLineOptions(hlo_module_config); + + // Initialize the AMDGPU target; it's the only target we link with, so call + // its specific initialization functions instead of the catch-all + // InitializeAll*. 
+#if TENSORFLOW_USE_ROCM + LLVMInitializeAMDGPUTarget(); + LLVMInitializeAMDGPUTargetInfo(); + LLVMInitializeAMDGPUTargetMC(); + LLVMInitializeAMDGPUAsmPrinter(); +#endif + + llvm::PassRegistry* registry = llvm::PassRegistry::getPassRegistry(); + InitializePasses(registry); +} + +} // namespace + +namespace amdgpu { +StatusOr> CompileToHsaco( + llvm::Module* module, GpuVersion gpu_version, + const HloModuleConfig& hlo_module_config, const string& rocdl_dir_path) { + static std::once_flag backend_init_flag; + std::call_once(backend_init_flag, AMDGPUBackendInit, hlo_module_config); + + std::vector hsaco; + std::unique_ptr target_machine; + { + tensorflow::profiler::TraceMe activity( + [&] { return absl::StrCat("Compiling IR", module->getName().str()); }, + tensorflow::profiler::TraceMeLevel::kInfo); + XLA_SCOPED_LOGGING_TIMER("Compile module " + module->getName().str()); + + auto amdgpu_version = absl::get_if(&gpu_version); + if (!amdgpu_version) { + return xla::InternalError( + "Incompatible AMD GCN ISA version was specified."); + } + + llvm::Triple default_target_triple("amdgcn--amdhsa-amdgiz"); + // Construct LLVM TargetMachine for AMDGPU. + std::unique_ptr target_machine = + AMDGPUGetTargetMachine(default_target_triple, *amdgpu_version, + hlo_module_config); + + // Link with ROCm-Device-Libs, and optimize the LLVM module. + TF_RETURN_IF_ERROR(LinkAndOptimizeModule( + module, gpu_version, hlo_module_config, rocdl_dir_path, + AMDGPUTargetModuleLinker, default_target_triple, target_machine.get(), + kAMDGPUInlineThreshold)); + + // Lower optimized LLVM module to HSA code object. + TF_ASSIGN_OR_RETURN(hsaco, EmitModuleToHsaco(module, target_machine.get())); + } + return hsaco; +} + +} // namespace amdgpu + } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.h b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h similarity index 67% rename from tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.h rename to tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h index 9654175bfaf..526621de7a5 100644 --- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.h +++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h @@ -14,14 +14,15 @@ limitations under the License. ==============================================================================*/ // LLVM-based compiler backend. -#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_LLVM_GPU_BACKEND_NVPTX_BACKEND_LIB_H_ -#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_LLVM_GPU_BACKEND_NVPTX_BACKEND_LIB_H_ +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_LLVM_GPU_BACKEND_GPU_BACKEND_LIB_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_LLVM_GPU_BACKEND_GPU_BACKEND_LIB_H_ #include #include #include "absl/strings/string_view.h" #include "llvm/IR/Module.h" +#include "tensorflow/compiler/xla/service/gpu/gpu_types.h" #include "tensorflow/compiler/xla/service/hlo_module_config.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" @@ -29,6 +30,7 @@ limitations under the License. namespace xla { namespace gpu { +namespace nvptx { // Compiles the argument module and returns it. libdevice_dir_path is the parent // directory of the libdevice bitcode libraries. The contents of the module may // be changed. 
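// --- Editor's note: illustrative sketch, not part of this change. ---
// With this rename the header exposes two backends under xla::gpu:
// nvptx::CompileToPtx (PTX text for NVIDIA GPUs) and amdgpu::CompileToHsaco
// (an HSA code object for AMD GPUs). The code above treats GpuVersion as an
// absl::variant holding either a (major, minor) CUDA compute capability or an
// integer AMDGPU ISA version, so a caller might dispatch as sketched below.
// The helper name CompileForGpuSketch and the hard-coded bitcode directories
// are hypothetical; the sketch assumes it is compiled inside namespace
// xla::gpu with this header and xla/status_macros.h included.
StatusOr<std::string> CompileForGpuSketch(llvm::Module* module,
                                          GpuVersion gpu_version,
                                          const HloModuleConfig& config) {
  if (absl::holds_alternative<std::pair<int, int>>(gpu_version)) {
    // NVIDIA path: PTX is returned as text and compiled further downstream.
    return nvptx::CompileToPtx(
        module, gpu_version, config,
        /*libdevice_dir_path=*/"/usr/local/cuda/nvvm/libdevice");
  }
  // AMD path: CompileToHsaco returns binary bytes; re-encode them as a string
  // here only so this sketch has a single return type.
  TF_ASSIGN_OR_RETURN(std::vector<uint8> hsaco,
                      amdgpu::CompileToHsaco(module, gpu_version, config,
                                             /*rocdl_dir_path=*/"/opt/rocm/lib"));
  return std::string(hsaco.begin(), hsaco.end());
}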
@@ -36,12 +38,21 @@ namespace gpu { // The Compile.* interfaces each create their own llvm::LLVMContext objects for // thread safety, but note that LLVM's multithreaded support is very // preliminary; multithreaded use is not recommended at this time. -StatusOr CompileToPtx(llvm::Module* module, - std::pair compute_capability, +StatusOr CompileToPtx(llvm::Module* module, GpuVersion gpu_version, const HloModuleConfig& hlo_module_config, const string& libdevice_dir_path); +} // namespace nvptx + +namespace amdgpu { +// Compiles the argument module and returns it with LLVM AMDGPU backend. +// rocdl_dir_path is the parent directory of ROCm-Device-Libs bitcode libraries. +// The contents of the module may be changed. +StatusOr> CompileToHsaco( + llvm::Module* module, GpuVersion gpu_version, + const HloModuleConfig& hlo_module_config, const string& rocdl_dir_path); +} // namespace amdgpu } // namespace gpu } // namespace xla -#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_LLVM_GPU_BACKEND_NVPTX_BACKEND_LIB_H_ +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_LLVM_GPU_BACKEND_GPU_BACKEND_LIB_H_ diff --git a/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc b/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc index 536b11a00a9..9c86f7cd2a2 100644 --- a/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc +++ b/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc @@ -17,12 +17,7 @@ limitations under the License. #include -#include -#include -#include #include -#include -#include #include #include "absl/algorithm/container.h" @@ -55,17 +50,15 @@ int64 GpuMultiOutputFusion::GetProfit(HloInstruction* instr1, HloInstruction* instr2) { absl::flat_hash_set in_list; for (auto instr : instr1->operands()) { - if (!IsProfitableOperand(instr)) { - continue; + if (IsProfitableOperand(instr)) { + in_list.insert(instr); } - in_list.insert(instr); } int64 profit = 0; for (auto instr : instr2->operands()) { - if (!IsProfitableOperand(instr) || !in_list.contains(instr)) { - continue; + if (IsProfitableOperand(instr) && in_list.contains(instr)) { + profit += ShapeUtil::ByteSizeOf(instr->shape()); } - profit += ShapeUtil::ByteSizeOf(instr->shape()); } VLOG(2) << "Fusing instr1=" << instr1->name() << " instr2=" << instr2->name() << ", the profit is =" << profit; @@ -77,7 +70,6 @@ bool GpuMultiOutputFusion::LegalToFuse(HloInstruction* instr1, if (!MultiOutputFusion::LegalToFuse(instr1, instr2)) { return false; } - // If we're fusing fusions only do it if the fusion kind matches. Loop fusions // merge into bigger loop fusions and input (reduce) fusions become fusions // with multiple reduce outputs. We could fuse reduce and loop fusions @@ -91,7 +83,6 @@ bool GpuMultiOutputFusion::LegalToFuse(HloInstruction* instr1, instr1->IsLoopFusion())) { return false; } - // The emitter only supports in-place DUS for fusions with a single DUS at the // root. Don't sibling fuse DUS for now. // TODO(b/119178699): Multi-output fusing DUS can improve performance if we @@ -103,15 +94,15 @@ bool GpuMultiOutputFusion::LegalToFuse(HloInstruction* instr1, HloOpcode::kDynamicUpdateSlice)) { return false; } - // Do this check last, as it may be expensive. return !FusionWouldBeTooLarge(*instr1, *instr2); } namespace { + // We prefer multi-output fusions over other fusions over unfused ops, because // we want to preserve fusion opportunities if possible. 
-HloInstruction* GetPreferredFusionCandidate( +HloInstruction* SelectPreferredFusionCandidate( const std::vector candidates) { for (auto* candidate : candidates) { if (candidate->IsMultiOutputFusion()) { @@ -123,8 +114,54 @@ HloInstruction* GetPreferredFusionCandidate( return candidate; } } - return candidates.empty() ? nullptr : candidates[0]; + return candidates.empty() ? nullptr : candidates.front(); } + +std::vector GetProducerConsumerMultiOutputFusionCandidates( + const HloInstruction* producer, const HloReachabilityMap& reachability) { + std::vector fusion_candidates; + for (HloInstruction* consumer : producer->users()) { + VLOG(3) << "Looking at producer " << producer->name() + << " and its consumer " << consumer->name(); + if (!IsInputFusibleReduction(*consumer)) { + VLOG(3) << "Consumer " << consumer->name() + << " is not an input-fusible reduction.."; + continue; + } + if (!IsProducerConsumerMultiOutputFusible(*producer, *consumer)) { + VLOG(3) << producer->name() << " and " << consumer->name() + << " are not fusible."; + continue; + } + // Do not fuse a producer if the other operands of the fusion are + // reachable from the producer, this would create a cycle. + auto operand_reachable_from_producer = [&](const HloInstruction* operand) { + // If a get-tuple-elment instruction is not in the reachability + // map, it has been created by fusion in this pass. Simply move + // on to its operand, which is in the reachability map. + if (!reachability.IsPresent(operand) && + operand->opcode() == HloOpcode::kGetTupleElement) { + operand = operand->operand(0); + } + CHECK(reachability.IsPresent(operand) && reachability.IsPresent(producer)) + << "Reachability map is incomplete. This should never " + "happen."; + return producer != operand && reachability.IsReachable(producer, operand); + }; + if (absl::c_any_of(consumer->operands(), operand_reachable_from_producer)) { + VLOG(3) << producer->name() << " would introduce a cycle when fused."; + continue; + } + if (FusionWouldBeTooLarge(*producer, *consumer)) { + VLOG(3) << producer->name() << " and " << consumer->name() + << " would be too large of a fusion."; + continue; + } + fusion_candidates.push_back(consumer); + } + return fusion_candidates; +} + } // namespace bool GpuMultiOutputFusion::DoProducerConsumerMultiOutputFusion() { @@ -144,86 +181,43 @@ bool GpuMultiOutputFusion::DoProducerConsumerMultiOutputFusion() { VLOG(3) << producer->name() << " is a constant."; continue; } - std::vector fusion_candidates; - for (HloInstruction* consumer : producer->users()) { - VLOG(3) << "Looking at producer " << producer->name() - << " and its consumer " << consumer->name(); - // TODO(b/136623068): Use IsFusibleAsMultiOutputFusionRoot(...) to lift - // the restriction to input-fusible reductions. - if (!IsInputFusibleReduction(*consumer)) { - VLOG(3) << "Consumer " << consumer->name() - << " is not an input-fusible reduction."; - continue; - } - if (!IsProducerConsumerMultiOutputFusible(*producer, *consumer)) { - VLOG(3) << producer->name() << " and " << consumer->name() - << " are not fusible."; - continue; - } - // Do not fuse a producer if the other operands of the fusion are - // reachable from the producer, this would create a cycle. - if (absl::c_any_of( - consumer->operands(), [&](const HloInstruction* operand) { - // If a get-tuple-elment instruction is not in the reachability - // map, it has been created by fusion in this pass. Simply move - // on to its operand, which is in the reachability map. 
- if (!reachability()->IsPresent(operand) && - operand->opcode() == HloOpcode::kGetTupleElement) { - operand = operand->operand(0); - } - CHECK(reachability()->IsPresent(operand) && - reachability()->IsPresent(producer)) - << "Reachability map is incomplete. This should never " - "happen."; - return producer != operand && - reachability()->IsReachable(producer, operand); - })) { - VLOG(3) << producer->name() << " would introduce a cycle when fused."; - continue; - } - if (FusionWouldBeTooLarge(*producer, *consumer)) { - VLOG(3) << producer->name() << " and " << consumer->name() - << " would be too large of a fusion."; - continue; - } - fusion_candidates.push_back(consumer); + const auto candidates = GetProducerConsumerMultiOutputFusionCandidates( + producer, *reachability()); + auto* consumer_for_fusion = SelectPreferredFusionCandidate(candidates); + if (consumer_for_fusion == nullptr) { + continue; } - auto* consumer_for_fusion = GetPreferredFusionCandidate(fusion_candidates); - if (consumer_for_fusion != nullptr) { - changed = true; - if (consumer_for_fusion->opcode() == HloOpcode::kFusion) { - VLOG(2) << "Fuse producer " << producer->name() << " into its consumer " - << consumer_for_fusion->name(); - if (producer->opcode() == HloOpcode::kFusion) { - consumer_for_fusion->MergeFusionInstructionIntoMultiOutput(producer); - } else { - consumer_for_fusion->FuseInstructionIntoMultiOutput(producer); - CHECK_EQ(0, producer->user_count()); - TF_CHECK_OK(computation()->RemoveInstruction(producer)); - } + changed = true; + if (consumer_for_fusion->opcode() == HloOpcode::kFusion) { + VLOG(2) << "Fuse producer " << producer->name() << " into its consumer " + << consumer_for_fusion->name(); + if (producer->opcode() == HloOpcode::kFusion) { + consumer_for_fusion->MergeFusionInstructionIntoMultiOutput(producer); } else { - HloInstruction* input_fusion = - computation()->AddInstruction(HloInstruction::CreateFusion( - consumer_for_fusion->shape(), - ChooseFusionKind(*producer, *consumer_for_fusion), - consumer_for_fusion)); - VLOG(2) << "Fuse producer " << producer->name() << " and its consumer " - << consumer_for_fusion->name() << " into " - << input_fusion->name(); - reachability()->Replace(consumer_for_fusion, input_fusion); - TF_CHECK_OK(computation()->ReplaceInstruction(consumer_for_fusion, - input_fusion)); - if (producer->opcode() == HloOpcode::kFusion) { - input_fusion->MergeFusionInstructionIntoMultiOutput(producer); - } else { - input_fusion->FuseInstructionIntoMultiOutput(producer); - CHECK_EQ(0, producer->user_count()); - TF_CHECK_OK(computation()->RemoveInstruction(producer)); - } + consumer_for_fusion->FuseInstructionIntoMultiOutput(producer); + CHECK_EQ(0, producer->user_count()); + TF_CHECK_OK(computation()->RemoveInstruction(producer)); } + continue; + } + HloInstruction* input_fusion = + computation()->AddInstruction(HloInstruction::CreateFusion( + consumer_for_fusion->shape(), + ChooseFusionKind(*producer, *consumer_for_fusion), + consumer_for_fusion)); + VLOG(2) << "Fuse producer " << producer->name() << " and its consumer " + << consumer_for_fusion->name() << " into " << input_fusion->name(); + reachability()->Replace(consumer_for_fusion, input_fusion); + TF_CHECK_OK( + computation()->ReplaceInstruction(consumer_for_fusion, input_fusion)); + if (producer->opcode() == HloOpcode::kFusion) { + input_fusion->MergeFusionInstructionIntoMultiOutput(producer); + } else { + input_fusion->FuseInstructionIntoMultiOutput(producer); + CHECK_EQ(0, producer->user_count()); + 
TF_CHECK_OK(computation()->RemoveInstruction(producer)); } } - return changed; } diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc old mode 100644 new mode 100755 index 20b3d64c417..2f2a2efcecb --- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc @@ -17,100 +17,35 @@ limitations under the License. #include -#include -#include -#include // NOLINT(build/c++11): only using std::call_once, not mutex. -#include +#include -#include "absl/memory/memory.h" -#include "absl/strings/numbers.h" -#include "absl/strings/str_cat.h" -#include "llvm/IR/DiagnosticInfo.h" -#include "llvm/IR/DiagnosticPrinter.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/Verifier.h" -#include "tensorflow/compiler/xla/protobuf_util.h" #include "tensorflow/compiler/xla/service/algebraic_simplifier.h" -#include "tensorflow/compiler/xla/service/batchnorm_expander.h" -#include "tensorflow/compiler/xla/service/buffer_assignment.h" -#include "tensorflow/compiler/xla/service/call_inliner.h" -#include "tensorflow/compiler/xla/service/conditional_simplifier.h" -#include "tensorflow/compiler/xla/service/convolution_group_converter.h" -#include "tensorflow/compiler/xla/service/dot_decomposer.h" #include "tensorflow/compiler/xla/service/dump.h" -#include "tensorflow/compiler/xla/service/dynamic_index_splitter.h" -#include "tensorflow/compiler/xla/service/flatten_call_graph.h" -#include "tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_rewriter.h" #include "tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h" #include "tensorflow/compiler/xla/service/gpu/cudnn_conv_pad_for_tensor_cores.h" #include "tensorflow/compiler/xla/service/gpu/cudnn_conv_padding_legalization.h" #include "tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.h" #include "tensorflow/compiler/xla/service/gpu/cudnn_fused_conv_rewriter.h" #include "tensorflow/compiler/xla/service/gpu/cusolver_rewriter.h" -#include "tensorflow/compiler/xla/service/gpu/fusion_merger.h" #include "tensorflow/compiler/xla/service/gpu/gemm_algorithm_picker.h" #include "tensorflow/compiler/xla/service/gpu/gemm_rewriter.h" -#include "tensorflow/compiler/xla/service/gpu/gpu_constants.h" -#include "tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.h" -#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h" -#include "tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.h" -#include "tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker.h" #include "tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.h" -#include "tensorflow/compiler/xla/service/gpu/gpu_sanitize_constant_names.h" -#include "tensorflow/compiler/xla/service/gpu/gpu_scatter_expander.h" -#include "tensorflow/compiler/xla/service/gpu/instruction_fusion.h" -#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" -#include "tensorflow/compiler/xla/service/gpu/ir_emitter_context.h" -#include "tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h" -#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.h" -#include "tensorflow/compiler/xla/service/gpu/multi_output_fusion.h" -#include "tensorflow/compiler/xla/service/gpu/partition_assignment.h" -#include "tensorflow/compiler/xla/service/gpu/stream_assignment.h" +#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h" #include "tensorflow/compiler/xla/service/gpu/stream_executor_util.h" #include 
"tensorflow/compiler/xla/service/gpu/target_constants.h" -#include "tensorflow/compiler/xla/service/gpu/thunk_schedule.h" -#include "tensorflow/compiler/xla/service/gpu/variadic_op_splitter.h" -#include "tensorflow/compiler/xla/service/hlo.pb.h" -#include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_constant_folding.h" #include "tensorflow/compiler/xla/service/hlo_cse.h" -#include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h" -#include "tensorflow/compiler/xla/service/hlo_dce.h" -#include "tensorflow/compiler/xla/service/hlo_element_type_converter.h" -#include "tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.h" -#include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_pass_fix.h" #include "tensorflow/compiler/xla/service/hlo_pass_pipeline.h" -#include "tensorflow/compiler/xla/service/hlo_proto_util.h" -#include "tensorflow/compiler/xla/service/hlo_subcomputation_unification.h" #include "tensorflow/compiler/xla/service/hlo_verifier.h" #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" -#include "tensorflow/compiler/xla/service/mem_wasted_on_passthrough_params.h" -#include "tensorflow/compiler/xla/service/reduce_precision_insertion.h" -#include "tensorflow/compiler/xla/service/reshape_mover.h" -#include "tensorflow/compiler/xla/service/rng_expander.h" -#include "tensorflow/compiler/xla/service/slice_sinker.h" -#include "tensorflow/compiler/xla/service/sort_simplifier.h" -#include "tensorflow/compiler/xla/service/stable_sort_expander.h" -#include "tensorflow/compiler/xla/service/transpose_folding.h" #include "tensorflow/compiler/xla/service/tuple_simplifier.h" -#include "tensorflow/compiler/xla/service/while_loop_constant_sinking.h" -#include "tensorflow/compiler/xla/service/while_loop_simplifier.h" -#include "tensorflow/compiler/xla/service/while_loop_trip_count_annotator.h" -#include "tensorflow/compiler/xla/service/zero_sized_hlo_elimination.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/gtl/cleanup.h" #include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/platform/cuda_libdevice_path.h" -#include "tensorflow/core/platform/env.h" -#include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/platform/regexp.h" -#include "tensorflow/core/platform/stream_executor_no_cuda.h" -#include "tensorflow/core/platform/subprocess.h" #include "tensorflow/core/platform/tracing.h" #include "tensorflow/core/profiler/lib/traceme.h" #include "tensorflow/stream_executor/cuda/cuda_diagnostics.h" @@ -165,6 +100,109 @@ string GetLibdeviceDir(const HloModuleConfig& hlo_module_config) { return "."; } +} // namespace + +Status NVPTXCompiler::OptimizeHloConvolutionCanonicalization( + HloModule* hlo_module, se::StreamExecutor* stream_exec, + se::DeviceMemoryAllocator* device_allocator) { + // Convert convolutions into CustomCalls to cudnn, then canonicalize them + // (CudnnConvPaddingLegalization). Also expand cuSolver calls. 
+ HloPassPipeline pipeline("conv_canonicalization"); + pipeline.AddInvariantChecker(/*layout_sensitive=*/false, + /*allow_mixed_precision=*/false); + pipeline.AddPass(); + pipeline.AddPass(); + pipeline.AddPass(); + pipeline.AddPass(); + if (IsVoltaOrLater(*stream_exec)) { + pipeline.AddPass(); + // CudnnConvPadForTensorCores leaves behind unnecessary + // tuple/get-tuple-element pairs that TupleSimplifier fixes. + pipeline.AddPass(); + } + + // tf2xla bridge, DepthwiseConvolutionConverter and CudnnConvRewriter + // introduces reshapes and transposes that can be eliminated using + // AlgebraicSimplifier + { + auto& pass = pipeline.AddPass>( + "algebraic_simplification_post_conv_rewriter"); + pass.AddInvariantChecker(/*layout_sensitive=*/false, + /*allow_mixed_precision=*/false); + + AlgebraicSimplifierOptions options; + pass.AddPass(options); + } + + // CudnnConvRewriter, CudnnConvPaddingLegalization and + // CudnnConvPadForTensorCores may add instructions which can be simplified + // by constant folding. + pipeline.AddPass(); + TF_RETURN_IF_ERROR(pipeline.Run(hlo_module).status()); + + return Status::OK(); +} + +Status NVPTXCompiler::OptimizeHloPostLayoutAssignment( + HloModule* hlo_module, se::StreamExecutor* stream_exec, + se::DeviceMemoryAllocator* device_allocator) { + HloPassPipeline pipeline("post-layout_assignment"); + /* TODO(b/117531509): Use LayoutAssignment::InstructionCanChangeLayout after + * fixing the ticket. */ + pipeline.AddInvariantChecker( + /*layout_sensitive=*/true, + /*allow_mixed_precision=*/false, + LayoutAssignment::InstructionCanChangeLayout); + + // The LayoutAssignment pass may leave behind kCopy instructions which are + // duplicate or NOPs, so remove them with algebraic simplification and CSE. + AlgebraicSimplifierOptions options; + options.set_is_layout_sensitive(true); + pipeline.AddPass>(options); + + // Rewrite GEMMs into custom calls. + pipeline.AddPass(); + + // Choose the fastest algorithm for each conv. + // + // We pick the algorithm before fusion so we can generate better HLO. After + // CudnnConvRewriter, our convolutions are CustomCalls which return a + // tuple (conv_result, scratch_memory), and the each conv uses 0 bytes of + // scratch: + // + // customcall = (f32[...], f32[0]) + // return gte(customcall, 0) + // + // The algorithm picker then chooses the best algorithm, and potentially + // increases the scratch space. It replaces customcall with new_tuple, + // giving us the following: + // + // new_customcall = (f32[...], f32[N]) + // new_tuple = tuple(gte(new_customcall, 0), constant f32[0]) + // return gte(new_tuple, 0) + // + // The new tuple and gte instructions then be simplified away, because + // nobody is expected to use the scratch value. + // + // However, if we were to run CudnnConvAlgorithmPicker after fusion + // the gte(customcall, 0) would probably already be into a fusion node. We + // can't simplify across HloComputation boundaries, so in this case we + // wouldn't be able to simplify away the new_tuple bits. + pipeline.AddPass(stream_exec, device_allocator); + + // Find the fastest algorithm for GEMMs. + pipeline.AddPass(stream_exec, device_allocator); + + // Clean up new_tuple described above. 
+ pipeline.AddPass(); + + pipeline.AddPass(/*is_layout_sensitive=*/true); + TF_RETURN_IF_ERROR(pipeline.Run(hlo_module).status()); + + return Status::OK(); +} + +namespace { absl::optional CanShareBufferHint(const HloInstruction* user, const HloInstruction* operand, const ShapeIndex& user_index) { @@ -222,387 +260,71 @@ void WarnIfBadDriverJITVersion() { }); } +// Try to load ptx from files defined in the FLAGS. If successful, return true. +bool MaybeLoadPtxFromFile(const HloModule* module, std::string* ptx) { + // If the xla_gpu_ptx_file options is set, be explicit when a file is used + // and warn when a file is not used to ease catching typo in filename. + std::string prefix = xla::FilenameFor(*module, *ptx); + std::string matched_filename; + for (const string filename : + module->config().debug_options().xla_gpu_ptx_file()) { + // To ease comparing many PTX versions, accept different suffixes then + // the original filename. + if (absl::StartsWith(filename, prefix)) { + matched_filename = filename; + VLOG(0) << "RunBackend() - Will load PTX from file: " << filename; + break; + } + } + if (module->config().debug_options().xla_gpu_ptx_file().size() > 0 && + matched_filename.empty()) { + VLOG(0) << "RunBackend() - For module with prefix '" << prefix + << "', we did not found a PTX file to load."; + } + + if (!matched_filename.empty()) { + std::ifstream ifs(matched_filename, std::ifstream::in); + *ptx = std::string(std::istreambuf_iterator(ifs), + std::istreambuf_iterator()); + CHECK(!ptx->empty()) << "Empty or non existing PTX file: " + << matched_filename; + return true; + } + return false; +} + } // namespace -// Runs optimization passes on the given HLO module. -Status impl::OptimizeHloModule(HloModule* hlo_module, - se::StreamExecutor* stream_exec, - se::DeviceMemoryAllocator* device_allocator) { - { - HloPassPipeline pipeline("optimization"); - pipeline.AddInvariantChecker(/*layout_sensitive=*/false, - /*allow_mixed_precision=*/false); - - // Expand random number generation. - pipeline.AddPass(); - - // Remove zero-sized HLO from the input so that other passes don't have to - // handle it. - pipeline.AddPass(); - - pipeline.AddPass(); - - pipeline.AddPass(); - pipeline.AddPass(); - ReducePrecisionInsertion::AddPasses( - &pipeline, hlo_module->config().debug_options(), - ReducePrecisionInsertion::PassTiming::BEFORE_OPTIMIZATION); - - // TODO(b/64094172): make Call work on GPU instead of inlining. - pipeline.AddPass(); - auto cost_model = [](HloInstruction* conv) { - // We need a cost model for GPUs. Currently, do nothing. - return false; - }; - pipeline.AddPass(); - pipeline.AddPass( - cost_model, - /*convert_batch_groups_only=*/true); - // Expand the sort op to support stable sorting if required. - pipeline.AddPass(); - // Convert BF16 operations to F32 operations so that the GPU backend can - // support BF16 operations without directly implementing a BF16 lowering for - // most ops. - pipeline.AddPass(BF16, F32); - - { - auto& pass = - pipeline.AddPass>("simplification"); - pass.AddInvariantChecker(/*layout_sensitive=*/false, - /*allow_mixed_precision=*/false); - - // If cudnn batchnorms are enabled, rewrite batchnorm HLOs to cudnn calls - // where possible. Not every batchnorm op can be implemented as a call to - // cudnn, so decompose any remaining batchnorm ops into a soup of HLOs. 
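For the MaybeLoadPtxFromFile helper introduced above, here is a simplified, standard-library-only sketch of the same prefix-match-and-read logic. The prefix argument stands in for xla::FilenameFor(module, ptx) and candidate_files for the values of the xla_gpu_ptx_file debug option; the real helper uses absl::StartsWith, VLOG, and CHECK-fails on an empty file, none of which are reproduced here.

#include <fstream>
#include <iostream>
#include <iterator>
#include <string>
#include <vector>

// Returns true and fills `ptx` if some candidate filename starts with the
// module's filename prefix; different suffixes are accepted so several PTX
// variants of the same module can be compared easily.
bool MaybeLoadPtxFromFile(const std::string& prefix,
                          const std::vector<std::string>& candidate_files,
                          std::string* ptx) {
  std::string matched_filename;
  for (const std::string& filename : candidate_files) {
    if (filename.compare(0, prefix.size(), prefix) == 0) {
      matched_filename = filename;
      std::cout << "Will load PTX from file: " << filename << "\n";
      break;
    }
  }
  if (!candidate_files.empty() && matched_filename.empty()) {
    std::cout << "No PTX file found for prefix '" << prefix << "'\n";
  }
  if (matched_filename.empty()) return false;

  std::ifstream ifs(matched_filename);
  *ptx = std::string(std::istreambuf_iterator<char>(ifs),
                     std::istreambuf_iterator<char>());
  return !ptx->empty();
}

int main() {
  std::string ptx;
  if (MaybeLoadPtxFromFile("module_0001", {"module_0001.v2.ptx"}, &ptx)) {
    std::cout << "Loaded " << ptx.size() << " bytes of PTX\n";
  }
}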
- if (hlo_module->config().debug_options().xla_gpu_use_cudnn_batchnorm()) { - pass.AddPass(); - } - pass.AddPass( - /*rewrite_training_op=*/true, - /*rewrite_inference_op=*/true, - /*rewrite_grad_op=*/true); - - pipeline.AddPass(); - - // BatchNormExpander can create zero-sized ops, so zero-sized HLO - // elimination has to come after that pass. - pipeline.AddPass(); - - AlgebraicSimplifierOptions options; - pass.AddPass(options); - pass.AddPass(); - pass.AddPass(); - pass.AddPass(); - pass.AddPass(); - - // TODO(b/134075051): Re-enable after b/134075051 is fixed. - // pass.AddPass(); - - pass.AddPass(); - pass.AddPass(); - pass.AddPass(); - pass.AddPass(); - } - - pipeline.AddPass( - [](const HloInstruction& dot, - const TransposeFolding::OperandIndices& candidate_operands) { - return IsMatrixMultiplication(dot) - ? candidate_operands - : TransposeFolding::OperandIndices{}; - }, - TransposeFolding::NeverFoldTranspose); - pipeline.AddPass(/*is_layout_sensitive=*/false); - pipeline.AddPass(); - - // Run WhileLoopTripCountAnnotator at the end of the simplification - // pipeline, before layout assignment and fusion. This pass does some - // pattern-matching on while bodies/conditions, and this is where the HLO is - // "nicest". - // - // It's important that we don't make semantic changes (e.g. unrolling) to - // any `while` loops after this point, because otherwise the trip-count - // annotations added by this pass may not be correct after the - // modifications. - pipeline.AddPass(); - TF_RETURN_IF_ERROR(pipeline.Run(hlo_module).status()); - } - - { - // Convert convolutions into CustomCalls to cudnn, then canonicalize them - // (CudnnConvPaddingLegalization). Also expand cuSolver calls. - HloPassPipeline pipeline("conv_canonicalization"); - pipeline.AddInvariantChecker(/*layout_sensitive=*/false, - /*allow_mixed_precision=*/false); - pipeline.AddPass(); - pipeline.AddPass(); - pipeline.AddPass(); - pipeline.AddPass(); - if (IsVoltaOrLater(*stream_exec)) { - pipeline.AddPass(); - // CudnnConvPadForTensorCores leaves behind unnecessary - // tuple/get-tuple-element pairs that TupleSimplifier fixes. - pipeline.AddPass(); - } - // CudnnConvRewriter, CudnnConvPaddingLegalization and - // CudnnConvPadForTensorCores may add instructions which can be simplified - // by constant folding. - pipeline.AddPass(); - TF_RETURN_IF_ERROR(pipeline.Run(hlo_module).status()); - } - - { - // Run layout assignment in a separate pipeline from - // "post-layout-assignment" because we want everything after layout - // assignment to have a layout-sensitive invariant-checker, but - // HloPassPipeline also runs its invariant checker before any passes are - // run, meaning, the pipeline that contains layout assignment cannot contain - // a layout-sensitive verifier! - HloPassPipeline pipeline("layout assignment"); - pipeline.AddPass( - hlo_module->mutable_entry_computation_layout(), - LayoutAssignment::InstructionCanChangeLayout, stream_exec); - TF_RETURN_IF_ERROR(pipeline.Run(hlo_module).status()); - } - - { - HloPassPipeline pipeline("post-layout_assignment"); - /* TODO(b/117531509): Use LayoutAssignment::InstructionCanChangeLayout after - * fixing the ticket. */ - pipeline.AddInvariantChecker( - /*layout_sensitive=*/true, - /*allow_mixed_precision=*/false, - LayoutAssignment::InstructionCanChangeLayout); - - // The LayoutAssignment pass may leave behind kCopy instructions which are - // duplicate or NOPs, so remove them with algebraic simplification and CSE. 
- AlgebraicSimplifierOptions options; - options.set_is_layout_sensitive(true); - pipeline.AddPass>(options); - - // Rewrite GEMMs into custom calls. - pipeline.AddPass(); - - // Choose the fastest algorithm for each conv. - // - // We pick the algorithm before fusion so we can generate better HLO. After - // CudnnConvRewriter, our convolutions are CustomCalls which return a - // tuple (conv_result, scratch_memory), and the each conv uses 0 bytes of - // scratch: - // - // customcall = (f32[...], f32[0]) - // return gte(customcall, 0) - // - // The algorithm picker then chooses the best algorithm, and potentially - // increases the scratch space. It replaces customcall with new_tuple, - // giving us the following: - // - // new_customcall = (f32[...], f32[N]) - // new_tuple = tuple(gte(new_customcall, 0), constant f32[0]) - // return gte(new_tuple, 0) - // - // The new tuple and gte instructions then be simplified away, because - // nobody is expected to use the scratch value. - // - // However, if we were to run CudnnConvAlgorithmPicker after fusion - // the gte(customcall, 0) would probably already be into a fusion node. We - // can't simplify across HloComputation boundaries, so in this case we - // wouldn't be able to simplify away the new_tuple bits. - pipeline.AddPass(stream_exec, device_allocator); - - // Find the fastest algorithm for GEMMs. - pipeline.AddPass(stream_exec, device_allocator); - - // Clean up new_tuple described above. - pipeline.AddPass(); - - pipeline.AddPass(/*is_layout_sensitive=*/true); - TF_RETURN_IF_ERROR(pipeline.Run(hlo_module).status()); - } - - { - HloPassFix fusion("fusion"); - // We try to split variadic ops with many parameters into several such ops - // to avoid exceeding the parameter space. - fusion.AddPass(); - /* TODO(b/117531509): Use LayoutAssignment::InstructionCanChangeLayout after - * fixing the ticket. */ - fusion.AddInvariantChecker( - /*layout_sensitive=*/true, - /*allow_mixed_precision=*/false, - LayoutAssignment::InstructionCanChangeLayout); - fusion.AddPass(/*may_duplicate=*/false); - fusion.AddPass(/*may_duplicate=*/true); - fusion.AddPass(); - fusion.AddPass(); - fusion.AddPass(/*is_layout_sensitive=*/true, - /*only_fusion_computations=*/true); - fusion.AddPass(); - TF_RETURN_IF_ERROR(fusion.Run(hlo_module).status()); - - HloPassPipeline reduce_pipeline("reduce-precision"); - /* TODO(b/117531509): Use LayoutAssignment::InstructionCanChangeLayout after - * fixing the ticket. */ - reduce_pipeline.AddInvariantChecker( - /*is_layout_sensitive=*/true, /*allow_mixed_precision=*/false, - LayoutAssignment::InstructionCanChangeLayout); - ReducePrecisionInsertion::AddPasses( - &reduce_pipeline, hlo_module->config().debug_options(), - ReducePrecisionInsertion::PassTiming::AFTER_FUSION); - StatusOr reduce_result = reduce_pipeline.Run(hlo_module); - TF_RETURN_IF_ERROR(reduce_result.status()); - - if (reduce_result.ValueOrDie()) { - // Do another fusion pass, with the expectation that we may be able to - // fuse the new ReducePrecision operations. - TF_RETURN_IF_ERROR(fusion.Run(hlo_module).status()); - } - } - - return Status::OK(); -} - -// Modifies the given HLO module so that it will be accepted by IrEmitter. -// Unlike optimization passes, the passes are necessary for correctness. -Status impl::PrepareHloModuleForIrEmitting(HloModule* hlo_module) { - // In some cases, we have to place the result of an instruction in a temporary - // buffer. 
For instance, the buffer that holds an external parameter is - // assumed immutable at this point, and should not be reused for output - // (b/27180329). Therefore, in that case, we set the output to be a copy of - // the parameter. - HloPassPipeline pipeline("GPU-ir-emit-prepare"); - /* TODO(b/117531509): Use LayoutAssignment::InstructionCanChangeLayout after - * fixing the ticket. */ - pipeline.AddInvariantChecker( - /*layout_sensitive=*/true, - /*allow_mixed_precision=*/false, - LayoutAssignment::InstructionCanChangeLayout); - - // Copy insertion should be performed immediately before IR emission to avoid - // inserting unnecessary copies (later pass adds an instruction which - // materializes the value) or missing a necessary copy (later pass removes an - // instruction which materializes a value). DCE must be run immediately before - // (and sometime after) copy insertion, to avoid dead code from interfering - // with the rewrites. - pipeline.AddPass(); - pipeline.AddPass(); - // The following pass LOGs memory waste. Add it when VLOGing is enabled only. - if (VLOG_IS_ON(2)) { - pipeline.AddPass(); - } - pipeline.AddPass(&CanShareBufferHint); - pipeline.AddPass(); - return pipeline.Run(hlo_module).status(); -} - NVPTXCompiler::NVPTXCompiler() - : pointer_size_(llvm::DataLayout(nvptx::kDataLayout) - .getPointerSize(0 /* default address space */)) {} + : GpuCompiler(stream_executor::cuda::kCudaPlatformId, nvptx::kTargetTriple, + nvptx::kDataLayout) {} -StatusOr> NVPTXCompiler::RunHloPasses( - std::unique_ptr module, se::StreamExecutor* stream_exec, - se::DeviceMemoryAllocator* device_allocator) { - // We dump the post-optimization HLO in RunBackend so no need to dump it here. - XLA_SCOPED_LOGGING_TIMER("NVPTXCompiler::RunHloPasses"); - tensorflow::profiler::TraceMe activity( - [&] { return absl::StrCat("HLO Transforms:", module->name()); }, - tensorflow::profiler::TraceMeLevel::kInfo); - TF_RETURN_IF_ERROR( - impl::OptimizeHloModule(module.get(), stream_exec, device_allocator)); - - TF_RETURN_IF_ERROR(impl::PrepareHloModuleForIrEmitting(module.get())); - - return std::move(module); +HloDataflowAnalysis::CanShareBuffer NVPTXCompiler::GetCanShareBuffer() { + return &CanShareBufferHint; } -StatusOr> NVPTXCompiler::RunBackend( - std::unique_ptr module, se::StreamExecutor* stream_exec, - se::DeviceMemoryAllocator* device_allocator) { - XLA_SCOPED_LOGGING_TIMER("NVPTXCompiler::RunBackend"); - - TF_RET_CHECK(stream_exec != nullptr); - - llvm::LLVMContext llvm_context; - std::string buffer; - llvm::raw_string_ostream error(buffer); - llvm::DiagnosticPrinterRawOStream printer(error); - auto DiagnosticHandler = [](const llvm::DiagnosticInfo& diag_info, - void* Context) { - auto printer = static_cast(Context); - diag_info.print(*printer); - }; - llvm_context.setDiagnosticHandlerCallBack(DiagnosticHandler, &printer); - - llvm::Module llvm_module(module->name().c_str(), llvm_context); - // Set the target triple and the data layout. - llvm_module.setTargetTriple(nvptx::kTargetTriple); - llvm_module.setDataLayout(nvptx::kDataLayout); - - // Determine the HLO schedule, which is an ordering of HLO instructions. This - // is used by buffer assignment to enable buffer reuse, and the same ordering - // must also be used to determine the thunk launch schedule. - std::unique_ptr stream_assignment = AssignStreams(*module); - TF_ASSIGN_OR_RETURN( - std::unique_ptr hlo_schedule, - GpuHloSchedule::Build(*module, *stream_assignment, pointer_size_)); - - // Run buffer analysis on the HLO graph. 
This analysis figures out which - // temporary buffers are required to run the computation. - TF_ASSIGN_OR_RETURN( - std::unique_ptr buffer_assignment, - BufferAssigner::Run( - module.get(), hlo_schedule->ConsumeHloOrdering(), - BufferSizeBytesFunction(), - /*color_alignment=*/ - [](LogicalBuffer::Color) { return kXlaAllocatedBufferAlignBytes; }, - /*allocate_buffers_for_constants=*/true, - /*colorer=*/BufferAssigner::DefaultColorer(), - /*must_not_live_out=*/{}, &CanShareBufferHint)); - DumpHloModuleIfEnabled(*module, *buffer_assignment, "after_optimizations"); - - IrEmitterContext ir_emitter_context( - module.get(), buffer_assignment.get(), stream_exec->platform(), - &stream_exec->GetDeviceDescription(), &llvm_module); - - HloComputation* entry_computation = module->entry_computation(); - IrEmitterUnnested ir_emitter(module->config(), entry_computation, - &ir_emitter_context); - - TF_RETURN_IF_ERROR(ir_emitter.EmitConstantGlobals()); - - { - XLA_SCOPED_LOGGING_TIMER("NVPTXCompiler::RunBackend - IR emission"); - TF_RETURN_IF_ERROR(entry_computation->Accept(&ir_emitter)); +GpuVersion NVPTXCompiler::GetGpuVersion(se::StreamExecutor* stream_exec) { + int cc_major, cc_minor; + if (!stream_exec->GetDeviceDescription().cuda_compute_capability(&cc_major, + &cc_minor)) { + LOG(WARNING) + << "Couldn't get compute capability for device; assuming sm_20."; + cc_major = 2; + cc_minor = 0; } - if (user_pre_optimization_hook_) { - user_pre_optimization_hook_(llvm_module); - } - string ir_module_string_before_opt; - const bool embed_ir_in_executable = - module->config().debug_options().xla_embed_ir_in_executable(); - if (embed_ir_in_executable) { - ir_module_string_before_opt = llvm_ir::DumpModuleToString(llvm_module); - } + return std::make_pair(cc_major, cc_minor); +} - llvm_ir::DumpIrIfEnabled(*module, llvm_module, /*optimized=*/false); +StatusOr>> +NVPTXCompiler::CompileTargetBinary(const HloModule* module, + llvm::Module* llvm_module, + GpuVersion gpu_version, + se::StreamExecutor* stream_exec) { + std::pair compute_capability = + absl::get>(gpu_version); - { - XLA_SCOPED_LOGGING_TIMER( - "NVPTXCompiler::RunBackend - Running LLVM verifier"); - - std::string err; - llvm::raw_string_ostream err_stream(err); - - // verifyModule() returns true if the module is broken. - TF_RET_CHECK(!llvm::verifyModule(llvm_module, &err_stream)) - << "Invalid LLVM IR before optimizations:\n" - << err_stream.str() - << "\nThis probably indicates a bug in the HLO -> LLVM IR lowering. " - "Rerun with --xla_dump_to to get the IR. 
"; - } - - string libdevice_dir; + std::string libdevice_dir; { tensorflow::mutex_lock lock(mutex_); @@ -616,70 +338,31 @@ StatusOr> NVPTXCompiler::RunBackend( } VLOG(2) << "Libdevice dir = " << libdevice_dir << "\n"; - int cc_major, cc_minor; - if (!stream_exec->GetDeviceDescription().cuda_compute_capability(&cc_major, - &cc_minor)) { - LOG(WARNING) - << "Couldn't get compute capability for device; assuming sm_20."; - cc_major = 2; - cc_minor = 0; - } - string ptx; - { - XLA_SCOPED_LOGGING_TIMER("NVPTXCompiler::RunBackend - CompileToPtx"); - TF_ASSIGN_OR_RETURN(ptx, CompileToPtx(&llvm_module, {cc_major, cc_minor}, - module->config(), libdevice_dir)); + if (!MaybeLoadPtxFromFile(module, &ptx)) { + XLA_SCOPED_LOGGING_TIMER( + "NVPTXCompiler::CompileTargetBinary - CompileToPtx"); + TF_ASSIGN_OR_RETURN( + ptx, nvptx::CompileToPtx(llvm_module, gpu_version, module->config(), + libdevice_dir)); } - llvm_ir::DumpIrIfEnabled(*module, llvm_module, /*optimized=*/true); + llvm_ir::DumpIrIfEnabled(*module, *llvm_module, /*optimized=*/true); if (user_post_optimization_hook_) { - user_post_optimization_hook_(llvm_module); + user_post_optimization_hook_(*llvm_module); } // Write PTX to IR dump directory, if IR dumping was requested. if (DumpingEnabledForHloModule(*module)) { DumpToFileInDirOrStdout(*module, "ptx", ptx); } - const std::vector cubin = CompilePtxOrGetCachedResult( - stream_exec, ptx, cc_major, cc_minor, module->config()); + std::vector cubin = + CompilePtxOrGetCachedResult(stream_exec, ptx, compute_capability.first, + compute_capability.second, module->config()); - auto thunk_schedule = absl::make_unique( - ir_emitter.ConsumeThunkSequence(), std::move(stream_assignment), - hlo_schedule->ThunkLaunchOrder()); - if (DumpingEnabledForHloModule(*module)) { - DumpToFileInDirOrStdout(*module, "thunk_schedule", - thunk_schedule->ToString()); - } - - std::unique_ptr profile_index_map; - std::unique_ptr profile_printer; - - if (module->config().hlo_profiling_enabled() || VLOG_IS_ON(1)) { - HloCostAnalysis cost_analysis(ShapeSizeBytesFunction()); - cost_analysis.set_bytes_per_second( - stream_exec->GetDeviceDescription().memory_bandwidth()); - TF_RETURN_IF_ERROR(module->entry_computation()->Accept(&cost_analysis)); - VLOG(1) << "HLO memory read+written: " - << tensorflow::strings::HumanReadableNumBytes( - cost_analysis.bytes_accessed()); - if (module->config().hlo_profiling_enabled()) { - profile_index_map = absl::make_unique(*module); - profile_printer = CreateHloProfilePrinterData( - *profile_index_map, cost_analysis, entry_computation->name()); - } - } - - auto* gpu_executable = new GpuExecutable( - ptx, cubin, std::make_pair(cc_major, cc_minor), std::move(thunk_schedule), - std::move(module), std::move(buffer_assignment), - std::move(profile_printer), std::move(profile_index_map)); - if (embed_ir_in_executable) { - DCHECK_NE("", ir_module_string_before_opt); - gpu_executable->set_ir_module_string(ir_module_string_before_opt); - } - return std::unique_ptr(gpu_executable); + return std::pair>(std::move(ptx), + std::move(cubin)); } std::vector NVPTXCompiler::CompilePtxOrGetCachedResult( @@ -761,16 +444,5 @@ std::vector NVPTXCompiler::CompilePtxOrGetCachedResult( return cache_value->cubin_data; } -StatusOr>> -NVPTXCompiler::CompileAheadOfTime(std::unique_ptr module_group, - const AotCompilationOptions& options) { - return Unimplemented( - "not yet implemented: NVPTXCompiler::CompileAheadOfTime"); -} - -se::Platform::Id NVPTXCompiler::PlatformId() const { - return se::cuda::kCudaPlatformId; -} - } // 
namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h index 980c00ac7da..a7b38afb8ec 100644 --- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h +++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h @@ -22,72 +22,37 @@ limitations under the License. #include "absl/container/node_hash_map.h" #include "absl/types/optional.h" -#include "absl/types/span.h" -#include "tensorflow/compiler/xla/service/executable.h" -#include "tensorflow/compiler/xla/service/hlo_module.h" -#include "tensorflow/compiler/xla/service/llvm_compiler.h" +#include "tensorflow/compiler/xla/service/gpu/gpu_compiler.h" #include "tensorflow/compiler/xla/statusor.h" -#include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/lib/hash/hash.h" -#include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/mutex.h" -#include "tensorflow/core/platform/stream_executor_no_cuda.h" -#include "tensorflow/core/platform/thread_annotations.h" -#include "tensorflow/stream_executor/stream_executor_pimpl.h" namespace xla { namespace gpu { -// Temporarily expose the optimization pipeline for the GPU backend for reuse -// in the MLIR GPU backend. -// TODO(b/137624192): Remove once MLIR backend uses tailored optimizations. -namespace impl { - -Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec, - se::DeviceMemoryAllocator* device_allocator); -Status PrepareHloModuleForIrEmitting(HloModule* hlo_module); - -} // namespace impl - -// The GPU compiler generates efficient GPU executables. -class NVPTXCompiler : public LLVMCompiler { +// NVPTXCompiler generates efficient GPU executables for NVPTX target. +class NVPTXCompiler : public GpuCompiler { public: NVPTXCompiler(); ~NVPTXCompiler() override {} - // Bring in - // StatusOr>> Compile( - // std::vector> modules, - // std::vector> - // stream_execs) - using LLVMCompiler::Compile; - - StatusOr> RunHloPasses( - std::unique_ptr module, se::StreamExecutor* stream_exec, + Status OptimizeHloConvolutionCanonicalization( + HloModule* hlo_module, se::StreamExecutor* stream_exec, se::DeviceMemoryAllocator* device_allocator) override; - StatusOr> RunBackend( - std::unique_ptr module, se::StreamExecutor* stream_exec, + Status OptimizeHloPostLayoutAssignment( + HloModule* hlo_module, se::StreamExecutor* stream_exec, se::DeviceMemoryAllocator* device_allocator) override; - StatusOr>> - CompileAheadOfTime(std::unique_ptr module_group, - AotCompilationOptions const& options) override; + HloDataflowAnalysis::CanShareBuffer GetCanShareBuffer() override; - se::Platform::Id PlatformId() const override; + GpuVersion GetGpuVersion(se::StreamExecutor* stream_exec) override; - HloCostAnalysis::ShapeSizeFunction ShapeSizeBytesFunction() const override { - // Capture just the pointer size, not the entire NVPTXCompiler object. - int64 pointer_size = pointer_size_; - return [pointer_size](const Shape& shape) { - return ShapeUtil::ByteSizeOf(shape, pointer_size); - }; - } + StatusOr>> CompileTargetBinary( + const HloModule* hlo_module, llvm::Module* llvm_module, + GpuVersion gpu_version, se::StreamExecutor* stream_exec) override; private: - // The size in bytes of a pointer. Used by ShapeSizeBytesFunction. 
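The new NVPTXCompiler declaration above shows the shape of the refactoring: the target-independent driver now lives in the GpuCompiler base class, and the CUDA-specific work (conv canonicalization, post-layout passes, GPU version query, PTX/cubin emission) is expressed as overridden hooks. Below is a minimal template-method sketch of that split; GpuCompilerBase and NvptxLikeCompiler are invented stand-in names, and the hook signatures are simplified relative to the real HloModule/StreamExecutor-based ones.

#include <cstdint>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

struct Module { std::string name; };
using GpuVersion = std::pair<int, int>;

class GpuCompilerBase {
 public:
  virtual ~GpuCompilerBase() = default;

  // Template method: the shared driver calls the target-specific hooks in a
  // fixed order.
  std::vector<uint8_t> Compile(Module* module) {
    OptimizeConvolutions(module);
    OptimizePostLayoutAssignment(module);
    return CompileTargetBinary(module, GetGpuVersion());
  }

 protected:
  virtual void OptimizeConvolutions(Module* module) = 0;
  virtual void OptimizePostLayoutAssignment(Module* module) = 0;
  virtual GpuVersion GetGpuVersion() = 0;
  virtual std::vector<uint8_t> CompileTargetBinary(Module* module,
                                                   GpuVersion version) = 0;
};

class NvptxLikeCompiler : public GpuCompilerBase {
 protected:
  void OptimizeConvolutions(Module*) override {
    std::cout << "rewrite convs to cudnn custom-calls\n";
  }
  void OptimizePostLayoutAssignment(Module*) override {
    std::cout << "pick conv/gemm algorithms\n";
  }
  GpuVersion GetGpuVersion() override { return {7, 0}; }  // e.g. sm_70
  std::vector<uint8_t> CompileTargetBinary(Module*, GpuVersion v) override {
    std::cout << "emit PTX for sm_" << v.first << v.second << "\n";
    return {0xde, 0xad};  // Placeholder bytes standing in for a cubin.
  }
};

int main() {
  Module m{"example"};
  NvptxLikeCompiler compiler;
  compiler.Compile(&m);
}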
- const int64 pointer_size_; - tensorflow::mutex mutex_; // When compiling an HLO module, we need to find a path to the nvvm libdevice diff --git a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc index cb012649200..f9937ba77de 100644 --- a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc +++ b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc @@ -22,6 +22,7 @@ limitations under the License. // IWYU pragma: no_include "llvm/IR/Intrinsics.gen.inc" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Value.h" +#include "tensorflow/compiler/xla/service/gpu/target_util.h" #include "tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h" #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" #include "tensorflow/compiler/xla/shape_util.h" @@ -72,8 +73,8 @@ ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(absl::string_view loop_name, VLOG(3) << "EmitIndexAndSetExitBasicBlock unroll_factor " << unroll_factor_; CHECK_NE(index_type, nullptr); std::vector array_indices; - llvm::Value* block_id = llvm_ir::EmitCallToIntrinsic( - llvm::Intrinsic::nvvm_read_ptx_sreg_ctaid_x, {}, {}, b_); + llvm::Value* block_id = + EmitCallToTargetIntrinsic(TargetIntrinsicID::kBlockIdx, {}, {}, b_); llvm_ir::AddRangeMetadata(0, launch_dimensions_.block_count(), static_cast(block_id)); block_id = b_->CreateZExtOrTrunc(block_id, index_type, "block_id"); @@ -82,8 +83,8 @@ ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(absl::string_view loop_name, // "It is guaranteed that [...] 0 <= %tid.x < %ntid.x" // // %ntid.x is currently specified as 1024. - llvm::Value* thread_id = llvm_ir::EmitCallToIntrinsic( - llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x, {}, {}, b_); + llvm::Value* thread_id = + EmitCallToTargetIntrinsic(TargetIntrinsicID::kThreadIdx, {}, {}, b_); llvm_ir::AddRangeMetadata(0, launch_dimensions_.threads_per_block(), static_cast(thread_id)); thread_id = b_->CreateZExtOrTrunc(thread_id, index_type, "thread_id"); diff --git a/tensorflow/compiler/xla/service/gpu/partition_assignment.cc b/tensorflow/compiler/xla/service/gpu/partition_assignment.cc index 10bc82488ff..2276807d74f 100644 --- a/tensorflow/compiler/xla/service/gpu/partition_assignment.cc +++ b/tensorflow/compiler/xla/service/gpu/partition_assignment.cc @@ -82,6 +82,11 @@ LaunchDimensions CalculateLaunchDimensions( // TODO(jlebar): Investigate this further, and tune this heuristic so we can // run faster on the few benchmarks where smaller block size helps. int64 threads_per_block = ThreadsPerBlockLimit(device_desc); + // We unroll kernels to make use of vectorized loads/stores. This means we + // need more registers to hold intermediate values. Reduce the number of + // blocks per thread to increase the number of registers available to ptxas. + // Make sure we still have a multiple of 32. + threads_per_block = RoundUpToNearest(threads_per_block / unroll_factor, 32LL); if (num_elements < threads_per_block) { threads_per_block = num_elements; VLOG(2) << "Update # of threads per block to the element count (" diff --git a/tensorflow/compiler/xla/service/gpu/scratch_allocator.cc b/tensorflow/compiler/xla/service/gpu/scratch_allocator.cc deleted file mode 100644 index 5793051771f..00000000000 --- a/tensorflow/compiler/xla/service/gpu/scratch_allocator.cc +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/xla/service/gpu/scratch_allocator.h" - -namespace xla { -namespace gpu { - -StatusOr> ScratchAllocator::AllocateBytes( - se::Stream* stream, int64 byte_size) { - CHECK_GE(byte_size, 0) << "byte_size must be positive."; - if (byte_size > GetMemoryLimitInBytes(stream)) { - return se::port::Status( - se::port::error::RESOURCE_EXHAUSTED, - absl::StrFormat( - "Allocating %d bytes exceeds the memory limit of %d bytes.", - byte_size, GetMemoryLimitInBytes(stream))); - } - - TF_ASSIGN_OR_RETURN(se::OwningDeviceMemory allocated_buffer, - memory_allocator_->Allocate(device_ordinal_, byte_size, - /*retry_on_failure=*/false)); - total_allocated_bytes_ += byte_size; - - se::DeviceMemoryBase buffer_addr = *allocated_buffer; - allocated_buffers_.push_back(std::move(allocated_buffer)); - return se::DeviceMemory(buffer_addr); -} - -} // namespace gpu -} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/scratch_allocator.h b/tensorflow/compiler/xla/service/gpu/scratch_allocator.h deleted file mode 100644 index 9654237956a..00000000000 --- a/tensorflow/compiler/xla/service/gpu/scratch_allocator.h +++ /dev/null @@ -1,61 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_SCRATCH_ALLOCATOR_H_ -#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_SCRATCH_ALLOCATOR_H_ - -#include - -#include "tensorflow/compiler/xla/status_macros.h" -#include "tensorflow/compiler/xla/util.h" -#include "tensorflow/core/platform/stream_executor_no_cuda.h" -#include "tensorflow/stream_executor/device_memory_allocator.h" - -namespace xla { -namespace gpu { - -class ScratchAllocator : public se::ScratchAllocator { - public: - ScratchAllocator(int device_ordinal, - se::DeviceMemoryAllocator* memory_allocator) - : device_ordinal_(device_ordinal), memory_allocator_(memory_allocator) {} - - int64 GetMemoryLimitInBytes(se::Stream* stream) override { - return 1LL << 32; // 4GB. TODO(jlebar): Tune this? 
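Returning to the CalculateLaunchDimensions change in partition_assignment.cc above: the new RoundUpToNearest call shrinks the block size by the unroll factor to free registers for ptxas while keeping it a multiple of the 32-thread warp size. A standalone sketch of that arithmetic, approximating RoundUpToNearest with plain integer math:

#include <cstdint>
#include <iostream>

// Divide the thread limit by the unroll factor, then round up to the nearest
// multiple of 32.
int64_t AdjustThreadsPerBlock(int64_t threads_per_block,
                              int64_t unroll_factor) {
  const int64_t reduced = threads_per_block / unroll_factor;
  return (reduced + 31) / 32 * 32;
}

int main() {
  // With a 1024-thread limit and 4x unrolling we get 256 threads per block.
  std::cout << AdjustThreadsPerBlock(1024, 4) << "\n";  // 256
  // A limit that divides to a non-multiple of 32 still rounds up to one.
  std::cout << AdjustThreadsPerBlock(1000, 4) << "\n";  // 250 -> 256
}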
- } - int64 TotalAllocatedBytes() { return total_allocated_bytes_; } - - StatusOr> AllocateBytes(se::Stream* stream, - int64 byte_size) override; - - template - StatusOr> Allocate(se::Stream* stream, - int64 num_elements) { - TF_ASSIGN_OR_RETURN(se::DeviceMemory bytes, - AllocateBytes(stream, num_elements * sizeof(T))); - return se::DeviceMemory(bytes); - } - - private: - const int device_ordinal_; - se::DeviceMemoryAllocator* memory_allocator_; - std::vector allocated_buffers_; - int64 total_allocated_bytes_ = 0; -}; - -} // namespace gpu -} // namespace xla - -#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_SCRATCH_ALLOCATOR_H_ diff --git a/tensorflow/compiler/xla/service/gpu/stream_executor_util.cc b/tensorflow/compiler/xla/service/gpu/stream_executor_util.cc index 1cdf9752390..117931e3398 100644 --- a/tensorflow/compiler/xla/service/gpu/stream_executor_util.cc +++ b/tensorflow/compiler/xla/service/gpu/stream_executor_util.cc @@ -19,6 +19,7 @@ limitations under the License. #include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/util.h" +#include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/gtl/cleanup.h" #include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/platform/cuda_libdevice_path.h" @@ -201,10 +202,7 @@ StatusOr> CreateKernel( } auto kernel_base = absl::make_unique(stream_exec); - if (!stream_exec->GetKernel(loader_spec, kernel_base.get())) { - return InternalError("Unable to load kernel '%s'", kernel_name); - } - + TF_RETURN_IF_ERROR(stream_exec->GetKernel(loader_spec, kernel_base.get())); return std::move(kernel_base); } @@ -217,13 +215,9 @@ Status ExecuteKernelOnStream(const se::KernelBase& kernel, for (const se::DeviceMemoryBase& buf : args) { kernel_args->add_device_memory_argument(buf); } - - if (!stream->parent()->Launch(stream, se::ThreadDim(threads_per_block), - se::BlockDim(block_count), kernel, - *kernel_args)) { - return InternalError("Unable to launch kernel"); - } - return Status::OK(); + return stream->parent()->Launch(stream, se::ThreadDim(threads_per_block), + se::BlockDim(block_count), kernel, + *kernel_args); } se::cuda::PtxCompilationOptions PtxOptsFromConfig( diff --git a/tensorflow/compiler/xla/service/gpu/target_util.cc b/tensorflow/compiler/xla/service/gpu/target_util.cc index 31f989bd58c..48c703183fc 100644 --- a/tensorflow/compiler/xla/service/gpu/target_util.cc +++ b/tensorflow/compiler/xla/service/gpu/target_util.cc @@ -29,9 +29,14 @@ namespace { using absl::StrCat; // Wrapper structure for carrying llvm intrinsic ids for NVPTX/AMDGPU platforms. +// On AMDGPU, some of these operations are made as device functions instead of +// intrinsics. Therefore a variant type is used to wrap the lambda to call +// those device functions. 
struct TargetIntrinsics { llvm::Intrinsic::ID nvptx_intrinsic; - llvm::Intrinsic::ID amdgpu_intrinsic; + absl::variant*)>> + amdgpu_intrinsic_or_function; }; // Gets the llvm intrinsic ids on different platforms (NVPTX, AMDGPU) @@ -66,6 +71,30 @@ struct TargetIntrinsics GetIntrinsic(TargetIntrinsicID intrin) { return {llvm::Intrinsic::nvvm_barrier0, llvm::Intrinsic::amdgcn_s_barrier}; } + case TargetIntrinsicID::kBlockDimx: { + return {llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x, + [](llvm::IRBuilder<>* b_) -> llvm::CallInst* { + return EmitDeviceFunctionCall("__ockl_get_local_size", + {b_->getInt32(0)}, {U32}, U64, {}, + b_); + }}; + } + case TargetIntrinsicID::kBlockDimy: { + return {llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_y, + [](llvm::IRBuilder<>* b_) -> llvm::CallInst* { + return EmitDeviceFunctionCall("__ockl_get_local_size", + {b_->getInt32(1)}, {U32}, U64, {}, + b_); + }}; + } + case TargetIntrinsicID::kBlockDimz: { + return {llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_z, + [](llvm::IRBuilder<>* b_) -> llvm::CallInst* { + return EmitDeviceFunctionCall("__ockl_get_local_size", + {b_->getInt32(2)}, {U32}, U64, {}, + b_); + }}; + } } } @@ -156,6 +185,36 @@ string ObtainDeviceFunctionName(TargetDeviceFunctionID func_id, } } +llvm::CallInst* EmitDeviceFunctionCall( + const string& callee_name, absl::Span operands, + absl::Span input_types, PrimitiveType output_type, + absl::Span attributes, + llvm::IRBuilder<>* b) { + std::vector ir_input_types; + llvm::Module* module = b->GetInsertBlock()->getModule(); + for (PrimitiveType input_type : input_types) { + ir_input_types.push_back( + llvm_ir::PrimitiveTypeToIrType(input_type, module)); + } + llvm::FunctionType* callee_type = llvm::FunctionType::get( + llvm_ir::PrimitiveTypeToIrType(output_type, module), // Return type. + ir_input_types, // Parameter types. + false); // No variadic arguments. + + // Declares the callee if it is not declared already. + llvm::Function* callee = llvm::dyn_cast( + b->GetInsertBlock() + ->getModule() + ->getOrInsertFunction(callee_name, callee_type) + .getCallee()); + + for (auto attribute : attributes) { + callee->addFnAttr(attribute); + } + + return b->CreateCall(callee, llvm_ir::AsArrayRef(operands)); +} + llvm::CallInst* EmitCallToTargetIntrinsic( TargetIntrinsicID intrinsic_id, absl::Span operands, absl::Span overloaded_types, llvm::IRBuilder<>* b) { @@ -166,7 +225,17 @@ llvm::CallInst* EmitCallToTargetIntrinsic( if (target_triple.isNVPTX()) { llvm_intrinsic_id = gpu_intrinsic_id.nvptx_intrinsic; } else if (target_triple.getArch() == llvm::Triple::amdgcn) { - llvm_intrinsic_id = gpu_intrinsic_id.amdgpu_intrinsic; + llvm::Intrinsic::ID* llvm_intrinsic_id_ptr = + absl::get_if( + &gpu_intrinsic_id.amdgpu_intrinsic_or_function); + if (llvm_intrinsic_id_ptr) { + llvm_intrinsic_id = *llvm_intrinsic_id_ptr; + } else { + std::function*)>* builder_func = + absl::get_if*)>>( + &gpu_intrinsic_id.amdgpu_intrinsic_or_function); + return (*builder_func)(b); + } } else { LOG(FATAL) << "Invalid triple " << target_triple.str(); } diff --git a/tensorflow/compiler/xla/service/gpu/target_util.h b/tensorflow/compiler/xla/service/gpu/target_util.h index d50529e395e..4355ed21136 100644 --- a/tensorflow/compiler/xla/service/gpu/target_util.h +++ b/tensorflow/compiler/xla/service/gpu/target_util.h @@ -39,6 +39,9 @@ enum class TargetIntrinsicID { kBlockIdy, kBlockIdz, kBarrierId, + kBlockDimx, + kBlockDimy, + kBlockDimz, }; // Enumeration to get target specific device math function. 
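The amdgpu_intrinsic_or_function variant above lets a table entry be either a plain intrinsic ID or a callback that emits a device-function call such as __ockl_get_local_size. The sketch below reproduces that dispatch shape with std::variant and strings in place of absl::variant, llvm::Intrinsic::ID, and the IRBuilder-based callbacks; it is illustrative only.

#include <functional>
#include <iostream>
#include <string>
#include <variant>

using EmitFn = std::function<std::string()>;

struct TargetIntrinsics {
  int nvptx_intrinsic;
  std::variant<int, EmitFn> amdgpu_intrinsic_or_function;
};

std::string EmitForAmdgpu(const TargetIntrinsics& entry) {
  if (const int* id = std::get_if<int>(&entry.amdgpu_intrinsic_or_function)) {
    return "call intrinsic #" + std::to_string(*id);
  }
  // No intrinsic exists for this operation; fall back to the callback that
  // emits a device-function call.
  const EmitFn* fn = std::get_if<EmitFn>(&entry.amdgpu_intrinsic_or_function);
  return (*fn)();
}

int main() {
  TargetIntrinsics barrier{/*nvptx=*/1, /*amdgpu=*/2};
  TargetIntrinsics block_dim_x{
      /*nvptx=*/3,
      EmitFn([] { return std::string("call __ockl_get_local_size(0)"); })};

  std::cout << EmitForAmdgpu(barrier) << "\n";      // call intrinsic #2
  std::cout << EmitForAmdgpu(block_dim_x) << "\n";  // call __ockl_get_local_size(0)
}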
@@ -59,8 +62,15 @@ enum class TargetDeviceFunctionID { kHypot }; -// Emits a call to the specified target intrinsic with the given operands. +// Emits IR to call a device function named "callee_name" on the given +// operand. Returns the IR value that represents the return value. +llvm::CallInst* EmitDeviceFunctionCall( + const std::string& callee_name, absl::Span operands, + absl::Span input_type, PrimitiveType output_type, + absl::Span attributes, + llvm::IRBuilder<>* b); +// Emits a call to the specified target intrinsic with the given operands. // Overloaded intrinsics (for example, "minnum") must include a type // in overloaded_types for each overloaded type. Typically, overloaded // intrinsics have only a single overloaded type. diff --git a/tensorflow/compiler/xla/service/gpu/tests/BUILD b/tensorflow/compiler/xla/service/gpu/tests/BUILD index a9b52d985af..67051b153b1 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/BUILD +++ b/tensorflow/compiler/xla/service/gpu/tests/BUILD @@ -7,7 +7,7 @@ load("//tensorflow/compiler/xla/tests:build_defs.bzl", "xla_test") load("//tensorflow:tensorflow.bzl", "tf_cc_test") load( - "//tensorflow/core:platform/default/build_config_root.bzl", + "//tensorflow/core/platform:default/build_config_root.bzl", "tf_cuda_tests_tags", ) diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc index 83fb6ebb443..7491949fa59 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc @@ -52,28 +52,5 @@ void GpuCodegenTest::CompileAndVerifyPtx(std::unique_ptr hlo_module, EXPECT_TRUE(filecheck_result.ValueOrDie()); } -void GpuCodegenTest::MatchOptimizedHlo(absl::string_view hlo, - absl::string_view pattern, - bool print_operand_shape) { - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr optimized_module, - GetOptimizedModule(hlo)); - HloPrintOptions print_opts; - print_opts.set_print_operand_shape(print_operand_shape); - StatusOr filecheck_result = - RunFileCheck(optimized_module->ToString(print_opts), pattern); - TF_ASSERT_OK(filecheck_result.status()); - EXPECT_TRUE(filecheck_result.ValueOrDie()); -} - -StatusOr> GpuCodegenTest::GetOptimizedModule( - absl::string_view hlo) { - HloModuleConfig config; - TF_ASSIGN_OR_RETURN(std::unique_ptr module, - ParseAndReturnVerifiedModule(hlo, config)); - return backend().compiler()->RunHloPasses( - std::move(module), backend().default_stream_executor(), - backend().default_stream_executor()->GetAllocator()); -} - } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h index c3c6586d12a..59fba6325ec 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h @@ -34,21 +34,6 @@ class GpuCodegenTest : public LlvmIrGenTestBase { // FileCheck pattern. (See http://llvm.org/docs/CommandGuide/FileCheck.html). void CompileAndVerifyPtx(std::unique_ptr hlo_module, absl::string_view pattern); - - // Compiles the given `hlo` with optimizations, and verifies that optimized - // HLO matches the given FileCheck pattern. - void MatchOptimizedHlo(absl::string_view hlo, absl::string_view pattern, - bool print_operand_shape = false); - - // LikeMatchOptimizedHlo, but checks operand shapes as well. 
- void MatchOptimizedHloWithShapes(absl::string_view hlo, - absl::string_view pattern) { - MatchOptimizedHlo(hlo, pattern, /*print_operand_shape=*/true); - } - - // Compiles and returns module with optimizations from a given HLO. - StatusOr> GetOptimizedModule( - absl::string_view hlo); }; } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc index a12932f573b..92bb84065a2 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc @@ -99,6 +99,22 @@ TEST_F(GpuKernelTilingTest, UnnestedTransposeWithSmallDimensionsNotTiled) { /*match_optimized_ir=*/true); } +TEST_F(GpuKernelTilingTest, UnnestedTransposeC128TypeRun) { + const char *const kHloString = R"( + HloModule unnested_transpose_3 + + ENTRY unnested_transpose_3 { + para0 = c128[65,65]{1,0} parameter(0) + ROOT copy1 = c128[65,65]{0,1} copy(para0) + })"; + + // With the current implementation for the available hardwares, we bail out + // from the tiled transpose implementation at the last minute. Instead of + // checking the transpose is not tiled, we only check the module compiled and + // run in this test. + EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{0.0})); +} + TEST_F(GpuKernelTilingTest, SimpleFusionWithTransposeTiled) { const char *const kHloString = R"( HloModule multiple_output_fusion_1 @@ -520,6 +536,51 @@ TEST_F(GpuKernelTilingTest, EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{0.001})); } +TEST_F(GpuKernelTilingTest, ColumnReductionSmallTileSizeX) { + const char *const kHloString = R"( + HloModule Test + + scalar_add_computation.1 { + scalar_lhs.1 = f32[] parameter(0) + scalar_rhs.1 = f32[] parameter(1) + ROOT add.6 = f32[] add(scalar_lhs.1, scalar_rhs.1) + } + ENTRY Test { + param_3.241 = f16[512,2,9,9]{1,3,2,0} parameter(3) + constant_661 = f16[] constant(0) + broadcast.695 = f16[512,2,9,9]{1,3,2,0} broadcast(constant_661), dimensions={} + compare.42 = pred[512,2,9,9]{1,3,2,0} compare(param_3.241, broadcast.695), direction=GT + param_2.401 = f16[512,2,9,9]{1,3,2,0} parameter(2) + select.40 = f16[512,2,9,9]{1,3,2,0} select(compare.42, param_2.401, broadcast.695) + convert.196 = f32[512,2,9,9]{1,3,2,0} convert(select.40) + param_1.809 = f16[512,2,9,9]{1,3,2,0} parameter(1) + copy.335 = f16[512,2,9,9]{1,3,2,0} copy(param_1.809) + convert.218 = f32[512,2,9,9]{1,3,2,0} convert(copy.335) + param_0.668 = f32[2]{0} parameter(0) + broadcast.687 = f32[512,2,9,9]{1,3,2,0} broadcast(param_0.668), dimensions={1} + subtract.136 = f32[512,2,9,9]{1,3,2,0} subtract(convert.218, broadcast.687) + multiply.579 = f32[512,2,9,9]{1,3,2,0} multiply(convert.196, subtract.136) + constant_485 = f32[] constant(0) + reduce.139 = f32[2]{0} reduce(multiply.579, constant_485), dimensions={0,2,3}, to_apply=scalar_add_computation.1 + reduce.140.clone.1 = f32[2]{0} reduce(convert.196, constant_485), dimensions={0,2,3}, to_apply=scalar_add_computation.1 + ROOT tuple.102 = (f32[2]{0}, f32[2]{0}) tuple(reduce.139, reduce.140.clone.1) + })"; + + // Check that no loop is generated for reduction. + auto hlo_module = + ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment()) + .ValueOrDie(); + CompileAndVerifyIr(std::move(hlo_module), + R"( +; CHECK-LABEL: define void @fusion +; CHECK-NOT: reduce.0.loop_header +; CHECK: } +)", + /*match_optimized_ir=*/true); + // Check that the kernel runs correctly. 
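The new tests above compare results against a reference with tolerances such as ErrorSpec{1.0e-5, 1.0e-5}. Below is a minimal sketch of such a comparison, assuming the usual interpretation that an element passes when either the absolute or the relative error is within its bound; the real xla::ErrorSpec and literal comparison have additional options not shown here.

#include <algorithm>
#include <cmath>
#include <iostream>

bool WithinError(double expected, double actual, double abs_tol,
                 double rel_tol) {
  const double abs_err = std::fabs(expected - actual);
  const double rel_err = abs_err / std::max(std::fabs(expected), 1e-30);
  return abs_err <= abs_tol || rel_err <= rel_tol;
}

int main() {
  // Large magnitudes pass on the relative bound even when the absolute
  // difference exceeds 1e-5.
  std::cout << WithinError(100.0, 100.0005, 1e-5, 1e-5) << "\n";  // 1
  // Small magnitudes must satisfy one of the two bounds; this one fails both.
  std::cout << WithinError(1.0, 1.0005, 1e-5, 1e-5) << "\n";      // 0
}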
+ EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{1.0e-5, 1.0e-5})); +} + TEST_F(GpuKernelTilingTest, RowReductionWithSmallDimensionNotTiled) { const char *const kHloString = R"( HloModule reduction diff --git a/tensorflow/compiler/xla/service/gpu/thunk_emitter.cc b/tensorflow/compiler/xla/service/gpu/thunk_emitter.cc new file mode 100644 index 00000000000..13d32672a95 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/thunk_emitter.cc @@ -0,0 +1,373 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/gpu/thunk_emitter.h" + +#include "tensorflow/compiler/xla/service/custom_call_target_registry.h" +#include "tensorflow/compiler/xla/service/gpu/backend_configs.pb.h" +#include "tensorflow/compiler/xla/service/gpu/cholesky_thunk.h" +#include "tensorflow/compiler/xla/service/gpu/convolution_thunk.h" +#include "tensorflow/compiler/xla/service/gpu/copy_thunk.h" +#include "tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.h" +#include "tensorflow/compiler/xla/service/gpu/custom_call_thunk.h" +#include "tensorflow/compiler/xla/service/gpu/fft_thunk.h" +#include "tensorflow/compiler/xla/service/gpu/gemm_thunk.h" +#include "tensorflow/compiler/xla/service/gpu/infeed_thunk.h" +#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" +#include "tensorflow/compiler/xla/service/gpu/outfeed_thunk.h" +#include "tensorflow/compiler/xla/service/gpu/sequential_thunk.h" +#include "tensorflow/compiler/xla/service/gpu/triangular_solve_thunk.h" +#include "tensorflow/compiler/xla/service/hlo_casting_utils.h" + +namespace xla { +namespace gpu { + +std::unique_ptr ThunkEmitter::BuildFftThunk(const HloInstruction* inst) { + const HloInstruction* operand = inst->operand(0); + return absl::make_unique( + inst->fft_type(), inst->fft_length(), + /*input_buffer=*/GetAllocationSlice(*operand), + /*output_buffer=*/GetAllocationSlice(*inst), + /*input_shape=*/operand->shape(), + /*output_shape=*/inst->shape(), inst); +} + +std::unique_ptr ThunkEmitter::BuildTriangularSolveThunk( + const HloInstruction* inst) { + const HloInstruction* a = inst->operand(0); + const HloInstruction* b = inst->operand(1); + int64 m = b->shape().dimensions(b->shape().rank() - 2); + int64 n = b->shape().dimensions(b->shape().rank() - 1); + int64 batch_size = std::accumulate( + b->shape().dimensions().begin(), b->shape().dimensions().end() - 2, + int64{1}, [](int64 a, int64 b) { return a * b; }); + int64 elem_size = + ShapeUtil::ByteSizeOfPrimitiveType(inst->shape().element_type()); + int64 a_batch_stride = inst->triangular_solve_options().left_side() + ? 
m * m * elem_size + : n * n * elem_size; + int64 b_batch_stride = m * n * elem_size; + return absl::make_unique( + inst->triangular_solve_options(), + /*a_input_buffer=*/GetAllocationSlice(*a), + /*b_input_buffer=*/GetAllocationSlice(*inst), + inst->shape().element_type(), batch_size, m, n, a_batch_stride, + b_batch_stride, inst); +} + +std::unique_ptr ThunkEmitter::BuildGemmThunk( + const HloInstruction* inst) { + auto config_or = inst->backend_config(); + GemmBackendConfig gemm_config = std::move(config_or.ValueOrDie()); + const HloInstruction* lhs = inst->operand(0); + const HloInstruction* rhs = inst->operand(1); + + // The bias is passed inside the output buffer. If those buffers are shared + // we can just use it, otherwise copy the bias values into the output buffer + // first. + if (gemm_config.beta() != 0.0) { + const HloInstruction* bias = inst->operand(2); + CHECK_EQ(bias->shape(), inst->shape()); + if (GetAllocationSlice(*bias) != GetAllocationSlice(*inst)) { + std::vector> thunks; + thunks.push_back(absl::make_unique( + /*source_buffer=*/GetAllocationSlice(*bias), + /*destination_buffer=*/GetAllocationSlice(*inst), + /*mem_size=*/ShapeUtil::ByteSizeOf(inst->shape()), nullptr)); + thunks.push_back(absl::make_unique( + GetAllocationSlice(*lhs), // The buffer assigned to LHS. + GetAllocationSlice(*rhs), // The buffer assigned to RHS. + GetAllocationSlice(*inst), // The output buffer. + /*implements_whole_instruction=*/false, inst, + std::move(gemm_config))); + return absl::make_unique(std::move(thunks), inst); + } + } + + return absl::make_unique( + GetAllocationSlice(*lhs), // The buffer assigned to LHS. + GetAllocationSlice(*rhs), // The buffer assigned to RHS. + GetAllocationSlice(*inst), // The output buffer. + /*implements_whole_instruction=*/true, inst, std::move(gemm_config)); +} + +std::unique_ptr ThunkEmitter::BuildInfeedThunk( + const HloInstruction* inst) { + CHECK_EQ(HloOpcode::kInfeed, inst->opcode()); + + ShapeTree slices(inst->shape()); + slices.ForEachMutableElement( + [&](const ShapeIndex& index, BufferAllocation::Slice* slice) { + *slice = GetAllocationSlice(*inst, index); + }); + return absl::make_unique(slices, inst); +} + +std::unique_ptr ThunkEmitter::BuildOutfeedThunk( + const HloInstruction* inst) { + CHECK_EQ(HloOpcode::kOutfeed, inst->opcode()); + + ShapeTree slices(inst->operand(0)->shape()); + slices.ForEachMutableElement([&](const ShapeIndex& index, + BufferAllocation::Slice* slice) { + auto status_or_slice = MaybeGetAllocationSlice(*inst->operand(0), index); + if (status_or_slice.ok()) { + *slice = status_or_slice.ValueOrDie(); + } + }); + return absl::make_unique(std::move(slices), inst); +} + +Status ThunkEmitter::HandleCustomCall(HloInstruction* custom_call) { + // A CustomCall on the GPU backend can either be a custom-call to a + // user-supplied kernel, or a call into a library like cudnn. + + // Lower custom-calls to cudnn batchnorm ops to specialized thunks. It's part + // of the contract of these cudnn batchnorm calls that the epsilon and + // feature_index operands be constants. 
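The batch and stride arithmetic in BuildTriangularSolveThunk above is easier to see with concrete numbers: every dimension of b except the last two is a batch dimension, and the stride of the triangular operand depends on whether it multiplies from the left (m x m) or the right (n x n). A standalone sketch:

#include <cstdint>
#include <iostream>
#include <numeric>
#include <vector>

struct SolveDims {
  int64_t batch_size;
  int64_t m, n;
  int64_t a_batch_stride_bytes;
  int64_t b_batch_stride_bytes;
};

SolveDims ComputeDims(const std::vector<int64_t>& b_dims, bool left_side,
                      int64_t elem_size) {
  const int64_t rank = static_cast<int64_t>(b_dims.size());
  const int64_t m = b_dims[rank - 2];
  const int64_t n = b_dims[rank - 1];
  // Product of all dimensions except the trailing matrix dimensions.
  const int64_t batch_size =
      std::accumulate(b_dims.begin(), b_dims.end() - 2, int64_t{1},
                      [](int64_t a, int64_t b) { return a * b; });
  return {batch_size, m, n,
          (left_side ? m * m : n * n) * elem_size,  // A is m x m or n x n.
          m * n * elem_size};
}

int main() {
  // b shape f32[2,3,64,32]: a batch of 6 solves, each with a 64x32 rhs.
  SolveDims d = ComputeDims({2, 3, 64, 32}, /*left_side=*/true, sizeof(float));
  std::cout << d.batch_size << " solves of " << d.m << "x" << d.n
            << ", a_stride=" << d.a_batch_stride_bytes
            << ", b_stride=" << d.b_batch_stride_bytes << "\n";
  // Prints: 6 solves of 64x32, a_stride=16384, b_stride=8192
}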
+ if (custom_call->custom_call_target() == + kCudnnBatchNormForwardInferenceCallTarget) { + const HloInstruction* epsilon = custom_call->operand(5); + CHECK(epsilon->IsConstant()); + float epsilon_value = epsilon->literal().Get({}); + + const HloInstruction* feature_index = custom_call->operand(6); + CHECK(feature_index->IsConstant()); + int64 feature_index_value = feature_index->literal().Get({}); + + AddThunkToThunkSequence( + absl::make_unique( + /*operand=*/GetAllocationSlice(*custom_call->operand(0)), + /*scale=*/GetAllocationSlice(*custom_call->operand(1)), + /*offset=*/GetAllocationSlice(*custom_call->operand(2)), + /*mean=*/GetAllocationSlice(*custom_call->operand(3)), + /*variance=*/GetAllocationSlice(*custom_call->operand(4)), + /*epsilon=*/epsilon_value, + /*feature_index=*/feature_index_value, + /*output=*/GetAllocationSlice(*custom_call), + /*hlo=*/custom_call)); + return Status::OK(); + } + + if (custom_call->custom_call_target() == + kCudnnBatchNormForwardTrainingCallTarget) { + const HloInstruction* epsilon = custom_call->operand(3); + CHECK(epsilon->IsConstant()); + float epsilon_value = epsilon->literal().Get({}); + + const HloInstruction* feature_index = custom_call->operand(4); + CHECK(feature_index->IsConstant()); + int64 feature_index_value = feature_index->literal().Get({}); + + // BatchNormTraining returns a tuple of three elements: data, calculated + // mean, and calculated 1/sqrt(variance + epsilon). + auto output_data = GetAllocationSlice(*custom_call, {0}); + auto output_mean = GetAllocationSlice(*custom_call, {1}); + auto output_inv_stddev = GetAllocationSlice(*custom_call, {2}); + AddThunkToThunkSequence( + absl::make_unique( + /*operand=*/GetAllocationSlice(*custom_call->operand(0)), + /*scale=*/GetAllocationSlice(*custom_call->operand(1)), + /*offset=*/GetAllocationSlice(*custom_call->operand(2)), + /*epsilon=*/epsilon_value, + /*feature_index=*/feature_index_value, + /*output_data=*/output_data, + /*output_mean=*/output_mean, + /*output_inv_stddev=*/output_inv_stddev, + /*output_tuple=*/GetAllocationSlice(*custom_call), + /*hlo=*/custom_call)); + return Status::OK(); + } + + if (custom_call->custom_call_target() == kCudnnBatchNormBackwardCallTarget) { + const HloInstruction* epsilon = custom_call->operand(5); + CHECK(epsilon->IsConstant()); + float epsilon_value = epsilon->literal().Get({}); + + const HloInstruction* feature_index = custom_call->operand(6); + CHECK(feature_index->IsConstant()); + int64 feature_index_value = feature_index->literal().Get({}); + + // BatchNormGrad returns a tuple of three elements: grad_data, grad_scale, + // grad_offset. 
+ auto output_grad_data = GetAllocationSlice(*custom_call, {0}); + auto output_grad_scale = GetAllocationSlice(*custom_call, {1}); + auto output_grad_offset = GetAllocationSlice(*custom_call, {2}); + AddThunkToThunkSequence(absl::make_unique( + /*operand=*/GetAllocationSlice(*custom_call->operand(0)), + /*scale=*/GetAllocationSlice(*custom_call->operand(1)), + /*mean=*/GetAllocationSlice(*custom_call->operand(2)), + /*inv_stddev=*/GetAllocationSlice(*custom_call->operand(3)), + /*grad_output=*/GetAllocationSlice(*custom_call->operand(4)), + /*epsilon=*/epsilon_value, + /*feature_index=*/feature_index_value, + /*output_grad_data=*/output_grad_data, + /*output_grad_scale=*/output_grad_scale, + /*output_grad_offset=*/output_grad_offset, + /*output_tuple=*/GetAllocationSlice(*custom_call), + /*hlo=*/custom_call)); + return Status::OK(); + } + + if (IsCustomCallToDnnConvolution(*custom_call)) { + std::vector operand_slices; + operand_slices.reserve(custom_call->operand_count()); + for (const auto* operand : custom_call->operands()) { + operand_slices.push_back(GetAllocationSlice(*operand)); + } + auto tuple_result_slice = GetAllocationSlice(*custom_call); + auto conv_result_slice = GetAllocationSlice(*custom_call, {0}); + auto scratch_slice = GetAllocationSlice(*custom_call, {1}); + + AddThunkToThunkSequence(absl::make_unique( + Cast(custom_call), std::move(operand_slices), + conv_result_slice, scratch_slice, tuple_result_slice)); + return Status::OK(); + } + + if (custom_call->custom_call_target() == kCusolverCholeskyCallTarget) { + TF_ASSIGN_OR_RETURN(CholeskyOptions options, + custom_call->backend_config()); + + const Shape& shape = custom_call->operand(0)->shape(); + int ndim = shape.dimensions_size(); + CHECK_GE(ndim, 2); + int64 n = shape.dimensions(ndim - 1); + + const auto& dims = shape.dimensions(); + int64 batch_size = std::accumulate(dims.begin(), dims.end() - 2, int64{1}, + [](int64 a, int64 b) { return a * b; }); + + auto operand_buffer = GetAllocationSlice(*custom_call->operand(0)); + + auto a_buffer = GetAllocationSlice(*custom_call, {0}); + auto workspace_buffer = GetAllocationSlice(*custom_call, {1}); + auto info_buffer = GetAllocationSlice(*custom_call, {2}); + + std::vector> thunks; + + if (operand_buffer != a_buffer) { + thunks.push_back(absl::make_unique( + /*source_address=*/operand_buffer, + /*destination_buffer=*/a_buffer, + /*mem_size=*/ShapeUtil::ByteSizeOf(shape), custom_call)); + } + + thunks.push_back(absl::make_unique( + options, a_buffer, workspace_buffer, info_buffer, + custom_call->operand(0)->shape().element_type(), batch_size, n, + custom_call)); + + // Elide the sequential thunk if there's no copy. 
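The cholesky handling above, like the GEMM bias and triangular-solve cases, follows one recurring pattern: the library call works in place on the destination buffer, so a copy thunk is emitted only when operand and destination buffers differ, and the SequentialThunk wrapper is elided when a single thunk remains. A sketch of that pattern, with thunks modelled as closures instead of the real xla::gpu::Thunk classes:

#include <functional>
#include <iostream>
#include <vector>

using Thunk = std::function<void()>;

Thunk MakeInPlaceOpThunk() {
  return [] { std::cout << "run in-place library call\n"; };
}

Thunk EmitWithOptionalCopy(int source_buffer, int destination_buffer) {
  std::vector<Thunk> thunks;
  if (source_buffer != destination_buffer) {
    thunks.push_back([] { std::cout << "copy operand to destination\n"; });
  }
  thunks.push_back(MakeInPlaceOpThunk());

  if (thunks.size() == 1) return thunks[0];  // Elide the sequential wrapper.
  return [thunks] {
    for (const Thunk& t : thunks) t();       // Sequential execution.
  };
}

int main() {
  std::cout << "-- aliased buffers --\n";
  EmitWithOptionalCopy(/*source_buffer=*/7, /*destination_buffer=*/7)();
  std::cout << "-- distinct buffers --\n";
  EmitWithOptionalCopy(/*source_buffer=*/7, /*destination_buffer=*/8)();
}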
+ if (thunks.size() == 1) { + AddThunkToThunkSequence(std::move(thunks[0])); + } else { + AddThunkToThunkSequence( + absl::make_unique(std::move(thunks), custom_call)); + } + + return Status::OK(); + } + + if (IsCublasGemm(*custom_call)) { + AddThunkToThunkSequence(BuildGemmThunk(custom_call)); + return Status::OK(); + } + + if (void* call_target = CustomCallTargetRegistry::Global()->Lookup( + custom_call->custom_call_target(), platform()->Name())) { + auto get_slices_for_instr = [&](const HloInstruction* instr) { + ShapeTree slices(instr->shape()); + slices.ForEachMutableElement( + [&](const ShapeIndex& index, BufferAllocation::Slice* slice) { + StatusOr s = + MaybeGetAllocationSlice(*instr, index); + if (s.ok()) { + *slice = s.ValueOrDie(); + } + }); + return slices; + }; + std::vector> operand_slices; + for (const auto* operand : custom_call->operands()) { + operand_slices.push_back(get_slices_for_instr(operand)); + } + ShapeTree result_slices = + get_slices_for_instr(custom_call); + AddThunkToThunkSequence(absl::make_unique( + call_target, std::move(operand_slices), std::move(result_slices), + Cast(custom_call)->opaque(), custom_call)); + return Status::OK(); + } + + return Unimplemented("No registered implementation for custom call to \"%s\"", + custom_call->custom_call_target()); +} + +Status ThunkEmitter::HandleFft(HloInstruction* fft) { + TF_RET_CHECK( + LayoutUtil::IsMonotonicWithDim0Major(fft->operand(0)->shape().layout())); + TF_RET_CHECK(LayoutUtil::IsMonotonicWithDim0Major(fft->shape().layout())); + AddThunkToThunkSequence(BuildFftThunk(fft)); + return Status::OK(); +} + +Status ThunkEmitter::HandleTriangularSolve(HloInstruction* hlo) { + auto has_fortran_layout = [](const Layout& layout) { + int n = layout.minor_to_major_size(); + return layout.minor_to_major(0) == n - 2 && + layout.minor_to_major(1) == n - 1; + }; + TF_RET_CHECK(has_fortran_layout(hlo->operand(0)->shape().layout())); + TF_RET_CHECK(has_fortran_layout(hlo->operand(1)->shape().layout())); + TF_RET_CHECK(has_fortran_layout(hlo->shape().layout())); + + std::vector> thunks; + + // Triangular solve is in-place on 'b', so copy 'b' to the output if they + // aren't the same buffer. + auto operand_buffer = GetAllocationSlice(*hlo->operand(1)); + auto destination_buffer = GetAllocationSlice(*hlo); + if (operand_buffer != destination_buffer) { + thunks.push_back(absl::make_unique( + /*source_address=*/operand_buffer, + /*destination_buffer=*/destination_buffer, + /*mem_size=*/ShapeUtil::ByteSizeOf(hlo->operand(1)->shape()), hlo)); + } + + thunks.push_back(BuildTriangularSolveThunk(hlo)); + + // Elide the sequential thunk if there's no copy. + if (thunks.size() == 1) { + AddThunkToThunkSequence(std::move(thunks[0])); + } else { + AddThunkToThunkSequence( + absl::make_unique(std::move(thunks), hlo)); + } + return Status::OK(); +} + +Status ThunkEmitter::HandleInfeed(HloInstruction* infeed) { + AddThunkToThunkSequence(BuildInfeedThunk(infeed)); + return Status::OK(); +} + +Status ThunkEmitter::HandleOutfeed(HloInstruction* outfeed) { + AddThunkToThunkSequence(BuildOutfeedThunk(outfeed)); + return Status::OK(); +} + +} // namespace gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/thunk_emitter.h b/tensorflow/compiler/xla/service/gpu/thunk_emitter.h new file mode 100644 index 00000000000..55d92c74794 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/thunk_emitter.h @@ -0,0 +1,97 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. 
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_THUNK_EMITTER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_THUNK_EMITTER_H_
+
+#include "tensorflow/compiler/xla/service/buffer_assignment.h"
+#include "tensorflow/compiler/xla/service/gpu/thunk.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/types.h"
+
+namespace xla {
+namespace gpu {
+
+// Implements handling of GPU execution for HLO operations that are handed off
+// to specialized thunks that do not require code generation. Intended to be
+// mixed into GPU emitters.
+class ThunkEmitter {
+ public:
+  class EmissionContext {
+   public:
+    virtual void AddThunkToThunkSequence(std::unique_ptr<Thunk> thunk) = 0;
+    virtual StatusOr<BufferAllocation::Slice> MaybeGetAllocationSlice(
+        const HloInstruction& hlo, const ShapeIndex& index) const = 0;
+    virtual int64 ByteSizeOf(const Shape& shape) const = 0;
+    virtual const se::Platform* platform() const = 0;
+
+    virtual ~EmissionContext() = default;
+  };
+
+  explicit ThunkEmitter(EmissionContext* context) : context_(context) {}
+
+  Status HandleCustomCall(HloInstruction* custom_call);
+  Status HandleFft(HloInstruction* fft);
+  Status HandleTriangularSolve(HloInstruction* hlo);
+  Status HandleInfeed(HloInstruction* xla_infeed);
+  Status HandleOutfeed(HloInstruction* outfeed);
+
+ private:
+  EmissionContext* context_;
+
+  void AddThunkToThunkSequence(std::unique_ptr<Thunk> thunk) {
+    return context_->AddThunkToThunkSequence(std::move(thunk));
+  }
+
+  StatusOr<BufferAllocation::Slice> MaybeGetAllocationSlice(
+      const HloInstruction& hlo, const ShapeIndex& index) const {
+    return context_->MaybeGetAllocationSlice(hlo, index);
+  }
+
+  int64 ByteSizeOf(const Shape& shape) { return context_->ByteSizeOf(shape); }
+
+  const se::Platform* platform() const { return context_->platform(); }
+
+  BufferAllocation::Slice GetAllocationSlice(
+      const HloInstruction& hlo, const ShapeIndex& index = {}) const {
+    return MaybeGetAllocationSlice(hlo, index).ValueOrDie();
+  }
+
+  // Returns a FftThunk that calls cuFFT to implement `inst`.
+  std::unique_ptr<Thunk> BuildFftThunk(const HloInstruction* inst);
+
+  // Returns a CholeskyThunk that calls cuSolver to implement `inst`.
+  std::unique_ptr<Thunk> BuildCholeskyThunk(const HloInstruction* inst);
+
+  // Returns a TriangularSolveThunk that calls cuBlas to implement `inst`.
+  std::unique_ptr<Thunk> BuildTriangularSolveThunk(const HloInstruction* inst);
+
+  // Returns a GemmThunk that calls gemm to implement `inst`. The caller needs
+  // to make sure `inst` outlives the lifetime of the returned Thunk object.
+  std::unique_ptr<Thunk> BuildGemmThunk(const HloInstruction* inst);
+
+  // Returns an InfeedThunk that performs a host-to-device memcpy to implement
+  // `inst`.
+  std::unique_ptr<Thunk> BuildInfeedThunk(const HloInstruction* inst);
+
+  // Returns an OutfeedThunk that performs a device-to-host memcpy to implement
+  // `inst`.
+ std::unique_ptr BuildOutfeedThunk(const HloInstruction* inst); +}; + +} // namespace gpu +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_THUNK_EMITTER_H_ diff --git a/tensorflow/compiler/xla/service/heap_simulator.cc b/tensorflow/compiler/xla/service/heap_simulator.cc index 83894f17445..8d9ddb97d9e 100644 --- a/tensorflow/compiler/xla/service/heap_simulator.cc +++ b/tensorflow/compiler/xla/service/heap_simulator.cc @@ -22,6 +22,8 @@ limitations under the License. #include "absl/container/flat_hash_set.h" #include "absl/memory/memory.h" #include "tensorflow/compiler/xla/map_util.h" +#include "tensorflow/compiler/xla/service/hlo_live_range.h" +#include "tensorflow/compiler/xla/service/hlo_schedule.h" #include "tensorflow/compiler/xla/util.h" namespace xla { @@ -29,199 +31,6 @@ namespace xla { using absl::flat_hash_map; using absl::flat_hash_set; -namespace { -// FlattenSchedule walks through the instruction, and recurse into each called -// computations. As it walks it also tracks down the ordinal number of each -// instruction in the schedule and store it in the `instruction_schedule`. The -// end of each computation is tracked in `computation_schedule`. -int64 FlattenSchedule( - const HloComputation& computation, - const HloInstructionSequence& instruction_sequence, - const HloSchedule* schedule, int64 start_time, - absl::flat_hash_map* instruction_schedule, - absl::flat_hash_map* computation_schedule) { - int64 time = start_time; - for (const HloInstruction* instruction : - instruction_sequence.instructions()) { - if (schedule != nullptr) { - // Recurse into sub computations if we have a module-scoped schedule. - if (instruction->opcode() == HloOpcode::kCall || - instruction->opcode() == HloOpcode::kConditional) { - for (const HloComputation* called_computation : - instruction->called_computations()) { - const HloInstructionSequence& called_sequence = - schedule->sequence(called_computation); - time = - FlattenSchedule(*called_computation, called_sequence, schedule, - time, instruction_schedule, computation_schedule); - computation_schedule->insert({called_computation, time}); - } - } - if (instruction->opcode() == HloOpcode::kWhile) { - const HloInstructionSequence& condition_sequence = - schedule->sequence(instruction->while_condition()); - time = FlattenSchedule(*instruction->while_condition(), - condition_sequence, schedule, time, - instruction_schedule, computation_schedule); - computation_schedule->insert({instruction->while_condition(), time}); - const HloInstructionSequence& body_sequence = - schedule->sequence(instruction->while_body()); - time = - FlattenSchedule(*instruction->while_body(), body_sequence, schedule, - time, instruction_schedule, computation_schedule); - } - } - if (instruction_schedule->count(instruction) != 0) { - continue; - } - instruction_schedule->insert({instruction, time++}); - } - computation_schedule->insert({&computation, time}); - return time; -} - -// The aliased buffers could have overlapping live ranges. -// NormalizeAliasedBuffers normalizes the buffer such that each alias buffer has -// disjoint live range while keeping the live range union the same. This avoid -// double counting aliased buffer sizes. 
-// -// Before(buffer1 and 2 are aliased): -// -// +----+ live range of buffer1 -// +------------------+ live range of buffer2 -// -// After: -// -// +----------+ live range of buffer1 -// +------+ live range of buffer2 -// -// Before(buffer1 and 2 are aliased): -// -// +----------+ live range of buffer1 -// +------------+ live range of buffer2 -// -// After: -// -// +----------+ live range of buffer1 -// +------+ live range of buffer2 -// -// Before(buffer1 and 2 are aliased): -// -// +----------+ live range of buffer1 -// +---+ live range of buffer2 -// -// After(unchanged): -// -// +----------+ live range of buffer1 -// +---+ live range of buffer2 -// -// As another example, imagine we have the following code sequence with live -// ranges of each while-aliased buffers: -// -// a p1 p2 e b -// a = ... + -// | -// { | -// p1 = param | + -// ROOT true | | -// } | + -// { // body | -// p2 = param + + -// c = p2 + 1 + -// d = c + 1 -// ROOT e = d + 1 + -// } | -// | -// b = while (a) + + -// | -// f = b + 1 + -// -// After normalization it becomes: -// -// a p1 p2 e b -// a = ... + -// | -// { + -// p1 = param + -// ROOT true | -// } + -// { // body -// p2 = param + -// c = p2 + 1 + -// d = c + 1 -// ROOT e = d + 1 + -// } | -// | -// b = while (a) + -// + -// f = b + 1 + -// -// Note there is no overlap of live ranges after normalization. -void NormalizeAliasedBuffers( - absl::flat_hash_map* buffer_start_map, - absl::flat_hash_map* buffer_end_map, - const std::vector& values_to_assign, - const HloAliasAnalysis& alias_analysis) { - absl::flat_hash_set values_to_assign_set( - values_to_assign.begin(), values_to_assign.end()); - for (const HloBuffer& hlo_buffer : alias_analysis.buffers()) { - std::vector aliased_buffers; - for (const HloValue* hlo_value : hlo_buffer.values()) { - if (values_to_assign_set.count(hlo_value) != 0) { - aliased_buffers.push_back(hlo_value); - CHECK_NE(buffer_start_map->count(hlo_value), 0); - CHECK_NE(buffer_end_map->count(hlo_value), 0); - } - } - absl::c_sort( - aliased_buffers, [&](const HloValue* value1, const HloValue* value2) { - if ((*buffer_start_map)[value1] != (*buffer_start_map)[value2]) { - return (*buffer_start_map)[value1] < (*buffer_start_map)[value2]; - } - return (*buffer_end_map)[value1] < (*buffer_end_map)[value2]; - }); - - for (int64 i = 0; i < aliased_buffers.size(); ++i) { - // We can't use aliased_buffers.size() - 1 since aliased_buffers.size() is - // an unsigned integer and can be 0. - if (i + 1 == aliased_buffers.size()) { - break; - } - - const HloValue* value1 = aliased_buffers[i]; - const HloValue* value2 = aliased_buffers[i + 1]; - if ((*buffer_start_map)[value1] == (*buffer_start_map)[value2]) { - // If value1 has the same start time as value2, make value1 disappear by - // setting the end time same as start time: - // - // Before: - // +----+ value1 - // +----------+ value2 - // - // After: - // + value1 - // +----------+ value2 - // - // Note that only when heap simulator runs before copy insertion can - // this happen where one instruction defines multiple aliased buffers -- - // This is illegle to execute and can be fixed by copy insertion later. 
- (*buffer_end_map)[value1] = (*buffer_start_map)[value1]; - continue; - } - - if ((*buffer_end_map)[value1] < (*buffer_start_map)[value2]) { - continue; - } - - if ((*buffer_end_map)[value1] > (*buffer_end_map)[value2]) { - (*buffer_end_map)[value2] = (*buffer_end_map)[value1]; - } - (*buffer_end_map)[value1] = (*buffer_start_map)[value2] - 1; - } - } -} -} // namespace - /*static*/ StatusOr HeapSimulator::MinimumMemoryForModule( const HloSchedule& schedule, @@ -283,8 +92,12 @@ StatusOr HeapSimulator::Run( const HloComputation* entry_computation = module.entry_computation(); const HloInstructionSequence& instruction_sequence = schedule.sequence(entry_computation); + TF_ASSIGN_OR_RETURN( + std::unique_ptr hlo_live_range, + HloLiveRange::Run(schedule, alias_analysis, entry_computation)); TF_RETURN_IF_ERROR(heap.RunComputation(*entry_computation, - instruction_sequence, alias_analysis)); + instruction_sequence, alias_analysis, + hlo_live_range.get())); return heap.Finish(); } @@ -298,8 +111,13 @@ StatusOr HeapSimulator::Run( memory_by_computation) { HeapSimulator heap(std::move(algorithm), size_fn, options, /*schedule=*/nullptr, memory_by_computation); - TF_RETURN_IF_ERROR( - heap.RunComputation(computation, instruction_sequence, alias_analysis)); + HloSchedule schedule(computation.parent()); + schedule.set_sequence(&computation, instruction_sequence); + TF_ASSIGN_OR_RETURN(std::unique_ptr hlo_live_range, + HloLiveRange::Run(schedule, alias_analysis, &computation, + /*module_scoped_analysis=*/false)); + TF_RETURN_IF_ERROR(heap.RunComputation(computation, instruction_sequence, + alias_analysis, hlo_live_range.get())); return heap.Finish(); } @@ -312,8 +130,11 @@ StatusOr HeapSimulator::Run( const Options& options) { HeapSimulator heap(std::move(algorithm), size_fn, options, /*schedule=*/schedule, nullptr); - TF_RETURN_IF_ERROR( - heap.RunComputation(computation, instruction_sequence, alias_analysis)); + TF_ASSIGN_OR_RETURN( + std::unique_ptr hlo_live_range, + HloLiveRange::Run(*schedule, alias_analysis, &computation)); + TF_RETURN_IF_ERROR(heap.RunComputation(computation, instruction_sequence, + alias_analysis, hlo_live_range.get())); return heap.Finish(); } @@ -322,36 +143,24 @@ StatusOr HeapSimulator::Run( Status HeapSimulator::RunComputation( const HloComputation& computation, const HloInstructionSequence& instruction_sequence, - const HloAliasAnalysis& alias_analysis) { + const HloAliasAnalysis& alias_analysis, HloLiveRange* hlo_live_range) { XLA_VLOG_LINES(1, computation.parent()->ToString()); XLA_VLOG_LINES(2, computation.ToString()); + VLOG(1) << hlo_live_range->ToString(); + HloDataflowAnalysis& dataflow_analysis = alias_analysis.dataflow_analysis(); - // instruction_schedule and computation_schedule are the maps that track each - // instruction/computation and their ordinal in the schedule. - absl::flat_hash_map instruction_schedule; - absl::flat_hash_map computation_schedule; - - // program_end_time is the time of the last instruction scheduled. It is equal - // to the number of instructions in a computation. - int64 program_end_time = - FlattenSchedule(computation, instruction_sequence, schedule_, 0, - &instruction_schedule, &computation_schedule); - - VLOG(1) << "Program end time: " << program_end_time; - - // We track the definition and free events for each buffer, then we go through - // each step and reply those events in program order. 
- absl::flat_hash_map buffer_start_map; - absl::flat_hash_map buffer_end_map; + algorithm_->SetSchedules(&hlo_live_range->flattened_instruction_sequence(), + &hlo_live_range->instruction_schedule()); // Record the buffer define/free event for each time step. We free all // remaining buffers (entry parameter, etc) after the program has finished // running, so we set the size of to program_end_time + 1. - std::vector> buffers_defined(program_end_time + - 1); - std::vector> buffers_freed(program_end_time + 1); + std::vector> buffers_defined( + hlo_live_range->schedule_end_time() + 1); + std::vector> buffers_freed( + hlo_live_range->schedule_end_time() + 1); // values_to_assign tracks the HloValues that we need to assign a buffer to. // Note that we only need to assign a buffer to a value when both of the @@ -364,106 +173,49 @@ Status HeapSimulator::RunComputation( // - If the instruction is in a nested call of the current computation, only // assign a buffer if we are doing global heap simulation. std::vector values_to_assign; + values_to_assign.reserve(dataflow_analysis.values().size()); - // Keeps track of buffer start time and buffer end time. for (const HloValue* value : dataflow_analysis.values()) { - // Ignore buffers that are not defined. - if (instruction_schedule.count(value->defining_instruction()) == 0) { + // Ignore buffers that are not tracked. + if (hlo_live_range->instruction_schedule().count( + value->defining_instruction()) == 0) { continue; } if (IgnoreBuffer(value)) { continue; } values_to_assign.push_back(value); - int64 buffer_start_time = instruction_schedule[value->instruction()]; - - int64 buffer_end_time = -1; - // A buffer's live range ends when the last user finishes executing. - for (const HloUse& use : value->uses()) { - const HloInstruction* used = use.instruction; - // As an optimization, we deem a while's init value's live range ends as - // soon as the loop body starts. This optimization is only applicable to - // the whole module simulation. - if (schedule_ != nullptr && used->opcode() == HloOpcode::kWhile) { - // The current live range is at the end of the while, move it to the - // beginning of the body. - used = used->while_body()->parameter_instruction(0); - VLOG(1) << "Moved value " << value->ToShortString() - << " to while param: " << used->ToString(); - } - if (instruction_schedule.count(used) == 0) { - // We didn't track the instruction `used`. This happens when we do - // computation scope (versus module scope) heap simulation and when the - // used instruction is outside of the computation being simulated. - continue; - } - buffer_end_time = std::max(buffer_end_time, instruction_schedule[used]); - } - - if (buffer_end_time == -1) { - buffer_end_time = buffer_start_time; - } - - for (const HloPosition& position : value->positions()) { - const HloComputation* position_comp = position.instruction->parent(); - // If this instruction lives out, the live range of the instruction should - // be extended to the end of the computation. - if (position.instruction == position_comp->root_instruction()) { - if (schedule_ == nullptr && &computation != position_comp) { - continue; - } - if (computation_schedule.count(position_comp) == 0) { - continue; - } - buffer_end_time = - std::max(buffer_end_time, computation_schedule[position_comp]); - } - } - - // Entry parameters live across whole computation. 
- if (value->instruction()->opcode() == HloOpcode::kParameter && - value->instruction()->parent() == - computation.parent()->entry_computation()) { - buffer_end_time = program_end_time; - } - - CHECK(buffer_start_time <= buffer_end_time); - - buffer_start_map[value] = buffer_start_time; - buffer_end_map[value] = buffer_end_time; } - NormalizeAliasedBuffers(&buffer_start_map, &buffer_end_map, values_to_assign, - alias_analysis); + auto& buffer_live_ranges = hlo_live_range->buffer_live_ranges(); absl::c_sort(values_to_assign, [&](const HloValue* value1, const HloValue* value2) { - if (buffer_start_map[value1] != buffer_start_map[value2]) { - return buffer_start_map[value1] < buffer_start_map[value2]; - } - - if (buffer_end_map[value1] != buffer_end_map[value2]) { - return buffer_end_map[value1] < buffer_end_map[value2]; - } - return value1->id() < value2->id(); + const auto& live_range1 = buffer_live_ranges.at(value1); + const auto& live_range2 = buffer_live_ranges.at(value2); + return std::forward_as_tuple(live_range1.start, + live_range1.end, value1->id()) < + std::forward_as_tuple(live_range2.start, + live_range2.end, value2->id()); }); // For each value that we need to assign a buffer to, add the define and free // events. for (const HloValue* value : values_to_assign) { - buffers_defined[buffer_start_map[value]].push_back(value); - buffers_freed[buffer_end_map[value]].push_back(value); + auto live_range = buffer_live_ranges.at(value); + buffers_defined[live_range.start].push_back(value); + buffers_freed[live_range.end].push_back(value); } // All HloValues in a hlo buffer should be allocated to the same address. This // map tracks the first value that got allocated in a buffer. absl::flat_hash_map first_allocated_value; - VLOG(1) << "Program time" << program_end_time; + VLOG(1) << "Program time" << hlo_live_range->schedule_end_time(); // Go through each step in the program and replay each buffer define and free // events. - for (int64 i = 0; i < program_end_time + 1; ++i) { + for (int64 i = 0; i < hlo_live_range->schedule_end_time() + 1; ++i) { VLOG(1) << "Time step: " << i; for (const HloValue* value : buffers_defined[i]) { @@ -495,11 +247,21 @@ Status HeapSimulator::RunComputation( if (operand_buffer->values().size() > 1) { continue; } - if (buffer_end_map.count(operand_value) == 0) { + auto it = buffer_live_ranges.find(operand_value); + if (it == buffer_live_ranges.end()) { continue; } + + auto& operand_live_range = it->second; + + auto& user_live_range = buffer_live_ranges[value]; + // Can only share buffers that are about to be freed. - if (buffer_end_map[operand_value] != i) { + if (operand_live_range.end != i) { + continue; + } + + if (IgnoreBuffer(operand_value)) { continue; } @@ -522,7 +284,7 @@ Status HeapSimulator::RunComputation( ShareBuffer(value, operand_value, value->instruction()); // The live range of the operand buffer is now extended to the end // of the current instruction. - buffer_end_map[operand_value] = buffer_end_map[value]; + operand_live_range.end = user_live_range.end; VLOG(1) << "Sharing " << value->ToShortString() << " with " << operand_value->ToShortString() << ", size:" << size_fn_(*value); @@ -866,29 +628,27 @@ GlobalDecreasingSizeBestFitHeap::GetSortedBufferIntervals() const { // start of the first buffer and the end of the last co-located // buffer. There could be "holes" in the live ranges of each co-located // buffers, but in this heuristics we think they are contiguous. 
- absl::c_sort(sorted_buffer_intervals, - [&](const BufferInterval& x, const BufferInterval& y) { - int64 x_end = x.end; - for (auto colocation : GetTransitiveColocations(x)) { - x_end = - std::max(x_end, buffer_intervals_.at(colocation).end); - } + absl::c_sort(sorted_buffer_intervals, [&](const BufferInterval& x, + const BufferInterval& y) { + int64 x_end = x.end; + for (auto colocation : GetTransitiveColocations(x)) { + x_end = std::max(x_end, buffer_intervals_.at(colocation).end); + } - int64 y_end = y.end; - for (auto colocation : GetTransitiveColocations(y)) { - y_end = - std::max(y_end, buffer_intervals_.at(colocation).end); - } + int64 y_end = y.end; + for (auto colocation : GetTransitiveColocations(y)) { + y_end = std::max(y_end, buffer_intervals_.at(colocation).end); + } - if (x_end - x.start != y_end - y.start) { - return x_end - x.start > y_end - y.start; - } + if (x_end - x.start != y_end - y.start) { + return x_end - x.start > y_end - y.start; + } - if (x.size != y.size) { - return x.size > y.size; - } - return x.buffer->id() < y.buffer->id(); - }); + if (x.size != y.size) { + return x.size > y.size; + } + return x.buffer->id() < y.buffer->id(); + }); } else { // Sort by spatial size. We don't look at co-locates as they should have the // same size. @@ -910,8 +670,8 @@ GlobalDecreasingSizeBestFitHeap::GetSortedBufferIntervals() const { GlobalDecreasingSizeBestFitHeap::ChunkCandidate GlobalDecreasingSizeBestFitHeap::FindChunkCandidate( - const GlobalDecreasingSizeBestFitHeap::BufferInterval& buffer_interval) - const { + const GlobalDecreasingSizeBestFitHeap::BufferInterval& buffer_interval, + int64 preferred_offset) const { VLOG(1) << "Finding chunks for buffer: " << buffer_interval.buffer->ToString(); VLOG(1) << "Size " << buffer_interval.size << ", start " @@ -960,7 +720,16 @@ GlobalDecreasingSizeBestFitHeap::FindChunkCandidate( return; } - if (free_size < min_fit_chunk.size) { + // If a preferred offset is provided, pick that offset. + if (free_offset <= preferred_offset && + free_offset + free_size >= preferred_offset + buffer_interval.size) { + min_fit_chunk = {preferred_offset, buffer_interval.size}; + } + + // Pick the min-fit chunk only if we didn't have a preferred offset or a + // chunk at the preferred offset hasn't been found. + if ((preferred_offset < 0 || min_fit_chunk.offset != preferred_offset) && + free_size < min_fit_chunk.size) { min_fit_chunk = {free_offset, free_size}; } }; @@ -973,6 +742,12 @@ GlobalDecreasingSizeBestFitHeap::FindChunkCandidate( offset = std::max(offset, RoundUpToNearest(chunk.chunk_end(), alignment_)); } use_free_chunk_if_smaller(offset, result_.heap_size - offset); + // When preferred offset is provided and the preferred offset is larger than + // the current heap size, simply use the preferred offset provided. + if (result_.heap_size <= preferred_offset) { + chunk_candidate.heap_size = preferred_offset + buffer_interval.size; + min_fit_chunk = {preferred_offset, buffer_interval.size}; + } if (min_fit_chunk.offset == -1) { // Increase the heap size to fit in the last free chunk. 
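The new preferred_offset path above lets a caller ask for a specific placement and fall back to the normal best-fit search when that offset is not available. A minimal call sketch from inside an algorithm derived from GlobalDecreasingSizeBestFitHeap; the `interval` value and the `max_heap_size` bound are illustrative and not taken from this patch:

  // Try to place `interval` at offset 1024, falling back to best fit.
  ChunkCandidate candidate =
      FindChunkCandidate(interval, /*preferred_offset=*/1024);
  if (candidate.heap_size <= max_heap_size) {  // max_heap_size is assumed.
    CommitChunk(interval, candidate);
  }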
@@ -993,16 +768,18 @@ void GlobalDecreasingSizeBestFitHeap::CommitChunk( interval_tree_.Add(buffer_interval.start, buffer_interval.end, chunk_candidate.chunk); for (auto colocation : GetTransitiveColocations(buffer_interval)) { - const auto emplace_result = - result_.chunk_map.emplace(colocation, chunk_candidate.chunk); - DCHECK(emplace_result.second); + AddToChunkMap(colocation, chunk_candidate.chunk); auto colocation_interval = buffer_intervals_[colocation]; interval_tree_.Add(colocation_interval.start, colocation_interval.end, chunk_candidate.chunk); } - const auto emplace_result = - result_.chunk_map.emplace(buffer_interval.buffer, chunk_candidate.chunk); + AddToChunkMap(buffer_interval.buffer, chunk_candidate.chunk); +} + +void GlobalDecreasingSizeBestFitHeap::AddToChunkMap(const HloValue* buffer, + Chunk chunk) { + const auto emplace_result = result_.chunk_map.emplace(buffer, chunk); DCHECK(emplace_result.second); } diff --git a/tensorflow/compiler/xla/service/heap_simulator.h b/tensorflow/compiler/xla/service/heap_simulator.h index 4d6de377813..00a748fc1e1 100644 --- a/tensorflow/compiler/xla/service/heap_simulator.h +++ b/tensorflow/compiler/xla/service/heap_simulator.h @@ -31,6 +31,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_live_range.h" #include "tensorflow/compiler/xla/service/hlo_ordering.h" #include "tensorflow/compiler/xla/service/hlo_schedule.h" #include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h" @@ -165,7 +166,8 @@ class HeapSimulator { Status RunComputation(const HloComputation& computation, const HloInstructionSequence& instruction_sequence, - const HloAliasAnalysis& alias_analysis); + const HloAliasAnalysis& alias_analysis, + HloLiveRange* live_range); bool IgnoreBuffer(const HloValue* buffer) const; void Alloc(const HloValue* buffer, const HloInstruction* instruction); @@ -255,6 +257,22 @@ class HeapAlgorithm { // Finish collects the buffer offset assignment results. Free may only be // called once, after the Alloc and Free calls. virtual Result Finish() = 0; + + // Heap algorithms can optionally make use of the instruction/computation + // schedule. These data structures are guaranteed to be valid while Finish() + // is being called. + virtual void SetSchedules( + const HloInstructionSequence* flattened_instruction_sequence, + const absl::flat_hash_map* + instruction_schedule) { + flattened_instruction_sequence_ = flattened_instruction_sequence; + instruction_schedule_ = instruction_schedule; + } + + protected: + const HloInstructionSequence* flattened_instruction_sequence_; + const absl::flat_hash_map* + instruction_schedule_; }; // NoFragmentationStatsHeap computes the heap size assuming no fragmentation; @@ -370,19 +388,24 @@ class GlobalDecreasingSizeBestFitHeap : public HeapAlgorithm { // These two methods below are exposed to other heap algorithms that inherit // from this class. The Finish() method tries to find a candidate chunk for - // each BufferInterval, after calling GetSortedBufferIntervals. The - // ChunkCandidate returns the chunk and the final heap size if it chunk is to - // be committed. The Finish() method can then call CommitChunk to associate - // the chunk with the BufferInterval, if the final heap size is within the - // limits. 
- ChunkCandidate FindChunkCandidate( - const BufferInterval& buffer_interval) const; + // each BufferInterval, after calling GetSortedBufferIntervals. If a + // non-negative preferred_offset is provided, FindChunkCandidate attempts + // finding a chunk at this offset. The ChunkCandidate returns the chunk and + // the final heap size if it chunk is to be committed. The Finish() method can + // then call CommitChunk to associate the chunk with the BufferInterval, if + // the final heap size is within the limits. + ChunkCandidate FindChunkCandidate(const BufferInterval& buffer_interval, + int64 preferred_offset = -1) const; void CommitChunk(const BufferInterval& buffer_interval, ChunkCandidate chunk_candidate); + // Adds the buffer and the chunk to the result chunk map. + virtual void AddToChunkMap(const HloValue* buffer, Chunk chunk); + + absl::flat_hash_map buffer_intervals_; + Result result_; private: int64 alignment_; - Result result_; Type type_; // The current time represented as an integer. It increments by 1 at each @@ -396,7 +419,6 @@ class GlobalDecreasingSizeBestFitHeap : public HeapAlgorithm { // returns all three of them. absl::flat_hash_set GetTransitiveColocations( const BufferInterval& interval) const; - absl::flat_hash_map buffer_intervals_; }; // A heap algorithm that chooses the best results from other algorithms added to diff --git a/tensorflow/compiler/xla/service/heap_simulator_test.cc b/tensorflow/compiler/xla/service/heap_simulator_test.cc index 4f7daa84782..80a047142b4 100644 --- a/tensorflow/compiler/xla/service/heap_simulator_test.cc +++ b/tensorflow/compiler/xla/service/heap_simulator_test.cc @@ -442,8 +442,8 @@ TEST_F(HeapSimulatorTest, MultiplyAdd) { tracker.ExpectCallSequence({ {kAlloc, tracker.BufferAt(paramA, {})}, {kAlloc, tracker.BufferAt(paramX, {})}, - {kAlloc, tracker.BufferAt(mul, {})}, {kAlloc, tracker.BufferAt(paramY, {})}, + {kAlloc, tracker.BufferAt(mul, {})}, {kFree, tracker.BufferAt(mul, {})}, {kShare, tracker.BufferAt(add, {})}, // All params and outputs are freed at the end. @@ -516,8 +516,8 @@ TEST_F(HeapSimulatorTest, MultiplyDot) { tracker.ExpectCallSequence({ {kAlloc, tracker.BufferAt(paramA, {})}, {kAlloc, tracker.BufferAt(paramX, {})}, - {kAlloc, tracker.BufferAt(mul, {})}, {kAlloc, tracker.BufferAt(paramY, {})}, + {kAlloc, tracker.BufferAt(mul, {})}, {kAlloc, tracker.BufferAt(dot, {})}, // All params and outputs are freed at the end. 
{kFree, tracker.BufferAt(mul, {})}, @@ -554,8 +554,8 @@ TEST_F(HeapSimulatorTest, MultiplyDotAdd) { tracker.ExpectCallSequence({ {kAlloc, tracker.BufferAt(paramA, {})}, {kAlloc, tracker.BufferAt(paramX, {})}, - {kAlloc, tracker.BufferAt(mul, {})}, {kAlloc, tracker.BufferAt(paramY, {})}, + {kAlloc, tracker.BufferAt(mul, {})}, {kAlloc, tracker.BufferAt(dot, {})}, {kFree, tracker.BufferAt(mul, {})}, {kFree, tracker.BufferAt(dot, {})}, @@ -596,8 +596,8 @@ TEST_F(HeapSimulatorTest, MultiplyDotDot) { tracker.ExpectCallSequence({ {kAlloc, tracker.BufferAt(paramA, {})}, {kAlloc, tracker.BufferAt(paramX, {})}, - {kAlloc, tracker.BufferAt(mul, {})}, {kAlloc, tracker.BufferAt(paramY, {})}, + {kAlloc, tracker.BufferAt(mul, {})}, {kAlloc, tracker.BufferAt(dot0, {})}, {kFree, tracker.BufferAt(mul, {})}, // mul no longer used {kAlloc, tracker.BufferAt(dot1, {})}, @@ -640,8 +640,8 @@ TEST_F(HeapSimulatorTest, MultiplyDotDotTuple) { tracker.ExpectCallSequence({ {kAlloc, tracker.BufferAt(paramA, {})}, {kAlloc, tracker.BufferAt(paramX, {})}, - {kAlloc, tracker.BufferAt(mul, {})}, {kAlloc, tracker.BufferAt(paramY, {})}, + {kAlloc, tracker.BufferAt(mul, {})}, {kAlloc, tracker.BufferAt(dot0, {})}, {kFree, tracker.BufferAt(mul, {})}, // mul no longer used {kAlloc, tracker.BufferAt(dot1, {})}, diff --git a/tensorflow/compiler/xla/service/hlo.proto b/tensorflow/compiler/xla/service/hlo.proto index 331bbcb7836..61e562c7eda 100644 --- a/tensorflow/compiler/xla/service/hlo.proto +++ b/tensorflow/compiler/xla/service/hlo.proto @@ -35,7 +35,7 @@ import "tensorflow/compiler/xla/xla_data.proto"; option cc_enable_arenas = true; // Serialization of HloInstruction. -// Next ID: 67 +// Next ID: 69 message HloInstructionProto { reserved 10; reserved "parameter_name"; @@ -230,6 +230,13 @@ message HloInstructionProto { // The delta value for kRngGetAndUpdateState. int64 delta = 66; + + // Specifies if the gather/scatter indices are guaranteed to be sorted by the + // caller. + bool indices_are_sorted = 67; + + // Frontend attributes to pass to the XLA backend. + xla.FrontendAttributes frontend_attributes = 68; } // Serialization of HloComputation. 
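Both new fields are plumbed through the HloInstruction API later in this diff (set_frontend_attributes / frontend_attributes, and the indices_are_sorted arguments to CreateGather and CreateScatter). A minimal sketch of tagging an instruction with a frontend attribute; the key, value, and `instr` pointer are illustrative only:

  xla::FrontendAttributes attrs;
  (*attrs.mutable_map())["placement_hint"] = "fast_memory";  // assumed key/value
  instr->set_frontend_attributes(attrs);
  // The map round-trips through HloInstructionProto.frontend_attributes
  // (field 68) and prints as frontend_attributes={placement_hint=fast_memory}.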
diff --git a/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc index 0c020daec30..1ef007cc817 100644 --- a/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc +++ b/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc @@ -1008,8 +1008,8 @@ TEST_F(HloAliasAnalysisTest, Bitcast) { auto builder = HloComputation::Builder(TestName()); auto constant = builder.AddInstruction( HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); - auto bitcast = builder.AddInstruction(HloInstruction::CreateUnary( - scalar_shape_, HloOpcode::kBitcast, constant)); + auto bitcast = builder.AddInstruction( + HloInstruction::CreateBitcast(scalar_shape_, constant)); module_->AddEntryComputation(builder.Build()); SCOPED_TRACE(module_->ToString()); @@ -1076,8 +1076,8 @@ TEST_F(HloAliasAnalysisTest, BitcastInterference) { auto builder = HloComputation::Builder(TestName()); auto constant = builder.AddInstruction( HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); - auto bitcast = builder.AddInstruction(HloInstruction::CreateUnary( - scalar_shape_, HloOpcode::kBitcast, constant)); + auto bitcast = builder.AddInstruction( + HloInstruction::CreateBitcast(scalar_shape_, constant)); builder.AddInstruction(HloInstruction::CreateTuple({constant, bitcast})); module_->AddEntryComputation(builder.Build()); diff --git a/tensorflow/compiler/xla/service/hlo_computation.cc b/tensorflow/compiler/xla/service/hlo_computation.cc index 639e853ada7..cbdada0b46b 100644 --- a/tensorflow/compiler/xla/service/hlo_computation.cc +++ b/tensorflow/compiler/xla/service/hlo_computation.cc @@ -532,11 +532,12 @@ string HloComputation::ToString( if (options.print_percent()) { s << "%"; } - s << name() << " "; + s << PrintName(name(), options.print_ids()) << " "; } if (options.print_program_shape()) { - s << ShapeUtil::HumanString(ComputeProgramShape()) << " "; + s << ShapeUtil::HumanString(ComputeProgramShape(options.print_ids())) + << " "; } s << "{\n"; { @@ -753,12 +754,13 @@ StatusOr HloComputation::DeepCopyInstructionWithCustomCopier( return DeepCopyHelper(instruction, &index, copy_leaf); } -ProgramShape HloComputation::ComputeProgramShape() const { +ProgramShape HloComputation::ComputeProgramShape(bool include_ids) const { ProgramShape program_shape; for (auto* param_instruction : param_instructions_) { *program_shape.add_parameters() = param_instruction->shape(); - *program_shape.add_parameter_names() = param_instruction->name(); + *program_shape.add_parameter_names() = + PrintName(param_instruction->name(), include_ids); } *program_shape.mutable_result() = root_instruction_->shape(); @@ -835,6 +837,18 @@ Status HloComputation::ReplaceInstruction(HloInstruction* old_instruction, if (new_instruction->metadata().op_name().empty()) { new_instruction->set_metadata(old_instruction->metadata()); } + if (new_instruction->frontend_attributes().map().empty()) { + new_instruction->set_frontend_attributes( + old_instruction->frontend_attributes()); + } + + // Like the metadata above, if the user didn't specify any sharding + // information on the new instruction we should copy the old sharding + // information (if any). 
+ if (!new_instruction->has_sharding()) { + new_instruction->set_sharding(old_instruction->sharding_ptr()); + } + TF_RETURN_IF_ERROR(old_instruction->ReplaceAllUsesWith(new_instruction)); return RemoveInstructionAndUnusedOperands(old_instruction); } @@ -856,25 +870,6 @@ std::vector HloComputation::CollectUnreachableRoots() const { return unreachable_roots; } -template -Status HloComputation::Accept( - DfsHloVisitorBase* visitor) const { - // Visit unreachable roots. Beware that the visitor might delete the currently - // visited root, which would invalidate iterators if the unreachable roots - // weren't computed ahead of time. - for (HloInstruction* root : CollectUnreachableRoots()) { - VLOG(3) << "Traversing unreachable root: " << root->ToString(); - // Call FinishVisit only at the end. - TF_RETURN_IF_ERROR(root->Accept(visitor, /*call_finish_visit=*/false)); - } - // Visit the computation root instruction last. - return root_instruction()->Accept(visitor, /*call_finish_visit=*/true); -} - -// Explicit instantiations. -template Status HloComputation::Accept(DfsHloVisitor* visitor) const; -template Status HloComputation::Accept(ConstDfsHloVisitor* visitor) const; - Status HloComputation::AcceptWithOperandOrder( DfsHloVisitor* visitor, const HloInstruction::CompareFunction& operand_order) const { @@ -891,42 +886,6 @@ Status HloComputation::AcceptWithOperandOrder( /*call_finish_visit=*/true); } -template -Status HloComputation::AcceptOrdered( - DfsHloVisitorBase* visitor, - absl::Span order) const { - VLOG(3) << "Accepting visitor with order."; - for (HloInstruction* root : CollectUnreachableRoots()) { - TF_RET_CHECK(absl::c_linear_search(order, root)) << root->ToString(); - } - TF_RET_CHECK(order.size() == instruction_count()); - absl::flat_hash_set visited; - for (const HloInstruction* instruction : order) { - VLOG(3) << "Visiting ordered: " << instruction->ToString(); - TF_RET_CHECK(instruction_iterators_.contains(instruction)) - << "Instruction " << instruction->name() << " is not in computation " - << name(); - TF_RET_CHECK(!visited.contains(instruction)) - << "Instruction " << instruction->name() - << " appears more than once in order"; - HloInstruction* mutable_instruction = - const_cast(instruction); - TF_RETURN_IF_ERROR(visitor->Preprocess(mutable_instruction)); - TF_RETURN_IF_ERROR(mutable_instruction->Visit(visitor)); - visitor->SetVisited(*mutable_instruction); - TF_RETURN_IF_ERROR(visitor->Postprocess(mutable_instruction)); - visited.insert(instruction); - } - TF_RETURN_IF_ERROR(visitor->FinishVisit(root_instruction())); - return Status::OK(); -} - -// Explicit instantiations. -template Status HloComputation::AcceptOrdered( - DfsHloVisitor*, absl::Span) const; -template Status HloComputation::AcceptOrdered( - ConstDfsHloVisitor*, absl::Span) const; - std::unique_ptr HloComputation::Clone( const string& suffix, HloCloneContext* context) { return CloneWithReplacements( diff --git a/tensorflow/compiler/xla/service/hlo_computation.h b/tensorflow/compiler/xla/service/hlo_computation.h index 111b28a8610..34ff957c876 100644 --- a/tensorflow/compiler/xla/service/hlo_computation.h +++ b/tensorflow/compiler/xla/service/hlo_computation.h @@ -288,7 +288,7 @@ class HloComputation { // Computes and returns the ProgramShape of this computation (shape of // parameters and result with layout). - ProgramShape ComputeProgramShape() const; + ProgramShape ComputeProgramShape(bool include_ids = true) const; // Return whether `*this` and `other` are functionally equivalent. 
bool Equal(const HloComputation& other, bool is_layout_sensitive) const; @@ -314,6 +314,8 @@ class HloComputation { // Replace old instruction with new instruction. Updates uses and root // instruction. Removes old instruction from computation. Precondition: // old_instruction and new_instruction must have the compatible shapes. + // If |new_instruction| doesn't have any sharding information it will + // recieve the sharding information of |old_instruction|. Status ReplaceInstruction(HloInstruction* old_instruction, HloInstruction* new_instruction); @@ -511,6 +513,61 @@ class HloComputation { TF_DISALLOW_COPY_AND_ASSIGN(HloComputation); }; +template +Status HloComputation::Accept( + DfsHloVisitorBase* visitor) const { + // Visit unreachable roots. Beware that the visitor might delete the currently + // visited root, which would invalidate iterators if the unreachable roots + // weren't computed ahead of time. + for (HloInstruction* root : CollectUnreachableRoots()) { + VLOG(3) << "Traversing unreachable root: " << root->ToString(); + // Call FinishVisit only at the end. + TF_RETURN_IF_ERROR(root->Accept(visitor, /*call_finish_visit=*/false)); + } + // Visit the computation root instruction last. + return root_instruction()->Accept(visitor, /*call_finish_visit=*/true); +} + +// Explicit instantiations. +template Status HloComputation::Accept(DfsHloVisitor* visitor) const; +template Status HloComputation::Accept(ConstDfsHloVisitor* visitor) const; + +template +Status HloComputation::AcceptOrdered( + DfsHloVisitorBase* visitor, + absl::Span order) const { + VLOG(3) << "Accepting visitor with order."; + for (HloInstruction* root : CollectUnreachableRoots()) { + TF_RET_CHECK(absl::c_linear_search(order, root)) << root->ToString(); + } + TF_RET_CHECK(order.size() == instruction_count()); + absl::flat_hash_set visited; + for (const HloInstruction* instruction : order) { + VLOG(3) << "Visiting ordered: " << instruction->ToString(); + TF_RET_CHECK(instruction_iterators_.contains(instruction)) + << "Instruction " << instruction->name() << " is not in computation " + << name(); + TF_RET_CHECK(!visited.contains(instruction)) + << "Instruction " << instruction->name() + << " appears more than once in order"; + HloInstruction* mutable_instruction = + const_cast(instruction); + TF_RETURN_IF_ERROR(visitor->Preprocess(mutable_instruction)); + TF_RETURN_IF_ERROR(mutable_instruction->Visit(visitor)); + visitor->SetVisited(*mutable_instruction); + TF_RETURN_IF_ERROR(visitor->Postprocess(mutable_instruction)); + visited.insert(instruction); + } + TF_RETURN_IF_ERROR(visitor->FinishVisit(root_instruction())); + return Status::OK(); +} + +// Explicit instantiations. +template Status HloComputation::AcceptOrdered( + DfsHloVisitor*, absl::Span) const; +template Status HloComputation::AcceptOrdered( + ConstDfsHloVisitor*, absl::Span) const; + } // namespace xla #endif // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_COMPUTATION_H_ diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc index 311b8a15504..90af8b1f487 100644 --- a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc +++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc @@ -154,6 +154,12 @@ int64 HloCostAnalysis::FusionParameterReadBytes( size += hlo == user->operand(0) ? GetShapeSize(user->shape()) : GetShapeSize(hlo->shape()); break; + case HloOpcode::kDynamicUpdateSlice: + // Uses the same shape as 'update' which is operand 1. + size += hlo == user->operand(0) + ? 
GetShapeSize(user->operand(1)->shape()) + : GetShapeSize(hlo->shape()); + break; case HloOpcode::kBroadcast: case HloOpcode::kReshape: size += GetShapeSize(hlo->shape()); @@ -699,7 +705,7 @@ Status HloCostAnalysis::HandleFusion(const HloInstruction* fusion) { if (fusion->fused_expression_root()->opcode() == HloOpcode::kDynamicUpdateSlice) { current_properties_[kBytesAccessedKey] += GetShapeSize( - fusion->fused_expression_root()->operand(0)->shape()); + fusion->fused_expression_root()->operand(1)->shape()); return; } } else if (shape_index.size() == 1) { @@ -710,7 +716,7 @@ Status HloCostAnalysis::HandleFusion(const HloInstruction* fusion) { current_properties_[kBytesAccessedKey] += GetShapeSize(fusion->fused_expression_root() ->operand(shape_index[0]) - ->operand(0) + ->operand(1) ->shape()); return; } diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc index 407dfe796d8..ed4bac22a9f 100644 --- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc +++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc @@ -1105,8 +1105,8 @@ TEST_P(HloDataflowAnalysisTest, BitcastDefinesValue) { auto builder = HloComputation::Builder(TestName()); auto constant = builder.AddInstruction( HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); - auto bitcast = builder.AddInstruction(HloInstruction::CreateUnary( - scalar_shape_, HloOpcode::kBitcast, constant)); + auto bitcast = builder.AddInstruction( + HloInstruction::CreateBitcast(scalar_shape_, constant)); module_->AddEntryComputation(builder.Build()); SCOPED_TRACE(module_->ToString()); diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.cc b/tensorflow/compiler/xla/service/hlo_evaluator.cc index a7e1d3a80d7..9a9898fdeee 100644 --- a/tensorflow/compiler/xla/service/hlo_evaluator.cc +++ b/tensorflow/compiler/xla/service/hlo_evaluator.cc @@ -1543,8 +1543,9 @@ class OutputBatchIndexToInputIndex { int64 index_vector_dim = dim_numbers_.index_vector_dim(); for (int64 i = 0, e = index_vector_.size(); i < e; i++) { index_vector_index_[index_vector_dim] = i; - TF_ASSIGN_OR_RETURN(index_vector_[i], - start_indices_.GetIntegralAsS64(index_vector_index_)); + // TODO(george): OK what should happen here? + // seems OK to crash though. 
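+      // Dereferencing the StatusOr directly CHECK-fails if the index element
+      // cannot be read as an integral value, which is the "crash" referred to
+      // above.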
+ index_vector_[i] = *start_indices_.GetIntegralAsS64(index_vector_index_); } return Status::OK(); } @@ -2295,12 +2296,10 @@ static StatusOr GenerateReduceOutputElement( } if (use_fast_add) { - TF_ASSIGN_OR_RETURN(double computed_result, - init_values[0]->GetAsDouble({})); + double computed_result = *init_values[0]->GetAsDouble({}); auto reduction_step = [&](absl::Span input_index) -> StatusOr { - TF_ASSIGN_OR_RETURN(double argument, - input_args[0]->GetAsDouble(input_index)); + double argument = *input_args[0]->GetAsDouble(input_index); computed_result += argument; return true; }; diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h index 9fcc6274866..9487d955f31 100644 --- a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h +++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h @@ -2035,8 +2035,8 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { int64 index_vector_dim = dim_numbers_.index_vector_dim(); for (int64 i = 0, e = index_vector_.size(); i < e; i++) { index_vector_index_[index_vector_dim] = i; - TF_ASSIGN_OR_RETURN(index_vector_[i], scatter_indices_.GetIntegralAsS64( - index_vector_index_)); + index_vector_[i] = + *scatter_indices_.GetIntegralAsS64(index_vector_index_); } return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/hlo_input_output_alias_config.cc b/tensorflow/compiler/xla/service/hlo_input_output_alias_config.cc index ad58bdb11b5..1c5b166a801 100644 --- a/tensorflow/compiler/xla/service/hlo_input_output_alias_config.cc +++ b/tensorflow/compiler/xla/service/hlo_input_output_alias_config.cc @@ -103,9 +103,13 @@ StatusOr HloInputOutputAliasConfig::CreateFromProto( return result; } +const Shape& HloInputOutputAliasConfig::shape() const { return alias_.shape(); } + string HloInputOutputAliasConfig::ToString() const { std::vector pieces; pieces.push_back("HloInputOutputAliasConfig"); + pieces.push_back( + absl::StrFormat(" Output shape: %s", alias_.shape().ToString())); ForEachAlias([&](const ShapeIndex& output_index, const Alias& alias) { const char* kind = alias.kind == AliasKind::kUserAlias ? "USER" : "SYSTEM"; diff --git a/tensorflow/compiler/xla/service/hlo_input_output_alias_config.h b/tensorflow/compiler/xla/service/hlo_input_output_alias_config.h index e80567abe0a..6bd34f8a127 100644 --- a/tensorflow/compiler/xla/service/hlo_input_output_alias_config.h +++ b/tensorflow/compiler/xla/service/hlo_input_output_alias_config.h @@ -117,6 +117,9 @@ class HloInputOutputAliasConfig { Status ForEachAliasWithStatus(AliasFnWithStatus fn) const; + // Returns the shape of the output of the alias config. 
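+  // (Equivalently, the shape of the underlying alias ShapeTree.)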
+ const Shape& shape() const; + string ToString() const; private: diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc old mode 100644 new mode 100755 index ddfcdcfd293..dabd7ab2836 --- a/tensorflow/compiler/xla/service/hlo_instruction.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction.cc @@ -550,7 +550,8 @@ StatusOr> HloInstruction::CreateFromProto( gather_slice_sizes.push_back(bound); } instruction = CreateGather(shape, operands(0), operands(1), - *gather_dimension_numbers, gather_slice_sizes); + *gather_dimension_numbers, gather_slice_sizes, + proto.indices_are_sorted()); break; } case HloOpcode::kScatter: { @@ -563,7 +564,8 @@ StatusOr> HloInstruction::CreateFromProto( absl::make_unique( proto.scatter_dimension_numbers()); instruction = CreateScatter(shape, operands(0), operands(1), operands(2), - computations(0), *scatter_dimension_numbers); + computations(0), *scatter_dimension_numbers, + proto.indices_are_sorted()); break; } case HloOpcode::kIota: @@ -672,6 +674,10 @@ StatusOr> HloInstruction::CreateFromProto( instruction->set_sharding(sharding); } + if (proto.has_frontend_attributes()) { + instruction->set_frontend_attributes(proto.frontend_attributes()); + } + return std::move(instruction); } @@ -1192,6 +1198,7 @@ HloInstruction::CreateBroadcastSequence( if (operand->has_sharding()) { broadcast->set_sharding(operand->sharding()); } + broadcast->set_frontend_attributes(operand->frontend_attributes()); return broadcast; } // Do explicit broadcast for degenerate broadcast. @@ -1217,6 +1224,7 @@ HloInstruction::CreateBroadcastSequence( if (operand->has_sharding()) { reshaped_operand->set_sharding(operand->sharding()); } + reshaped_operand->set_frontend_attributes(operand->frontend_attributes()); // Broadcast 'reshape' up to the larger size. 
auto broadcast = HloInstruction::CreateBroadcast( broadcast_shape, reshaped_operand, broadcast_dimensions); @@ -1224,6 +1232,7 @@ HloInstruction::CreateBroadcastSequence( if (operand->has_sharding()) { broadcast->set_sharding(operand->sharding()); } + broadcast->set_frontend_attributes(operand->frontend_attributes()); return broadcast; } @@ -1294,6 +1303,7 @@ void HloInstruction::SetupDerivedInstruction( derived_instruction->clear_sharding(); } derived_instruction->set_metadata(metadata_); + derived_instruction->set_frontend_attributes(frontend_attributes_); } bool HloInstruction::HasSideEffectNoRecurse() const { @@ -1372,19 +1382,21 @@ bool HloInstruction::HasSideEffect() const { /* static */ std::unique_ptr HloInstruction::CreateGather( const Shape& shape, HloInstruction* operand, HloInstruction* start_indices, const GatherDimensionNumbers& gather_dim_numbers, - absl::Span slice_sizes) { + absl::Span slice_sizes, bool indices_are_sorted) { return absl::make_unique( - shape, operand, start_indices, gather_dim_numbers, slice_sizes); + shape, operand, start_indices, gather_dim_numbers, slice_sizes, + indices_are_sorted); } /* static */ std::unique_ptr HloInstruction::CreateScatter( const Shape& shape, HloInstruction* operand, HloInstruction* scatter_indices, HloInstruction* updates, HloComputation* update_computation, - const ScatterDimensionNumbers& scatter_dim_numbers) { + const ScatterDimensionNumbers& scatter_dim_numbers, + bool indices_are_sorted) { return absl::make_unique( shape, operand, scatter_indices, updates, update_computation, - scatter_dim_numbers); + scatter_dim_numbers, indices_are_sorted); } /* static */ std::unique_ptr HloInstruction::CreateDomain( @@ -2179,10 +2191,20 @@ string HloInstruction::SignatureString() const { return StrCat("(", operands, ") -> ", ShapeUtil::HumanString(shape())); } +string PrintName(const string& name, bool print_ids) { + if (print_ids) { + return name; + } else { + auto dot_position = name.find_first_of("."); + return name.substr(0, dot_position); + } +} + namespace { -string PrintName(const string& name, const HloPrintOptions& options) { - return StrCat(options.print_percent() ? "%" : "", name); +string PrintNameInternal(const string& name, const HloPrintOptions& options) { + return StrCat(options.print_percent() ? "%" : "", + PrintName(name, options.print_ids())); } } // namespace @@ -2277,11 +2299,12 @@ string HloInstruction::ToStringWithCanonicalNameMap( // If we are canonicalizing instruction names and this is a top-level // HloInstruction::ToString() call, don't print an instruction name. StrAppend(&result, - PrintName(canonical_name_map->LookupOrInsert(name()), options), + PrintNameInternal(canonical_name_map->LookupOrInsert(name()), + options), " = "); } } else { - StrAppend(&result, PrintName(name(), options), " = "); + StrAppend(&result, PrintNameInternal(name(), options), " = "); } // Print shape. @@ -2347,10 +2370,10 @@ string HloInstruction::OperandsToStringWithCanonicalNameMap( // part of the canonical string. 
if (options.canonicalize_instruction_names() && options.is_in_nested_computation()) { - str.push_back(PrintName( + str.push_back(PrintNameInternal( canonical_name_map->LookupOrInsert(operand->name()), options)); } else if (options.print_operand_names()) { - str.push_back(PrintName(operand->name(), options)); + str.push_back(PrintNameInternal(operand->name(), options)); } StrAppend(out, StrJoin(str, " ")); }); @@ -2368,27 +2391,30 @@ std::vector HloInstruction::ExtraAttributesToString( if (options.print_subcomputation_mode() == HloPrintOptions::PrintSubcomputationMode::kNameOnly) { if (opcode() == HloOpcode::kWhile) { + extra.push_back(StrCat( + "condition=", PrintNameInternal(while_condition()->name(), options))); extra.push_back( - StrCat("condition=", PrintName(while_condition()->name(), options))); - extra.push_back( - StrCat("body=", PrintName(while_body()->name(), options))); + StrCat("body=", PrintNameInternal(while_body()->name(), options))); } else if (opcode() == HloOpcode::kSelectAndScatter) { - extra.push_back(StrCat("select=", PrintName(select()->name(), options))); extra.push_back( - StrCat("scatter=", PrintName(scatter()->name(), options))); + StrCat("select=", PrintNameInternal(select()->name(), options))); + extra.push_back( + StrCat("scatter=", PrintNameInternal(scatter()->name(), options))); } else if (opcode() == HloOpcode::kConditional) { if (operand(0)->shape().element_type() == PRED) { - extra.push_back(StrCat("true_computation=", - PrintName(true_computation()->name(), options))); + extra.push_back( + StrCat("true_computation=", + PrintNameInternal(true_computation()->name(), options))); extra.push_back( StrCat("false_computation=", - PrintName(false_computation()->name(), options))); + PrintNameInternal(false_computation()->name(), options))); } else { extra.push_back(StrCat( "branch_computations={", StrJoin(branch_computations(), ", ", [&](string* out, const HloComputation* computation) { - StrAppend(out, PrintName(computation->name(), options)); + StrAppend( + out, PrintNameInternal(computation->name(), options)); }), "}")); } @@ -2399,13 +2425,14 @@ std::vector HloInstruction::ExtraAttributesToString( opcode() == HloOpcode::kScatter || opcode() == HloOpcode::kSort) { extra.push_back( - StrCat("to_apply=", PrintName(to_apply()->name(), options))); + StrCat("to_apply=", PrintNameInternal(to_apply()->name(), options))); } else if (!called_computations().empty()) { extra.push_back(StrCat( "calls=", StrJoin(called_computations(), ", ", [&](string* out, const HloComputation* computation) { - StrAppend(out, PrintName(computation->name(), options)); + StrAppend(out, + PrintNameInternal(computation->name(), options)); }))); } } else if (options.print_subcomputation_mode() == @@ -2464,6 +2491,10 @@ std::vector HloInstruction::ExtraAttributesToString( if (has_sharding()) { extra.push_back(StrCat("sharding=", sharding().ToString())); } + if (!frontend_attributes_.map().empty()) { + extra.push_back(StrCat("frontend_attributes=", + FrontendAttributesToString(frontend_attributes_))); + } if (!outer_dimension_partitions_.empty()) { extra.push_back(absl::StrFormat("outer_dimension_partitions={%s}", StrJoin(outer_dimension_partitions_, ","))); @@ -2473,8 +2504,8 @@ std::vector HloInstruction::ExtraAttributesToString( extra.push_back(StrCat("control-predecessors={", StrJoin(control_predecessors_, ", ", [&](string* out, HloInstruction* pre) { - StrAppend(out, - PrintName(pre->name(), options)); + StrAppend(out, PrintNameInternal( + pre->name(), options)); }), "}")); } @@ -2524,6 
+2555,8 @@ HloInstructionProto HloInstruction::ToProto() const { } } + *proto.mutable_frontend_attributes() = frontend_attributes_; + return proto; } @@ -2573,6 +2606,9 @@ bool HloInstruction::IsFusible() const { switch (opcode_) { case HloOpcode::kDomain: case HloOpcode::kParameter: + case HloOpcode::kWhile: + case HloOpcode::kConditional: + case HloOpcode::kCall: return false; // Side effecting instrutions cannot be fused. default: @@ -3175,6 +3211,15 @@ StatusOr StringToFusionKind( return InvalidArgument("Unknown fusion kind: %s", kind_name); } +string FrontendAttributesToString( + const FrontendAttributes& frontend_attributes) { + std::vector> sorted_attributes( + frontend_attributes.map().begin(), frontend_attributes.map().end()); + absl::c_sort(sorted_attributes); + return absl::StrFormat( + "{%s}", absl::StrJoin(sorted_attributes, ",", absl::PairFormatter("="))); +} + string PaddingConfigToString(const PaddingConfig& padding) { bool has_interior_padding = absl::c_any_of(padding.dimensions(), @@ -3652,6 +3697,9 @@ int64 HloInstruction::feature_group_count() const { } void HloInstruction::set_feature_group_count(int64 feature_group_count) { + if (auto convolution = DynCast(this)) { + return convolution->set_feature_group_count(feature_group_count); + } Cast(this)->set_feature_group_count( feature_group_count); } @@ -3664,6 +3712,9 @@ int64 HloInstruction::batch_group_count() const { } void HloInstruction::set_batch_group_count(int64 batch_group_count) { + if (auto convolution = DynCast(this)) { + return convolution->set_batch_group_count(batch_group_count); + } Cast(this)->set_batch_group_count( batch_group_count); } diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h index fbaeb5d5f66..3119b52e377 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.h +++ b/tensorflow/compiler/xla/service/hlo_instruction.h @@ -63,6 +63,8 @@ namespace xla { class HloComputation; class HloModule; +string PrintName(const string& name, bool print_ids); + // A bunch of switches that control how the hlo text should be printed. class HloPrintOptions { public: @@ -88,7 +90,8 @@ class HloPrintOptions { print_control_dependencies_(true), canonicalize_instruction_names_(false), indent_amount_(0), - is_in_nested_computation_(false) {} + is_in_nested_computation_(false), + print_ids_(true) {} static HloPrintOptions ShortParsable() { return HloPrintOptions() @@ -118,6 +121,22 @@ class HloPrintOptions { .set_canonicalize_instruction_names(true); } + // Options to produce a fingerprint of an HLO. + static HloPrintOptions Fingerprint() { + return HloPrintOptions() + .set_print_subcomputation_mode(PrintSubcomputationMode::kNameOnly) + .set_print_metadata(false) + .set_print_backend_config(false) + .set_compact_operands(true) + .set_print_operand_names(false) + .set_print_operand_shape(true) + .set_print_program_shape(false) + .set_print_percent(false) + .set_print_control_dependencies(false) + .set_canonicalize_instruction_names(true) + .set_print_ids(false); + } + // If true, large constants will be printed out. HloPrintOptions& set_print_large_constants(bool value) { print_large_constants_ = value; @@ -154,6 +173,12 @@ class HloPrintOptions { return *this; } + // If true, all printed names include unique identifiers. + HloPrintOptions& set_print_ids(bool value) { + print_ids_ = value; + return *this; + } + // If true, program shape of hlo computations will be printed. 
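For context on how the new HloPrintOptions::Fingerprint() preset is meant to be consumed, here is a hedged sketch of deriving a module cache key from the id-free, canonicalized text. The FingerprintKey helper is illustrative and not part of this change; it only relies on HloModule::ToString(options) and tensorflow::Hash64.

```cpp
#include "tensorflow/compiler/xla/service/hlo_module.h"
#include "tensorflow/core/lib/hash/hash.h"

namespace xla {

// Illustrative helper: two modules that differ only in the numeric ".<id>"
// suffixes of instruction names stringify identically under Fingerprint()
// (print_ids=false, canonical names), so they map to the same key.
uint64 FingerprintKey(const HloModule& module) {
  return tensorflow::Hash64(module.ToString(HloPrintOptions::Fingerprint()));
}

}  // namespace xla
```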
HloPrintOptions& set_print_program_shape(bool value) { print_program_shape_ = value; @@ -216,6 +241,7 @@ class HloPrintOptions { bool include_layout_in_shapes() const { return include_layout_in_shapes_; } bool print_operand_shape() const { return print_operand_shape_; } bool print_operand_names() const { return print_operand_names_; } + bool print_ids() const { return print_ids_; } bool print_program_shape() const { return print_program_shape_; } bool print_percent() const { return print_percent_; } bool print_control_dependencies() const { @@ -242,6 +268,7 @@ class HloPrintOptions { bool canonicalize_instruction_names_; int indent_amount_; bool is_in_nested_computation_; + bool print_ids_; }; // For canonical string output, we need to have a canonical way to rename @@ -767,13 +794,14 @@ class HloInstruction { const Shape& shape, HloInstruction* operand, HloInstruction* start_indices, const GatherDimensionNumbers& gather_dim_numbers, - absl::Span slice_sizes); + absl::Span slice_sizes, bool indices_are_sorted); static std::unique_ptr CreateScatter( const Shape& shape, HloInstruction* operand, HloInstruction* scatter_indices, HloInstruction* updates, HloComputation* update_computation, - const ScatterDimensionNumbers& scatter_dim_numbers); + const ScatterDimensionNumbers& scatter_dim_numbers, + bool indices_are_sorted); // Creates a kDomain instruction which delimits an HLO domain which have // the provided user and operand side metadata. @@ -1357,6 +1385,14 @@ class HloInstruction { } Status set_backend_config(const tensorflow::protobuf::Message& proto); + void set_frontend_attributes(FrontendAttributes frontend_attributes) { + frontend_attributes_ = std::move(frontend_attributes); + } + + const FrontendAttributes& frontend_attributes() const { + return frontend_attributes_; + } + // Getter/setter for raw JSON-encoded backend config. Prefer the // functions above that deal in proto Messages where possible. const string& raw_backend_config_string() const { return backend_config_; } @@ -1851,6 +1887,18 @@ class HloInstruction { // HLO. See the documentation on backend_config(). string backend_config_; + // Attributes passed from the frontend to give hints to the backend about + // how to compile this HLO. + // HLO -> HLO transforms are expected to preserve these attributes on a + // "best effort" basis only. + // For example: + // x = const(10, frontend_attributes={x} + // y = const(10, frontend_attributes={y} + // z = add(x,y), frontend_attributes={y} + // Could be simplified to: + // z' = const(20), frontend_attributes={?} + FrontendAttributes frontend_attributes_; + // This field is assigned to true when backend_config_ is assigned to // a default configuration. bool is_default_config_ = false; @@ -1881,6 +1929,8 @@ StatusOr StringToFusionKind( // Custom (de)stringification functions for protos that live inside // HloInstruction. 
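To show how a frontend is expected to use the new attribute plumbing, here is a short sketch; the attribute key "example_hint", its value, and the parameter-building boilerplate are made up for illustration.

```cpp
#include <memory>

#include "tensorflow/compiler/xla/service/hlo_instruction.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/core/platform/logging.h"

namespace xla {

// Illustrative sketch: attach a frontend attribute to an instruction and read
// it back. The key/value pair is arbitrary, not one any backend is known to
// consume.
void FrontendAttributesSketch() {
  Shape shape = ShapeUtil::MakeShape(F32, {2, 2});
  std::unique_ptr<HloInstruction> param =
      HloInstruction::CreateParameter(0, shape, "p0");

  FrontendAttributes attrs;
  (*attrs.mutable_map())["example_hint"] = "value";
  param->set_frontend_attributes(attrs);

  // The attribute now shows up in ToString() output as
  // frontend_attributes={example_hint=value}, is serialized by ToProto(), and
  // is copied to derived instructions by SetupDerivedInstruction() on a
  // best-effort basis.
  CHECK_EQ(param->frontend_attributes().map().at("example_hint"), "value");
}

}  // namespace xla
```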
string PaddingConfigToString(const PaddingConfig& padding); +string FrontendAttributesToString( + const FrontendAttributes& frontend_attributes); string OpMetadataToString(const OpMetadata& metadata); string RandomDistributionToString(const RandomDistribution& distribution); string PrecisionToString(const PrecisionConfig::Precision& precision); diff --git a/tensorflow/compiler/xla/service/hlo_instruction_test.cc b/tensorflow/compiler/xla/service/hlo_instruction_test.cc index 80de1d5e0bc..0a50ed04af7 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction_test.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction_test.cc @@ -24,6 +24,7 @@ limitations under the License. #include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/protobuf_util.h" #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" +#include "tensorflow/compiler/xla/service/gpu/backend_configs.pb.h" #include "tensorflow/compiler/xla/service/hlo_casting_utils.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instructions.h" @@ -34,6 +35,7 @@ limitations under the License. #include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/window_util.h" +#include "tensorflow/core/lib/core/status_test_util.h" namespace xla { namespace { @@ -1440,7 +1442,8 @@ TEST_F(HloInstructionTest, StringifyGather_0) { /*collapsed_slice_dims=*/{}, /*start_index_map=*/{0, 1, 2, 3, 4}, /*index_vector_dim=*/4), - /*slice_sizes=*/{30, 29, 28, 27, 26})); + /*slice_sizes=*/{30, 29, 28, 27, 26}, + /*indices_are_sorted=*/false)); auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); @@ -1475,7 +1478,8 @@ TEST_F(HloInstructionTest, StringifyGather_1) { /*collapsed_slice_dims=*/{}, /*start_index_map=*/{0, 1, 2, 3, 4}, /*index_vector_dim=*/2), - /*slice_sizes=*/{30, 29, 28, 27, 26})); + /*slice_sizes=*/{30, 29, 28, 27, 26}, + /*indices_are_sorted=*/false)); auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); @@ -1524,7 +1528,8 @@ TEST_F(HloInstructionTest, StringifyScatter) { /*update_window_dims=*/{4, 5, 6, 7, 8}, /*inserted_window_dims=*/{}, /*scatter_dims_to_operand_dims=*/{0, 1, 2, 3, 4}, - /*index_vector_dim=*/2))); + /*index_vector_dim=*/2), + /*indices_are_sorted=*/false)); module->AddEntryComputation(builder.Build()); EXPECT_EQ( @@ -1956,5 +1961,26 @@ TEST_F(HloInstructionTest, GatherDoesNotReuseElements) { EXPECT_FALSE(root->ReusesOperandElements(1)); } +TEST_F(HloInstructionTest, BackendConfigCanContainNonFiniteFloats) { + HloComputation::Builder b(TestName()); + Shape shape = ShapeUtil::MakeShape(F32, {2, 2}); + auto p0 = b.AddInstruction(HloInstruction::CreateParameter(0, shape, "p0")); + DotDimensionNumbers dot_dnums; + dot_dnums.add_lhs_contracting_dimensions(1); + dot_dnums.add_rhs_contracting_dimensions(0); + auto dot = b.AddInstruction(HloInstruction::CreateDot( + shape, p0, p0, dot_dnums, DefaultPrecisionConfig(2))); + + gpu::GemmBackendConfig orig_config; + orig_config.set_alpha_real(std::numeric_limits::infinity()); + orig_config.set_alpha_imag(std::numeric_limits::quiet_NaN()); + TF_ASSERT_OK(dot->set_backend_config(orig_config)); + + TF_ASSERT_OK_AND_ASSIGN(auto new_config, + dot->backend_config()); + EXPECT_GT(new_config.alpha_real(), std::numeric_limits::max()); + EXPECT_NE(new_config.alpha_imag(), new_config.alpha_imag()); +} + } // namespace } // namespace xla diff --git 
a/tensorflow/compiler/xla/service/hlo_instructions.cc b/tensorflow/compiler/xla/service/hlo_instructions.cc index 52d8c7a43ce..183967941bf 100644 --- a/tensorflow/compiler/xla/service/hlo_instructions.cc +++ b/tensorflow/compiler/xla/service/hlo_instructions.cc @@ -1706,7 +1706,7 @@ bool HloRngInstruction::IdenticalSlowPath( const HloInstruction& other, const std::function& eq_computations) const { - return false; + return true; } std::unique_ptr HloRngInstruction::CloneWithNewOperandsImpl( @@ -1737,7 +1737,7 @@ HloInstructionProto HloParameterInstruction::ToProto() const { } std::vector HloParameterInstruction::ExtraAttributesToStringImpl( - const HloPrintOptions& /*options*/) const { + const HloPrintOptions& options) const { std::vector result; if (!parameter_replicated_at_leaf_buffers_) { return result; @@ -1746,8 +1746,10 @@ std::vector HloParameterInstruction::ExtraAttributesToStringImpl( for (bool replicated : *parameter_replicated_at_leaf_buffers_) { buffers_replicated_strs.push_back(replicated ? "true" : "false"); } - result.push_back(StrCat("parameter_replication={", - StrJoin(buffers_replicated_strs, ","), "}")); + if (options.print_ids()) { + result.push_back(StrCat("parameter_replication={", + StrJoin(buffers_replicated_strs, ","), "}")); + } return result; } @@ -2397,8 +2399,9 @@ HloDynamicSliceInstruction::CloneWithNewOperandsImpl( HloGatherInstruction::HloGatherInstruction( const Shape& shape, HloInstruction* operand, HloInstruction* start_indices, const GatherDimensionNumbers& gather_dim_numbers, - absl::Span slice_sizes) - : HloInstruction(HloOpcode::kGather, shape) { + absl::Span slice_sizes, bool indices_are_sorted) + : HloInstruction(HloOpcode::kGather, shape), + indices_are_sorted_(indices_are_sorted) { AppendOperand(operand); AppendOperand(start_indices); gather_dimension_numbers_ = @@ -2450,13 +2453,19 @@ HloInstructionProto HloGatherInstruction::ToProto() const { for (int64 bound : gather_slice_sizes()) { proto.add_gather_slice_sizes(bound); } + proto.set_indices_are_sorted(indices_are_sorted()); return proto; } std::vector HloGatherInstruction::ExtraAttributesToStringImpl( const HloPrintOptions& options) const { - return {GatherDimensionNumbersToString(gather_dimension_numbers()), - StrCat("slice_sizes={", StrJoin(gather_slice_sizes(), ","), "}")}; + std::vector attrs{ + GatherDimensionNumbersToString(gather_dimension_numbers()), + StrCat("slice_sizes={", StrJoin(gather_slice_sizes(), ","), "}")}; + if (indices_are_sorted()) { + attrs.push_back("indices_are_sorted=true"); + } + return attrs; } bool HloGatherInstruction::IdenticalSlowPath( @@ -2467,7 +2476,8 @@ bool HloGatherInstruction::IdenticalSlowPath( return protobuf_util::ProtobufEquals( gather_dimension_numbers(), casted_other.gather_dimension_numbers()) && - gather_slice_sizes() == casted_other.gather_slice_sizes(); + gather_slice_sizes() == casted_other.gather_slice_sizes() && + indices_are_sorted() == casted_other.indices_are_sorted(); } std::unique_ptr HloGatherInstruction::CloneWithNewOperandsImpl( @@ -2476,15 +2486,16 @@ std::unique_ptr HloGatherInstruction::CloneWithNewOperandsImpl( CHECK_EQ(new_operands.size(), 2); return absl::make_unique( shape, new_operands[0], new_operands[1], gather_dimension_numbers(), - gather_slice_sizes()); + gather_slice_sizes(), indices_are_sorted()); } HloScatterInstruction::HloScatterInstruction( const Shape& shape, HloInstruction* operand, HloInstruction* scatter_indices, HloInstruction* updates, HloComputation* update_computation, - const ScatterDimensionNumbers& 
scatter_dim_numbers) - : HloInstruction(HloOpcode::kScatter, shape) { + const ScatterDimensionNumbers& scatter_dim_numbers, bool indices_are_sorted) + : HloInstruction(HloOpcode::kScatter, shape), + indices_are_sorted_(indices_are_sorted) { AppendOperand(operand); AppendOperand(scatter_indices); AppendOperand(updates); @@ -2538,12 +2549,18 @@ HloScatterInstruction::MakeScatterDimNumbers( HloInstructionProto HloScatterInstruction::ToProto() const { HloInstructionProto proto = HloInstruction::ToProto(); *proto.mutable_scatter_dimension_numbers() = scatter_dimension_numbers(); + proto.set_indices_are_sorted(indices_are_sorted()); return proto; } std::vector HloScatterInstruction::ExtraAttributesToStringImpl( const HloPrintOptions& options) const { - return {ScatterDimensionNumbersToString(scatter_dimension_numbers())}; + std::vector attrs{ + ScatterDimensionNumbersToString(scatter_dimension_numbers())}; + if (indices_are_sorted()) { + attrs.push_back("indices_are_sorted=true"); + } + return attrs; } bool HloScatterInstruction::IdenticalSlowPath( @@ -2554,7 +2571,8 @@ bool HloScatterInstruction::IdenticalSlowPath( return protobuf_util::ProtobufEquals( scatter_dimension_numbers(), casted_other.scatter_dimension_numbers()) && - eq_computations(to_apply(), casted_other.to_apply()); + eq_computations(to_apply(), casted_other.to_apply()) && + indices_are_sorted() == casted_other.indices_are_sorted(); } std::unique_ptr HloScatterInstruction::CloneWithNewOperandsImpl( @@ -2563,7 +2581,7 @@ std::unique_ptr HloScatterInstruction::CloneWithNewOperandsImpl( CHECK_EQ(new_operands.size(), 3); return absl::make_unique( shape, new_operands[0], new_operands[1], new_operands[2], to_apply(), - scatter_dimension_numbers()); + scatter_dimension_numbers(), indices_are_sorted()); } HloIotaInstruction::HloIotaInstruction(const Shape& shape, int64 iota_dimension) diff --git a/tensorflow/compiler/xla/service/hlo_instructions.h b/tensorflow/compiler/xla/service/hlo_instructions.h old mode 100644 new mode 100755 index 8e6f024e5d2..0de050108b7 --- a/tensorflow/compiler/xla/service/hlo_instructions.h +++ b/tensorflow/compiler/xla/service/hlo_instructions.h @@ -1077,10 +1077,15 @@ class HloConvolutionInstruction : public HloInstruction { // The number of feature groups. Must be a divisor of the input feature // dimension and output feature dimension. int64 feature_group_count() const { return feature_group_count_; } - + void set_feature_group_count(int64 num_feature_groups) { + feature_group_count_ = num_feature_groups; + } // The number of feature groups. Must be a divisor of the input batch // dimension. int64 batch_group_count() const { return batch_group_count_; } + void set_batch_group_count(int64 num_batch_groups) { + batch_group_count_ = num_batch_groups; + } // Returns the information used to tell the implementation information about // what sort of precision is requested. 
The meaning of the field is backend @@ -1401,7 +1406,7 @@ class HloGatherInstruction : public HloInstruction { const Shape& shape, HloInstruction* operand, HloInstruction* start_indices, const GatherDimensionNumbers& gather_dim_numbers, - absl::Span slice_sizes); + absl::Span slice_sizes, bool indices_are_sorted); const GatherDimensionNumbers& gather_dimension_numbers() const { CHECK(gather_dimension_numbers_ != nullptr); return *gather_dimension_numbers_; @@ -1409,6 +1414,10 @@ class HloGatherInstruction : public HloInstruction { absl::Span gather_slice_sizes() const { return gather_slice_sizes_; } + bool indices_are_sorted() const { return indices_are_sorted_; } + void set_indices_are_sorted(bool indices_are_sorted) { + indices_are_sorted_ = indices_are_sorted; + } // Returns a serialized representation of this instruction. HloInstructionProto ToProto() const override; @@ -1434,6 +1443,7 @@ class HloGatherInstruction : public HloInstruction { std::unique_ptr gather_dimension_numbers_; std::vector gather_slice_sizes_; + bool indices_are_sorted_; }; class HloScatterInstruction : public HloInstruction { @@ -1442,11 +1452,16 @@ class HloScatterInstruction : public HloInstruction { const Shape& shape, HloInstruction* operand, HloInstruction* scatter_indices, HloInstruction* updates, HloComputation* update_computation, - const ScatterDimensionNumbers& scatter_dim_numbers); + const ScatterDimensionNumbers& scatter_dim_numbers, + bool indices_are_sorted); const ScatterDimensionNumbers& scatter_dimension_numbers() const { CHECK(scatter_dimension_numbers_ != nullptr); return *scatter_dimension_numbers_; } + bool indices_are_sorted() const { return indices_are_sorted_; } + void set_indices_are_sorted(bool indices_are_sorted) { + indices_are_sorted_ = indices_are_sorted; + } // Returns a serialized representation of this instruction. HloInstructionProto ToProto() const override; @@ -1473,6 +1488,7 @@ class HloScatterInstruction : public HloInstruction { HloCloneContext* context) const override; std::unique_ptr scatter_dimension_numbers_; + bool indices_are_sorted_; }; class HloIotaInstruction : public HloInstruction { diff --git a/tensorflow/compiler/xla/service/hlo_live_range.cc b/tensorflow/compiler/xla/service/hlo_live_range.cc new file mode 100644 index 00000000000..8ec437ec250 --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_live_range.cc @@ -0,0 +1,235 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/hlo_live_range.h" + +#include "absl/strings/str_format.h" + +namespace xla { +/*static*/ +StatusOr> HloLiveRange::Run( + const HloSchedule& schedule, const HloAliasAnalysis& alias_analysis, + const HloComputation* computation, bool module_scoped_analysis) { + std::unique_ptr hlo_live_range( + new HloLiveRange(schedule, alias_analysis, module_scoped_analysis)); + hlo_live_range->schedule_end_time_ = + hlo_live_range->FlattenSchedule(*computation, 0); + hlo_live_range->CalculateBufferStartEndMap(); + hlo_live_range->NormalizeAliasedBuffers(); + return std::move(hlo_live_range); +} + +void HloLiveRange::NormalizeAliasedBuffers() { + for (const HloBuffer& hlo_buffer : alias_analysis_.buffers()) { + std::vector aliased_buffers; + for (const HloValue* hlo_value : hlo_buffer.values()) { + if (buffer_live_ranges_.contains(hlo_value)) { + aliased_buffers.push_back(hlo_value); + } + } + absl::c_sort( + aliased_buffers, [&](const HloValue* value1, const HloValue* value2) { + const TimeBound& live_range1 = buffer_live_ranges_.at(value1); + const TimeBound& live_range2 = buffer_live_ranges_.at(value2); + + return std::forward_as_tuple(live_range1.start, live_range1.end) < + std::forward_as_tuple(live_range2.start, live_range2.end); + }); + + for (int64 i = 0; i + 1 < aliased_buffers.size(); ++i) { + const HloValue* value1 = aliased_buffers[i]; + const HloValue* value2 = aliased_buffers[i + 1]; + TimeBound& live_range1 = buffer_live_ranges_[value1]; + TimeBound& live_range2 = buffer_live_ranges_[value2]; + if (live_range1.start == live_range2.start) { + // If value1 has the same start time as value2, make value1 disappear + // by setting the end time same as start time: + // + // Before: + // +----+ value1 + // +----------+ value2 + // + // After: + // + value1 + // +----------+ value2 + // + // Note that only when heap simulator runs before copy insertion can + // this happen where one instruction defines multiple aliased buffers + // -- This is illegle to execute and can be fixed by copy insertion + // later. + live_range1.end = live_range2.end; + continue; + } + + if (live_range1.end < live_range2.start) { + continue; + } + + if (live_range1.end > live_range2.end) { + live_range2.end = live_range1.end; + } + live_range1.end = live_range2.start - 1; + } + } +} + +// FlattenSchedule walks through the computation and tracks down the ordinal +// number of each instruction in the schedule. +int64 HloLiveRange::FlattenSchedule(const HloComputation& computation, + int64 start_time) { + if (!schedule_.is_computation_scheduled(&computation)) { + total_order_scheduled_ = false; + return start_time; + } + + const HloInstructionSequence& instruction_sequence = + schedule_.sequence(&computation); + int64 time = start_time; + for (HloInstruction* instruction : instruction_sequence.instructions()) { + if (module_scoped_analysis_) { + // Recurse into sub computations if running with module scoped analysis + // mode. 
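Since the interval arithmetic in NormalizeAliasedBuffers is easy to misread, here is a standalone toy sketch of the general overlap-trimming case, using a plain struct in place of HloValue/TimeBound; it deliberately omits the equal-start special case handled above.

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>

// Simplified stand-in for HloLiveRange::TimeBound.
struct Bound {
  int64_t start;
  int64_t end;
};

// Trims two aliased, start-sorted live ranges so they become disjoint while
// their union stays the same, mirroring the general overlap case above.
void TrimOverlap(Bound& r1, Bound& r2) {
  if (r1.end < r2.start) return;      // already disjoint, nothing to do
  r2.end = std::max(r1.end, r2.end);  // r2 absorbs the tail of the union
  r1.end = r2.start - 1;              // r1 now ends just before r2 begins
}

int main() {
  Bound a{0, 10}, b{4, 8};  // a fully covers b before normalization
  TrimOverlap(a, b);
  std::cout << a.start << "-" << a.end << ", "   // 0-3
            << b.start << "-" << b.end << "\n";  // 4-10
}
```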
+ if (instruction->opcode() == HloOpcode::kCall || + instruction->opcode() == HloOpcode::kConditional) { + for (const HloComputation* called_computation : + instruction->called_computations()) { + time = FlattenSchedule(*called_computation, time); + } + } + if (instruction->opcode() == HloOpcode::kWhile) { + time = FlattenSchedule(*instruction->while_condition(), time); + time++; + time = FlattenSchedule(*instruction->while_body(), time); + } + } + if (instruction_schedule_.count(instruction) != 0) { + continue; + } + instruction_schedule_.insert({instruction, time++}); + flattened_instruction_sequence_.push_back(instruction); + } + computation_span_times_.try_emplace(&computation, + TimeBound{start_time, time}); + DCHECK_EQ(instruction_schedule_.size(), + flattened_instruction_sequence_.size()); + DCHECK_LE(instruction_schedule_.size(), time); + return time; +} + +void HloLiveRange::CalculateBufferStartEndMap() { + for (const HloValue* value : alias_analysis_.dataflow_analysis().values()) { + // Ignore buffers that are not defined. + if (instruction_schedule_.count(value->defining_instruction()) == 0) { + continue; + } + + int64 buffer_start_time = instruction_schedule_[value->instruction()]; + + int64 buffer_end_time = -1; + for (const HloUse& use : value->uses()) { + const HloInstruction* used = use.instruction; + // As an optimization, we deem a while's init value's live range ends as + // soon as the loop body starts. This optimization is only applicable in + // module scoped mode. + if (module_scoped_analysis_ && used->opcode() == HloOpcode::kWhile) { + // The current live range is at the end of the while, move it to the + // beginning of the body. + used = used->while_body()->parameter_instruction(0); + VLOG(1) << "Moved value " << value->ToShortString() + << " to while param: " << used->ToString(); + } + if (instruction_schedule_.count(used) == 0) { + // We didn't track the instruction `used`. This happens when we do + // computation scope (versus module scope) heap simulation and when + // the used instruction is outside of the computation being simulated. + continue; + } + buffer_end_time = std::max(buffer_end_time, instruction_schedule_[used]); + } + + // Parameters are defined at the beginning of the computation. This prevents + // any instruction that's scheduled before the parameter clobbers the + // parameter's buffer. + if (value->instruction()->opcode() == HloOpcode::kParameter) { + const HloComputation* computation = value->instruction()->parent(); + auto it = computation_span_times_.find(computation); + if (it != computation_span_times_.end()) { + buffer_start_time = std::min(buffer_start_time, it->second.start); + } + } + + if (buffer_end_time == -1) { + buffer_end_time = buffer_start_time; + } + + for (const HloPosition& position : value->positions()) { + const HloComputation* position_comp = position.instruction->parent(); + // If this instruction lives out, the live range of the instruction + // should be extended to the end of the computation. + if (position.instruction == position_comp->root_instruction()) { + auto it = computation_span_times_.find(position_comp); + if (it == computation_span_times_.end()) { + continue; + } + buffer_end_time = std::max(buffer_end_time, it->second.end); + } + } + + const HloModule* module = value->instruction()->parent()->parent(); + + // Readonly entry parameters (parameters that don't alias) live across whole + // computation. 
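A toy, self-contained illustration of the start/end rule CalculateBufferStartEndMap applies: a value starts at its defining instruction's logical time and ends at its last scheduled use, falling back to its own time when it is never used. The schedule and instruction names below are invented.

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

int main() {
  // Invented flattened schedule: instruction name -> logical time.
  std::unordered_map<std::string, int64_t> schedule = {
      {"paramA", 0}, {"paramX", 1}, {"mul", 2}, {"add", 3}};

  // Live range of "mul": defined at its own slot, dies at its last use.
  int64_t start = schedule.at("mul");
  int64_t end = start;  // unused values end where they start
  std::vector<std::string> uses_of_mul = {"add"};
  for (const std::string& use : uses_of_mul) {
    end = std::max(end, schedule.at(use));
  }
  std::cout << "mul lives [" << start << ", " << end << "]\n";  // [2, 3]
}
```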
+ if (value->instruction()->opcode() == HloOpcode::kParameter && + value->instruction()->parent() == module->entry_computation() && + !module->input_output_alias_config().ParameterHasAlias( + value->instruction()->parameter_number(), value->index())) { + buffer_end_time = schedule_end_time_; + } + + CHECK(buffer_start_time <= buffer_end_time) + << buffer_start_time << ", " << buffer_end_time + << value->instruction()->ToString(); + + auto& live_range = buffer_live_ranges_[value]; + live_range.start = buffer_start_time; + live_range.end = buffer_end_time; + } +} + +std::string HloLiveRange::ToString() const { + std::string output; + absl::StrAppendFormat(&output, "HloLiveRange (max %d):\n", + schedule_end_time_); + absl::StrAppendFormat(&output, " InstructionSequence:\n"); + auto& instructions = flattened_instruction_sequence().instructions(); + for (int64 i = 0; i < instructions.size(); ++i) { + absl::StrAppendFormat(&output, " %d:%s\n", i, instructions[i]->name()); + } + + absl::StrAppendFormat(&output, " BufferLiveRange:\n"); + + for (const HloValue* value : alias_analysis_.dataflow_analysis().values()) { + auto it = buffer_live_ranges_.find(value); + if (it != buffer_live_ranges_.end()) { + absl::StrAppendFormat( + &output, " %s%s:%d-%d\n", value->instruction()->name(), + value->index().ToString(), it->second.start, it->second.end); + } + } + + return output; +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_live_range.h b/tensorflow/compiler/xla/service/hlo_live_range.h new file mode 100644 index 00000000000..cc0445acd1e --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_live_range.h @@ -0,0 +1,206 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +License for the specific language governing permissions and limitations under +the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_LIVE_RANGE_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_LIVE_RANGE_H_ + +#include +#include +#include + +#include "absl/container/flat_hash_set.h" +#include "tensorflow/compiler/xla/service/hlo_alias_analysis.h" +#include "tensorflow/compiler/xla/service/hlo_buffer.h" +#include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_ordering.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/core/lib/core/status.h" + +namespace xla { + +// Class which computes live range of the output buffers of HLOs and their +// interference by flattening all computations. The live range is only available +// when all global computations (while, if, call, etc) have total order +// sequential orders. +class HloLiveRange { + public: + // Constructs a hlo live range object for the given module and computation + // assuming the given HLO instruction ordering. 
+ static StatusOr> Run( + const HloSchedule& schedule, const HloAliasAnalysis& alias_analysis, + const HloComputation* computation, bool module_scoped_analysis = true); + + // LogicalTime represents the time in a virtual clock. Each instruction has + // one monotonically increasing logical time assigned according to the + // schedule. + using LogicalTime = int64; + + struct TimeBound { + LogicalTime start; + LogicalTime end; + + bool friend operator==(const TimeBound& a, const TimeBound& b) { + return a.start == b.start && a.end == b.end; + } + bool friend operator!=(const TimeBound& a, const TimeBound& b) { + return !(a == b); + } + }; + + std::string ToString() const; + + const HloInstructionSequence& flattened_instruction_sequence() const { + return flattened_instruction_sequence_; + } + + // Returns the map from instruction to the end time of that instruction. + const absl::flat_hash_map& + instruction_schedule() const { + return instruction_schedule_; + } + + // Returns the map from a hlo value to the definition time of that hlo value. + const absl::flat_hash_map& buffer_live_ranges() + const { + return buffer_live_ranges_; + } + + absl::flat_hash_map& buffer_live_ranges() { + return buffer_live_ranges_; + } + + // Returns the time stamp of the end of the program. + LogicalTime schedule_end_time() const { return schedule_end_time_; } + + // Returns whether hlo live range is available on this entire module. Hlo live + // range is not available if the module is partially ordered. + bool total_order_scheduled() const { return total_order_scheduled_; } + + private: + explicit HloLiveRange(const HloSchedule& schedule, + const HloAliasAnalysis& alias_analysis, + bool module_scoped_analysis) + : schedule_(schedule), + alias_analysis_(alias_analysis), + module_scoped_analysis_(module_scoped_analysis) {} + + // FlattenSchedule walks through the instructions in `computation`, and + // recurse into each called computations in module_scoped_analysis mode. As it + // walks it also tracks down the ordinal number of each instruction in the + // schedule and store it in the `instruction_schedule` and + // 'flattened_instruction_sequence`. The end of each computation is tracked in + // `computation_end_time`. + int64 FlattenSchedule(const HloComputation& computation, int64 start_time); + + // Based on the flattened schedule, calculate the start and end of each + // buffer. + void CalculateBufferStartEndMap(); + + // The aliased buffers could have overlapping live ranges. + // NormalizeAliasedBuffers normalizes the buffer such that each alias buffer + // has disjoint live range while keeping the live range union the same. This + // avoid double counting aliased buffer sizes. 
+ // + // Before(buffer1 and 2 are aliased): + // + // +----+ live range of buffer1 + // +------------------+ live range of buffer2 + // + // After: + // + // +----------+ live range of buffer1 + // +------+ live range of buffer2 + // + // Before(buffer1 and 2 are aliased): + // + // +----------+ live range of buffer1 + // +------------+ live range of buffer2 + // + // After: + // + // +----------+ live range of buffer1 + // +------+ live range of buffer2 + // + // Before(buffer1 and 2 are aliased): + // + // +----------+ live range of buffer1 + // +---+ live range of buffer2 + // + // After(unchanged): + // + // +----------+ live range of buffer1 + // +---+ live range of buffer2 + // + // As another example, imagine we have the following code sequence with live + // ranges of each while-aliased buffers: + // + // a p1 p2 e b + // a = ... + + // | + // { | + // p1 = param | + + // ROOT true | | + // } | + + // { // body | + // p2 = param + + + // c = p2 + 1 + + // d = c + 1 + // ROOT e = d + 1 + + // } | + // | + // b = while (a) + + + // | + // f = b + 1 + + // + // After normalization it becomes: + // + // a p1 p2 e b + // a = ... + + // | + // { + + // p1 = param + + // ROOT true | + // } + + // { // body + // p2 = param + + // c = p2 + 1 + + // d = c + 1 + // ROOT e = d + 1 + + // } | + // | + // b = while (a) + + // + + // f = b + 1 + + // + // Note there is no overlap of live ranges after normalization. + void NormalizeAliasedBuffers(); + + const HloSchedule& schedule_; + const HloAliasAnalysis& alias_analysis_; + bool module_scoped_analysis_; + bool total_order_scheduled_ = true; + + HloInstructionSequence flattened_instruction_sequence_; + absl::flat_hash_map instruction_schedule_; + absl::flat_hash_map computation_span_times_; + absl::flat_hash_map buffer_live_ranges_; + LogicalTime schedule_end_time_; +}; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_LIVE_RANGE_H_ diff --git a/tensorflow/compiler/xla/service/hlo_live_range_test.cc b/tensorflow/compiler/xla/service/hlo_live_range_test.cc new file mode 100644 index 00000000000..d524d9f0c82 --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_live_range_test.cc @@ -0,0 +1,239 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/compiler/xla/service/hlo_live_range.h" + +#include +#include +#include + +#include "absl/memory/memory.h" +#include "tensorflow/compiler/xla/literal.h" +#include "tensorflow/compiler/xla/service/hlo_alias_analysis.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_ordering.h" +#include "tensorflow/compiler/xla/service/hlo_value.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/core/lib/core/status_test_util.h" + +namespace xla { +namespace { + +using TimeBound = HloLiveRange::TimeBound; +class HloLiveRangeTest : public HloTestBase { + protected: + HloLiveRangeTest() : module_(CreateNewVerifiedModule()) {} + ~HloLiveRangeTest() override {} + + void Analyze(const HloSchedule& schedule) { + alias_analysis_ = HloAliasAnalysis::Run(module_.get()).ValueOrDie(); + hlo_live_range_ = HloLiveRange::Run(schedule, *alias_analysis_, + module_->entry_computation()) + .ValueOrDie(); + } + + std::unique_ptr module_; + std::unique_ptr hlo_live_range_; + std::unique_ptr alias_analysis_; + // Shapes for use in the examples. + Shape f32scalar_ = ShapeUtil::MakeShape(xla::F32, {}); + Shape f32vec4_ = ShapeUtil::MakeShape(F32, {4}); + + // Returns the buffer defined at the given instruction and index. + const HloValue* BufferAt(const HloInstruction* instruction, + const ShapeIndex& index) const { + return &alias_analysis_->dataflow_analysis().GetUniqueValueAt(instruction, + index); + } + + HloLiveRange::TimeBound LiveRangeAt(const HloInstruction* instruction, + const ShapeIndex& index = {}) const { + auto* value = BufferAt(instruction, index); + return hlo_live_range_->buffer_live_ranges().at(value); + } +}; + +TEST_F(HloLiveRangeTest, Multiply) { + auto builder = HloComputation::Builder(TestName()); + auto paramA = builder.AddInstruction( + HloInstruction::CreateParameter(0, f32vec4_, "paramA")); + auto paramX = builder.AddInstruction( + HloInstruction::CreateParameter(1, f32vec4_, "paramX")); + auto mul = builder.AddInstruction(HloInstruction::CreateBinary( + f32vec4_, HloOpcode::kMultiply, paramA, paramX)); + module_->AddEntryComputation(builder.Build()); + + HloSchedule schedule(module_.get()); + + schedule.set_sequence(module_->entry_computation(), {paramA, paramX, mul}); + + Analyze(schedule); + + // Parameters live from beginning to end. + EXPECT_EQ(LiveRangeAt(paramA), TimeBound({0, 3})); + EXPECT_EQ(LiveRangeAt(paramX), TimeBound({0, 3})); + // Mul lives after parameters are defined to the end. 
+ EXPECT_EQ(LiveRangeAt(mul), TimeBound({2, 3})); +} + +TEST_F(HloLiveRangeTest, MultiplyAdd) { + auto builder = HloComputation::Builder(TestName()); + auto paramA = builder.AddInstruction( + HloInstruction::CreateParameter(0, f32vec4_, "paramA")); + auto paramX = builder.AddInstruction( + HloInstruction::CreateParameter(1, f32vec4_, "paramX")); + auto mul = builder.AddInstruction(HloInstruction::CreateBinary( + f32vec4_, HloOpcode::kMultiply, paramA, paramX)); + auto paramY = builder.AddInstruction( + HloInstruction::CreateParameter(2, f32vec4_, "paramY")); + auto add = builder.AddInstruction( + HloInstruction::CreateBinary(f32vec4_, HloOpcode::kAdd, mul, paramY)); + module_->AddEntryComputation(builder.Build()); + + HloSchedule schedule(module_.get()); + + schedule.set_sequence(module_->entry_computation(), + {paramA, paramX, mul, paramY, add}); + + Analyze(schedule); + + // Parameters live from beginning to end. + EXPECT_EQ(LiveRangeAt(paramA), TimeBound({0, 5})); + EXPECT_EQ(LiveRangeAt(paramX), TimeBound({0, 5})); + EXPECT_EQ(LiveRangeAt(paramY), TimeBound({0, 5})); + // Mul starts after parameter are defined (Note: all parameters are defined at + // 0, mul starts at 2 which is an arbitrary number). + EXPECT_EQ(LiveRangeAt(mul), TimeBound({2, 4})); + // Add lives after mul is defined to the end of the program. + EXPECT_EQ(LiveRangeAt(add), TimeBound({4, 5})); +} + +TEST_F(HloLiveRangeTest, LiveOutBuffers) { + // If a buffer is live out, its life range is extened to the end of + // computation. + auto builder = HloComputation::Builder(TestName()); + auto paramA = builder.AddInstruction( + HloInstruction::CreateParameter(0, f32vec4_, "paramA")); + auto paramX = builder.AddInstruction( + HloInstruction::CreateParameter(1, f32vec4_, "paramX")); + auto mul = builder.AddInstruction(HloInstruction::CreateBinary( + f32vec4_, HloOpcode::kMultiply, paramA, paramX)); + auto paramY = builder.AddInstruction( + HloInstruction::CreateParameter(2, f32vec4_, "paramY")); + auto add = builder.AddInstruction( + HloInstruction::CreateBinary(f32vec4_, HloOpcode::kAdd, mul, paramY)); + auto tuple = builder.AddInstruction(HloInstruction::CreateTuple({mul, add})); + module_->AddEntryComputation(builder.Build()); + + HloSchedule schedule(module_.get()); + + schedule.set_sequence(module_->entry_computation(), + {paramA, paramX, mul, paramY, add, tuple}); + + Analyze(schedule); + + // Parameters live from beginning to end. + EXPECT_EQ(LiveRangeAt(paramA), TimeBound({0, 6})); + EXPECT_EQ(LiveRangeAt(paramX), TimeBound({0, 6})); + EXPECT_EQ(LiveRangeAt(paramY), TimeBound({0, 6})); + // Mul starts after parameter are defined (Note: all parameters are defined at + // 0, mul starts at 2 which is an arbitrary number). + EXPECT_EQ(LiveRangeAt(mul), TimeBound({2, 6})); + // Add lives after mul is defined to the end of the program. + EXPECT_EQ(LiveRangeAt(add), TimeBound({4, 6})); +} + +TEST_F(HloLiveRangeTest, InstructionScheduledAfterRoot) { + // If a buffer is live out, its life range is extened to the end of + // computation. 
+ auto builder = HloComputation::Builder(TestName()); + auto paramA = builder.AddInstruction( + HloInstruction::CreateParameter(0, f32vec4_, "paramA")); + auto paramX = builder.AddInstruction( + HloInstruction::CreateParameter(1, f32vec4_, "paramX")); + auto mul = builder.AddInstruction(HloInstruction::CreateBinary( + f32vec4_, HloOpcode::kMultiply, paramA, paramX)); + auto paramY = builder.AddInstruction( + HloInstruction::CreateParameter(2, f32vec4_, "paramY")); + auto add = builder.AddInstruction( + HloInstruction::CreateBinary(f32vec4_, HloOpcode::kAdd, mul, paramY)); + auto add2 = builder.AddInstruction( + HloInstruction::CreateBinary(f32vec4_, HloOpcode::kAdd, mul, paramY)); + auto tuple = builder.AddInstruction(HloInstruction::CreateTuple({mul, add})); + module_->AddEntryComputation(builder.Build()); + + HloSchedule schedule(module_.get()); + + // Schedule another instruction after root. + schedule.set_sequence(module_->entry_computation(), + {paramA, paramX, mul, paramY, add, tuple, add2}); + + Analyze(schedule); + + // Parameters live from beginning to end. + EXPECT_EQ(LiveRangeAt(paramA), TimeBound({0, 7})); + EXPECT_EQ(LiveRangeAt(paramX), TimeBound({0, 7})); + EXPECT_EQ(LiveRangeAt(paramY), TimeBound({0, 7})); + // Live out buffers live through the computation. + + EXPECT_EQ(LiveRangeAt(mul), TimeBound({2, 7})); + EXPECT_EQ(LiveRangeAt(add), TimeBound({4, 7})); + EXPECT_EQ(LiveRangeAt(tuple), TimeBound({5, 7})); + EXPECT_EQ(LiveRangeAt(add2), TimeBound({6, 6})); +} + +TEST_F(HloLiveRangeTest, AliasedParameter) { + // If a parameter is non-readonly(non-aliased), its live range can end in the + // middle of the program. + auto builder = HloComputation::Builder(TestName()); + auto paramA = builder.AddInstruction( + HloInstruction::CreateParameter(0, f32vec4_, "paramA")); + auto paramX = builder.AddInstruction( + HloInstruction::CreateParameter(1, f32vec4_, "paramX")); + auto mul = builder.AddInstruction(HloInstruction::CreateBinary( + f32vec4_, HloOpcode::kMultiply, paramA, paramX)); + auto paramY = builder.AddInstruction( + HloInstruction::CreateParameter(2, f32vec4_, "paramY")); + auto add = builder.AddInstruction( + HloInstruction::CreateBinary(f32vec4_, HloOpcode::kAdd, mul, paramY)); + module_->AddEntryComputation(builder.Build()); + // Set up alias of the first parameter. + TF_ASSERT_OK(module_->input_output_alias_config().SetUpAlias( + {}, 0, {}, HloInputOutputAliasConfig::kUserAlias)); + + HloSchedule schedule(module_.get()); + + schedule.set_sequence(module_->entry_computation(), + {paramA, paramX, mul, paramY, add}); + + Analyze(schedule); + + // Non-readonly parameter live like other normal buffers. + EXPECT_EQ(LiveRangeAt(paramA), TimeBound({0, 2})); + + // Readonly parameters live from beginning to end. + EXPECT_EQ(LiveRangeAt(paramX), TimeBound({0, 5})); + EXPECT_EQ(LiveRangeAt(paramY), TimeBound({0, 5})); + // Mul starts after parameter are defined (Note: all parameters are defined at + // 0, mul starts at 2 which is an arbitrary number). + EXPECT_EQ(LiveRangeAt(mul), TimeBound({2, 4})); + // Add lives after mul is defined to the end of the program. 
+ EXPECT_EQ(LiveRangeAt(add), TimeBound({4, 5})); +} + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_matchers.h b/tensorflow/compiler/xla/service/hlo_matchers.h index a75fc0bbc3f..789ec5d21a9 100644 --- a/tensorflow/compiler/xla/service/hlo_matchers.h +++ b/tensorflow/compiler/xla/service/hlo_matchers.h @@ -215,6 +215,8 @@ HLO_MATCHER(Constant); HLO_MATCHER(Convert); HLO_MATCHER(Convolution); HLO_MATCHER(Copy); +HLO_MATCHER(CopyDone); +HLO_MATCHER(CopyStart); HLO_MATCHER(AllReduce); HLO_MATCHER(CollectivePermute); HLO_MATCHER(Divide); diff --git a/tensorflow/compiler/xla/service/hlo_module.cc b/tensorflow/compiler/xla/service/hlo_module.cc index fbef51c4ce6..ac74d5b0f65 100644 --- a/tensorflow/compiler/xla/service/hlo_module.cc +++ b/tensorflow/compiler/xla/service/hlo_module.cc @@ -33,6 +33,7 @@ limitations under the License. #include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/gtl/map_util.h" +#include "tensorflow/core/lib/hash/hash.h" #include "tensorflow/core/platform/types.h" namespace xla { @@ -215,7 +216,7 @@ void HloModule::ReplaceComputations( string HloModule::ToString(const HloPrintOptions& options) const { std::ostringstream s; - s << "HloModule " << name(); + s << "HloModule " << PrintName(name(), options.print_ids()); if (has_schedule()) { TF_CHECK_OK(schedule().Verify()); s << ", is_scheduled=true"; @@ -661,6 +662,12 @@ HloComputation* HloModule::GetComputationWithName(absl::string_view name) { return it == computations_in_module.end() ? nullptr : *it; } +uint64 HloModule::Hash() const { + return tensorflow::Hash64Combine( + entry_computation_layout().Hash(), + entry_computation()->root_instruction()->Hash()); +} + /* static */ std::atomic HloModule::next_unique_module_id_(0); } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_module.h b/tensorflow/compiler/xla/service/hlo_module.h index 950c7a72f45..b6a72db434a 100644 --- a/tensorflow/compiler/xla/service/hlo_module.h +++ b/tensorflow/compiler/xla/service/hlo_module.h @@ -146,9 +146,7 @@ class HloModule { // information on opcode, shape, operands, and typically a root instruction. // This function returns the same hash value for equivalent HLO modules, // with respect to HloInstruction::Identical() method. - uint64 Hash() const { - return entry_computation()->root_instruction()->Hash(); - } + uint64 Hash() const; // Gets the computations in this module. // @@ -300,6 +298,38 @@ class HloModule { return &fusion_config_; } + // Checks if this config has a list of entry parameters' HLO shardings for + // SPMD. + bool has_spmd_parameters_shardings() const { + return spmd_parameters_shardings_.has_value(); + } + + // Getter and setter for the list of entry parameters' HLO shardings for SPMD. + const std::vector& spmd_parameters_shardings() const { + CHECK(spmd_parameters_shardings_.has_value()); + return *spmd_parameters_shardings_; + } + void set_spmd_parameters_shardings( + const std::vector& shardings) { + spmd_parameters_shardings_ = shardings; + } + + // Checks if this config has the entry computation output's HLO sharding for + // SPMD. + bool has_spmd_output_sharding() const { + return spmd_output_sharding_.has_value(); + } + + // Getter and setter for the entry computation output's HLO shardings for + // SPMD. 
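A hedged usage sketch for the new optional SPMD sharding fields (assuming, as this hunk suggests, that the accessors live on HloModule): callers are expected to test the has_*() predicates before calling the CHECK-ing getters; the getter for the output sharding appears just below. The LogSpmdShardings helper is illustrative only.

```cpp
#include "tensorflow/compiler/xla/service/hlo_module.h"
#include "tensorflow/core/platform/logging.h"

// Illustrative only: read the optional SPMD shardings, guarding each access
// with its has_*() predicate since the getters CHECK on absence.
void LogSpmdShardings(const xla::HloModule& module) {
  if (module.has_spmd_parameters_shardings()) {
    for (const xla::HloSharding& sharding :
         module.spmd_parameters_shardings()) {
      LOG(INFO) << "SPMD parameter sharding: " << sharding.ToString();
    }
  }
  if (module.has_spmd_output_sharding()) {
    LOG(INFO) << "SPMD output sharding: "
              << module.spmd_output_sharding().ToString();
  }
}
```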
+ const HloSharding& spmd_output_sharding() const { + CHECK(spmd_output_sharding_.has_value()); + return *spmd_output_sharding_; + } + void set_spmd_output_sharding(const HloSharding& sharding) { + spmd_output_sharding_ = sharding; + } + private: HloComputation* AddComputationInternal( std::unique_ptr computation, bool is_entry, @@ -342,6 +372,14 @@ class HloModule { // Fusion configuration. std::vector> fusion_config_; + + // The HLO shardings of the entry computation's parameters for + // SPMD-partitioned programs. + absl::optional> spmd_parameters_shardings_; + + // The HLO sharding of the entry computation's output (root) for + // SPMD-partitioned programs. + absl::optional spmd_output_sharding_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_module_config.h b/tensorflow/compiler/xla/service/hlo_module_config.h index d8ded5f7641..de4df445ac5 100644 --- a/tensorflow/compiler/xla/service/hlo_module_config.h +++ b/tensorflow/compiler/xla/service/hlo_module_config.h @@ -34,6 +34,26 @@ namespace xla { // executable. class HloModuleConfig { public: + // Represents a pair of input and output of the entry computation that can be + // considered as the original and updated values of a variable maintained by + // the caller, and that can be transparently sharded by XLA as an internal + // optimization. If sharded, XLA will create separate sharding/unsharding + // programs, and the caller is responsible to call the XLA-generated + // sharding/unsharding programs before and after the sharded main program. + // + // The sharding/unsharding programs will include all the input/output pairs in + // shardable_value_update_pairs() as a flat tuple in their inputs/outputs, + // sorted by (input_parameter_number, parameter_shape_index). + // + // A typical usage pattern is to shard the variables first, then repeatedly + // invoke the main program, and finally invoke the unsharding program before + // they are used in full-shape. + struct ShardableValueUpdatePair { + int64 input_parameter_number; + ShapeIndex parameter_shape_index; + ShapeIndex output_shape_index; + }; + // A configuration can be created either with, or without an entry // ComputationLayout. The default ctor creates it without -- in this case // accessing entry_computation_layout will CHECK-fail. The ctor accepting a @@ -118,6 +138,15 @@ class HloModuleConfig { static_device_assignment_ = device_assignment; } + const std::vector shardable_value_update_pairs() + const { + return shardable_value_update_pairs_; + } + void set_shardable_value_update_pairs( + std::vector pairs) { + shardable_value_update_pairs_ = std::move(pairs); + } + private: // If you add new members, be sure to update compilation_cache_key. @@ -137,6 +166,8 @@ class HloModuleConfig { // Compile-time known device assignment. absl::optional static_device_assignment_; + + std::vector shardable_value_update_pairs_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_parser.cc b/tensorflow/compiler/xla/service/hlo_parser.cc index 2589de633d0..c96bfb15187 100644 --- a/tensorflow/compiler/xla/service/hlo_parser.cc +++ b/tensorflow/compiler/xla/service/hlo_parser.cc @@ -88,6 +88,7 @@ class HloParser { // Stand alone parsing utils for various aggregate data types. 
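To make the intended use of ShardableValueUpdatePair concrete, here is a hedged sketch of registering one pair on a config. The parameter number and output tuple index are invented for illustration and not taken from this change.

```cpp
#include "tensorflow/compiler/xla/service/hlo_module_config.h"

namespace xla {

// Illustrative only: declare that entry parameter 1 (the whole parameter,
// empty shape index) is a caller-maintained variable whose updated value is
// produced at output tuple index {0}, so XLA may keep it sharded between
// invocations of the main program.
void MarkShardableVariable(HloModuleConfig* config) {
  HloModuleConfig::ShardableValueUpdatePair pair;
  pair.input_parameter_number = 1;  // invented parameter number
  pair.parameter_shape_index = {};  // whole parameter (already the default)
  pair.output_shape_index = {0};    // invented output tuple index
  config->set_shardable_value_update_pairs({pair});
}

}  // namespace xla
```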
StatusOr ParseShapeOnly(); StatusOr ParseShardingOnly(); + StatusOr ParseFrontendAttributesOnly(); StatusOr> ParseParameterReplicationOnly(); StatusOr ParseWindowOnly(); StatusOr ParseConvolutionDimensionNumbersOnly(); @@ -192,6 +193,7 @@ class HloParser { kWindow, kConvolutionDimensionNumbers, kSharding, + kFrontendAttributes, kParameterReplication, kInstructionList, kSliceRanges, @@ -271,6 +273,7 @@ class HloParser { bool ParsePaddingConfig(PaddingConfig* padding); bool ParseMetadata(OpMetadata* metadata); bool ParseSharding(OpSharding* sharding); + bool ParseFrontendAttributes(FrontendAttributes* frontend_attributes); bool ParseSingleSharding(OpSharding* sharding, bool lbrace_pre_lexed); bool ParseParameterReplication(ParameterReplication* parameter_replication); bool ParseReplicaGroupsOnly(std::vector* replica_groups); @@ -677,7 +680,10 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder, // Add optional attributes. std::unordered_map attrs; optional sharding; + optional frontend_attributes; attrs["sharding"] = {/*required=*/false, AttrTy::kSharding, &sharding}; + attrs["frontend_attributes"] = { + /*required=*/false, AttrTy::kFrontendAttributes, &frontend_attributes}; optional parameter_replication; attrs["parameter_replication"] = {/*required=*/false, AttrTy::kParameterReplication, @@ -1678,6 +1684,9 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder, optional> slice_sizes; attrs["slice_sizes"] = {/*required=*/true, AttrTy::kBracedInt64List, &slice_sizes}; + optional indices_are_sorted = false; + attrs["indices_are_sorted"] = {/*required=*/false, AttrTy::kBool, + &indices_are_sorted}; if (!ParseOperands(&operands, /*expected_size=*/2) || !ParseAttributes(attrs)) { @@ -1693,7 +1702,7 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder, instruction = builder->AddInstruction(HloInstruction::CreateGather( shape, /*operand=*/operands[0], /*start_indices=*/operands[1], - dim_numbers, *slice_sizes)); + dim_numbers, *slice_sizes, indices_are_sorted.value())); break; } case HloOpcode::kScatter: { @@ -1714,6 +1723,9 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder, optional update_computation; attrs["to_apply"] = {/*required=*/true, AttrTy::kHloComputation, &update_computation}; + optional indices_are_sorted = false; + attrs["indices_are_sorted"] = {/*required=*/false, AttrTy::kBool, + &indices_are_sorted}; if (!ParseOperands(&operands, /*expected_size=*/3) || !ParseAttributes(attrs)) { @@ -1729,7 +1741,8 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder, instruction = builder->AddInstruction(HloInstruction::CreateScatter( shape, /*operand=*/operands[0], /*scatter_indices=*/operands[1], - /*updates=*/operands[2], *update_computation, dim_numbers)); + /*updates=*/operands[2], *update_computation, dim_numbers, + indices_are_sorted.value())); break; } case HloOpcode::kDomain: { @@ -1838,6 +1851,36 @@ bool HloParser::ParseSharding(OpSharding* sharding) { return ParseToken(TokKind::kRbrace, "expected '}' to end sharding attribute"); } +// frontend_attributes ::= '{' attributes '}' +// attributes +// ::= /*empty*/ +// ::= attribute '=' value (',' attribute '=' value)* +bool HloParser::ParseFrontendAttributes( + FrontendAttributes* frontend_attributes) { + CHECK(frontend_attributes != nullptr); + if (!ParseToken(TokKind::kLbrace, + "expected '{' to start frontend attributes")) { + return false; + } + if (lexer_.GetKind() == TokKind::kRbrace) { + // empty + } else { + do { + string attribute; + if 
(!ParseAttributeName(&attribute)) { + return false; + } + if (lexer_.GetKind() != TokKind::kIdent) { + return false; + } + (*frontend_attributes->mutable_map())[attribute] = lexer_.GetStrVal(); + lexer_.Lex(); + } while (EatIfPresent(TokKind::kComma)); + } + return ParseToken(TokKind::kRbrace, + "expects '}' at the end of frontend attributes"); +} + // ::= '{' 'replicated'? 'maximal'? ('device=' int)? shape? // ('devices=' ('[' dims ']')* device_list)? '}' // dims ::= int_list device_list ::= int_list @@ -2857,6 +2900,15 @@ bool HloParser::ParseAttributeHelper( static_cast*>(attr_out_ptr)->emplace(sharding); return true; } + case AttrTy::kFrontendAttributes: { + FrontendAttributes frontend_attributes; + if (!ParseFrontendAttributes(&frontend_attributes)) { + return false; + } + static_cast*>(attr_out_ptr) + ->emplace(frontend_attributes); + return true; + } case AttrTy::kParameterReplication: { ParameterReplication parameter_replication; if (!ParseParameterReplication(¶meter_replication)) { @@ -4113,6 +4165,19 @@ StatusOr HloParser::ParseShardingOnly() { return HloSharding::FromProto(op_sharding); } +StatusOr HloParser::ParseFrontendAttributesOnly() { + lexer_.Lex(); + FrontendAttributes attributes; + if (!ParseFrontendAttributes(&attributes)) { + return InvalidArgument("Syntax error:\n%s", GetError()); + } + if (lexer_.GetKind() != TokKind::kEof) { + return InvalidArgument( + "Syntax error:\nExtra content after frontend attributes"); + } + return attributes; +} + StatusOr> HloParser::ParseParameterReplicationOnly() { lexer_.Lex(); ParameterReplication parameter_replication; @@ -4261,6 +4326,11 @@ StatusOr ParseSharding(absl::string_view str) { return parser.ParseShardingOnly(); } +StatusOr ParseFrontendAttributes(absl::string_view str) { + HloParser parser(str); + return parser.ParseFrontendAttributesOnly(); +} + StatusOr> ParseParameterReplication(absl::string_view str) { HloParser parser(str); return parser.ParseParameterReplicationOnly(); diff --git a/tensorflow/compiler/xla/service/hlo_parser.h b/tensorflow/compiler/xla/service/hlo_parser.h index e4214c1e6b5..91ce79ec982 100644 --- a/tensorflow/compiler/xla/service/hlo_parser.h +++ b/tensorflow/compiler/xla/service/hlo_parser.h @@ -54,6 +54,12 @@ Status ParseHloString(absl::string_view str, HloModule* module); // "{replicated}". StatusOr ParseSharding(absl::string_view str); +// Parses frontend attributes from str. str is supposed to contain the body of +// the frontend attributes , i.e. just the rhs of the +// "frontend_attributes={...}" attribute string, e.g., +// "{attr_a=a,attr_b=b}". +StatusOr ParseFrontendAttributes(absl::string_view str); + // Parses parameter replication from str. str is supposed to contain the body of // the parameter replication, i.e. just the rhs of the // "parameter_replication={...}" attribute string, e.g., "{true, false}". 
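As a usage note for the new public entry point, the round trip below mirrors the ParseFrontendAttributes test added in hlo_parser_test.cc further down; the wrapper function itself is illustrative. Keys are emitted in sorted order by FrontendAttributesToString, so an already-sorted input string survives the round trip unchanged.

```cpp
#include "tensorflow/compiler/xla/service/hlo_instruction.h"  // FrontendAttributesToString
#include "tensorflow/compiler/xla/service/hlo_parser.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/core/platform/logging.h"

namespace xla {

// Illustrative round trip: parse the textual frontend_attributes body and
// stringify it back.
void RoundTripFrontendAttributes() {
  StatusOr<FrontendAttributes> parsed =
      ParseFrontendAttributes("{attr_a=test_a,attr_b=b}");
  CHECK(parsed.ok());
  CHECK_EQ(FrontendAttributesToString(parsed.ValueOrDie()),
           "{attr_a=test_a,attr_b=b}");
}

}  // namespace xla
```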
diff --git a/tensorflow/compiler/xla/service/hlo_parser_test.cc b/tensorflow/compiler/xla/service/hlo_parser_test.cc index b9a017ada43..c913784cd13 100644 --- a/tensorflow/compiler/xla/service/hlo_parser_test.cc +++ b/tensorflow/compiler/xla/service/hlo_parser_test.cc @@ -875,7 +875,7 @@ ENTRY %sparse_f32_r1 () -> f32[9] { )" }, { -"gather", +"Gather", R"(HloModule StringifyGather ENTRY %Gather (input_tensor: f32[50,49,48,47,46], start_indices: s64[10,9,8,7,5]) -> f32[10,9,8,7,30,29,28,27,26] { @@ -887,7 +887,19 @@ ENTRY %Gather (input_tensor: f32[50,49,48,47,46], start_indices: s64[10,9,8,7,5] )" }, { -"scatter", +"SortedGather", +R"(HloModule StringifyGather + +ENTRY %Gather (input_tensor: f32[50,49,48,47,46], start_indices: s64[10,9,8,7,5]) -> f32[10,9,8,7,30,29,28,27,26] { + %input_tensor = f32[50,49,48,47,46]{4,3,2,1,0} parameter(0) + %start_indices = s64[10,9,8,7,5]{4,3,2,1,0} parameter(1) + ROOT %gather = f32[10,9,8,7,30,29,28,27,26]{8,7,6,5,4,3,2,1,0} gather(f32[50,49,48,47,46]{4,3,2,1,0} %input_tensor, s64[10,9,8,7,5]{4,3,2,1,0} %start_indices), offset_dims={4,5,6,7,8}, collapsed_slice_dims={}, start_index_map={0,1,2,3,4}, index_vector_dim=4, slice_sizes={30,29,28,27,26}, indices_are_sorted=true +} + +)" +}, +{ +"Scatter", R"(HloModule StringifyScatter %add_F32.v3 (lhs: f32[], rhs: f32[]) -> f32[] { @@ -903,6 +915,25 @@ ENTRY %Scatter (input_tensor: f32[50,49,48,47,46], scatter_indices: s64[10,9,8,7 ROOT %scatter = f32[50,49,48,47,46]{4,3,2,1,0} scatter(f32[50,49,48,47,46]{4,3,2,1,0} %input_tensor, s64[10,9,8,7,5]{4,3,2,1,0} %scatter_indices, f32[10,9,8,7,30,29,28,27,26]{8,7,6,5,4,3,2,1,0} %updates), update_window_dims={4,5,6,7,8}, inserted_window_dims={}, scatter_dims_to_operand_dims={0,1,2,3,4}, index_vector_dim=4, to_apply=%add_F32.v3 } +)" +}, +{ +"SortedScatter", +R"(HloModule StringifySortedScatter + +%add_F32.v3 (lhs: f32[], rhs: f32[]) -> f32[] { + %lhs = f32[] parameter(0) + %rhs = f32[] parameter(1) + ROOT %add = f32[] add(f32[] %lhs, f32[] %rhs) +} + +ENTRY %Scatter (input_tensor: f32[50,49,48,47,46], scatter_indices: s64[10,9,8,7,5], updates: f32[10,9,8,7,30,29,28,27,26]) -> f32[50,49,48,47,46] { + %input_tensor = f32[50,49,48,47,46]{4,3,2,1,0} parameter(0) + %scatter_indices = s64[10,9,8,7,5]{4,3,2,1,0} parameter(1) + %updates = f32[10,9,8,7,30,29,28,27,26]{8,7,6,5,4,3,2,1,0} parameter(2) + ROOT %scatter = f32[50,49,48,47,46]{4,3,2,1,0} scatter(f32[50,49,48,47,46]{4,3,2,1,0} %input_tensor, s64[10,9,8,7,5]{4,3,2,1,0} %scatter_indices, f32[10,9,8,7,30,29,28,27,26]{8,7,6,5,4,3,2,1,0} %updates), update_window_dims={4,5,6,7,8}, inserted_window_dims={}, scatter_dims_to_operand_dims={0,1,2,3,4}, index_vector_dim=4, indices_are_sorted=true, to_apply=%add_F32.v3 +} + )" }, { @@ -2327,6 +2358,13 @@ TEST_F(HloParserTest, ParseSharding) { EXPECT_EQ(sharding.ToString(), original); } +TEST_F(HloParserTest, ParseFrontendAttributes) { + const string original = "{attr_a=test_a,attr_b=b}"; + TF_ASSERT_OK_AND_ASSIGN(FrontendAttributes frontend_attributes, + ParseFrontendAttributes(original)); + EXPECT_EQ(FrontendAttributesToString(frontend_attributes), original); +} + TEST_F(HloParserTest, ParseWindow) { Window original = window_util::MakeWindow({1, 2, 3}); TF_ASSERT_OK_AND_ASSIGN(Window parsed, diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.cc b/tensorflow/compiler/xla/service/hlo_rematerialization.cc index 603371d830f..445a3ea97d2 100644 --- a/tensorflow/compiler/xla/service/hlo_rematerialization.cc +++ b/tensorflow/compiler/xla/service/hlo_rematerialization.cc @@ 
-100,6 +100,17 @@ bool CanBeRematerialized( using BufferId = int64; using BufferIdList = absl::InlinedVector; +struct RematStrategy { + enum { + // Recompute the node at a later program point. + kRecompute, + // Change the layout into a compact form and uncompress it back at a later + // program point. + kCompress, + } kind; + Shape compact_shape; +}; + // We wrap HloInstruction* with an Item that holds auxiliary // per-instruction state. struct Item { @@ -117,6 +128,10 @@ struct Item { // The buffers defined by this instruction. BufferIdList buffers_defined; + // Output buffers of this instruction. This is used to track outputs by GTE + // instructions (where the instruction doesn't define a buffer). + BufferIdList buffers_output; + // The buffers used by this instruction. BufferIdList buffers_used; @@ -251,6 +266,32 @@ class InstructionList { return InsertBefore(to_insert, min_position_item); } + void InsertAfterInstructions(Item* to_insert, + absl::Span after_instructions) { + VLOG(3) << "InsertAfterInstructions: " << to_insert->instruction->name() + << " after {" + << absl::StrJoin(after_instructions, ", ", + [](string* out, Item* item) { + absl::StrAppend(out, item->instruction->name()); + }) + << "}"; + + // Find the max position number of any instruction in + // 'after_instructions'. + CHECK(!after_instructions.empty()); + Item* max_position_item = nullptr; + for (Item* item : after_instructions) { + if (max_position_item == nullptr || + item->position > max_position_item->position) { + max_position_item = item; + } + } + // No rematerializable instruction should be inserted at the end of the + // computation. + CHECK(max_position_item->next != nullptr); + InsertBeforeInstructions(to_insert, {max_position_item->next}); + } + void Blacklist(const HloInstruction* inst) { GetItem(inst)->blacklisted = true; } @@ -327,6 +368,7 @@ class MemoryUsageTracker { MemoryUsageTracker( const HloComputation* computation, const HloRematerialization::ShapeSizeFunction& size_function, + const HloRematerialization::CompactShapeFunction& compact_shape_function, const TuplePointsToAnalysis& points_to_analysis, const InstructionList& instruction_list); @@ -338,6 +380,22 @@ class MemoryUsageTracker { // EndInstruction memory for dead operand(s) is freed. Status BeginInstruction(Item* item); + int64 RematerializationCost(const HloInstruction* instruction, + int64 memory_reduced, int64 memory_limit_bytes) { + // If none of the users of 'instruction' have been placed in the sequence + // (as tracked by memory_tracker), then rematerialization of 'instruction' + // is a zero-cost move of 'instruction' in the sequence. + if (!absl::c_any_of( + instruction->users(), + [this](const HloInstruction* inst) { return IsPlaced(inst); })) { + return 0; + } + + CHECK_GT(memory_reduced, 0); + // Return the inverse of the benefit of rematerialization. + return memory_limit_bytes / memory_reduced; + } + // Finishes the placement of the current instruction. This frees any dead // operands or dead result of the instruction. This must be called after // each call to BeginInstruction. @@ -347,17 +405,28 @@ class MemoryUsageTracker { // if the given instruction is rematerialized. int64 MemoryReducedIfRematerialized(Item* item) const; + // Returns the number of bytes that the current memory usage will be reduced + // if the given instruction is compact. 
+ int64 MemoryReducedIfCompressed(Item* item, const Shape& compact_shape) const; + // Returns the number of bytes that the current memory usage will be reduced // by if the given sequence of instructions is rematerialized. int64 MemoryReducedIfRematerialized(const absl::Span& items) const; + Status AddCompressInstructions(Item* original_item, Item* compressed_item, + Item* uncompressed_item); + // Adjusts memory usage to account for the rematerialization of // original_item for all remaining unplaced uses. The rematerialization // is remat_item. This method should be called after the HLO graph has - // been transformed (rematerialization instruction created and connected to - // uses). + // been transformed (rematerialization instruction created and connected + // to uses). Status AddRematerializedInstruction(Item* original_item, Item* remat_item); + std::pair PickRematerializationCandidate( + const InstructionList& instruction_list, int64 memory_limit_bytes, + absl::flat_hash_map* remat_able); + // Returns whether the given instruction has been placed (BeginInstruction // has been called with 'instruction' as the argument). bool IsPlaced(const HloInstruction* instruction) const { @@ -390,6 +459,9 @@ class MemoryUsageTracker { // The materialized size of the buffer in bytes. const int64 size; + // Shape of the buffer. + Shape shape; + // Whether this buffer is live-out of the computation. bool live_out; @@ -412,19 +484,21 @@ class MemoryUsageTracker { } }; + // Get the compact shape of given hlo instruction. An internal cache is used + // to avoid computing the shape multiple times. + StatusOr GetCompactShape(const HloInstruction* hlo); + // Creates a Buffer representing the given logical buffer. The buffer is added // to buffers_ and a reference is returned. Buffer& CreateBufferFromLogicalBuffer( const LogicalBuffer* logical_buffer, - const TuplePointsToAnalysis& points_to_analysis, - const HloRematerialization::ShapeSizeFunction& size_function, - bool live_out) { + const TuplePointsToAnalysis& points_to_analysis, bool live_out) { bool has_indirect_uses = false; ItemList users = GetUsers(instruction_list_, logical_buffer, points_to_analysis, &has_indirect_uses); return NewBuffer(instruction_list_.GetItem(logical_buffer->instruction()), - size_function(logical_buffer->shape()), std::move(users), - live_out, has_indirect_uses); + logical_buffer->shape(), std::move(users), live_out, + has_indirect_uses); } // Create a new buffer representing a rematerialization of given buffer for @@ -438,7 +512,7 @@ class MemoryUsageTracker { for (Item* use : rematerialized_uses) { CHECK(!use->placed) << use->instruction->name(); } - return NewBuffer(remat_item, original_buffer.size, + return NewBuffer(remat_item, original_buffer.shape, std::move(rematerialized_uses), /*live_out=*/false, /*has_indirect_uses=*/false); } @@ -449,7 +523,8 @@ class MemoryUsageTracker { // different computation. int64 AllocatedSize(BufferId buffer_id) const { const Buffer& buffer = buffers_.at(buffer_id); - HloOpcode def_opcode = buffer.defining_instruction->instruction->opcode(); + HloInstruction* inst = buffer.defining_instruction->instruction; + HloOpcode def_opcode = inst->opcode(); if (buffer.live_out || def_opcode == HloOpcode::kParameter) { return 0; } else { @@ -473,7 +548,7 @@ class MemoryUsageTracker { return absl::c_linear_search(in_progress_uses, buffer_id); } - // Returns whether the given instruction is live at the current program + // Returns whether the given buffer is live at the current program // point. 
bool IsCurrentlyLive(BufferId buffer_id) const { const Buffer& buffer = buffers_[buffer_id]; @@ -481,13 +556,30 @@ buffer.unfinished_user_count > 0); } + // Returns whether the given instruction is live at the current program + // point. + bool IsInstructionCurrentlyLive(Item* instruction) const { + // If the instruction has not started yet, it is not alive. + if (!IsPlaced(instruction->instruction)) { + return false; + } + for (const HloInstruction* user : instruction->instruction->users()) { + if (!IsPlaced(user)) { + // If there is an unplaced user, consider this instruction currently + // live. + return true; + } + } + return false; + } + // Create a new buffer, add it to buffers_, and return a reference. - Buffer& NewBuffer(Item* defining_instruction, int64 size, ItemList&& users, - bool live_out, bool has_indirect_uses) { + Buffer& NewBuffer(Item* defining_instruction, const Shape& shape, + ItemList&& users, bool live_out, bool has_indirect_uses) { int buffer_id = buffers_.size(); - buffers_.push_back(Buffer{buffer_id, defining_instruction, size, live_out, - has_indirect_uses, users, - static_cast<int64>(users.size())}); + buffers_.push_back(Buffer{ + buffer_id, defining_instruction, size_function_(shape), shape, live_out, + has_indirect_uses, users, static_cast<int64>(users.size())}); return buffers_.back(); } @@ -498,6 +590,16 @@ class MemoryUsageTracker { // (BeginInstruction/EndInstruction calls). const InstructionList& instruction_list_; + // Size function that returns the size in bytes of a given buffer. + const HloRematerialization::ShapeSizeFunction& size_function_; + + // Converts a shape into compact form; returns the same shape if the shape is + // already considered compact. + const HloRematerialization::CompactShapeFunction& compact_shape_function_; + + // A map that caches the known compact shape for each instruction. + absl::flat_hash_map<const HloInstruction*, Shape> compact_shape_; + // Memory usage at the currently placed instruction. int64 memory_usage_ = 0; @@ -512,9 +614,13 @@ MemoryUsageTracker::MemoryUsageTracker( const HloComputation* computation, const HloRematerialization::ShapeSizeFunction& size_function, + const HloRematerialization::CompactShapeFunction& compact_shape_function, const TuplePointsToAnalysis& points_to_analysis, const InstructionList& instruction_list) - : computation_(computation), instruction_list_(instruction_list) { + : computation_(computation), + instruction_list_(instruction_list), + size_function_(size_function), + compact_shape_function_(compact_shape_function) { PointsToSet::BufferSet live_out_set = points_to_analysis.GetPointsToSet(computation_->root_instruction()) .CreateFlattenedSet(); @@ -556,7 +662,7 @@ MemoryUsageTracker::MemoryUsageTracker( } } else { buffer = &CreateBufferFromLogicalBuffer( - logical_buffer, points_to_analysis, size_function, + logical_buffer, points_to_analysis, ContainsKey(live_out_set, logical_buffer)); item->buffers_defined.push_back(buffer->id); for (Item* user : buffer->users) { @@ -566,6 +672,14 @@ MemoryUsageTracker::MemoryUsageTracker( logical_buffer_to_buffer_id[logical_buffer] = buffer->id; } + + // Trace the output of each instruction. This is so that we can properly + // track which outputs GTE instructions have. 
+ for (const LogicalBuffer* logical_buffer : + points_to_analysis.GetPointsToSet(instruction).CreateFlattenedSet()) { + item->buffers_output.push_back( + logical_buffer_to_buffer_id[logical_buffer]); + } } XLA_VLOG_LINES(10, ToString()); DCHECK(Check()); @@ -611,7 +725,8 @@ Status MemoryUsageTracker::EndInstruction() { // Buffer is now dead. VLOG(3) << " " << buffer.ToString() << " is now dead."; memory_usage_ -= AllocatedSize(buffer_id); - CHECK_GE(memory_usage_, 0); + // The memory usage can become negative inside the computation as we can + // free up the parameter space and reuse it for other tensors. } } @@ -622,7 +737,8 @@ Status MemoryUsageTracker::EndInstruction() { if (buffer.unfinished_user_count == 0) { VLOG(3) << " " << buffer.ToString() << " is immediately dead."; memory_usage_ -= AllocatedSize(buffer_id); - CHECK_GE(memory_usage_, 0); + // The memory usage can become negative inside the computation as we can + // free up the parameter space and reuse it for other tensors. } } @@ -637,6 +753,30 @@ Status MemoryUsageTracker::EndInstruction() { return Status::OK(); } +int64 MemoryUsageTracker::MemoryReducedIfCompressed( + Item* item, const Shape& compact_shape) const { + CHECK_NE(in_progress_item_, nullptr); + if (!item->placed || item == in_progress_item_) { + return 0; + } + + int64 memory_reduced = 0; + + // We only compress a single piece of an output at one time. + CHECK_EQ(item->buffers_output.size(), 1); + BufferId buffer_id = item->buffers_output[0]; + if (IsCurrentlyLive(buffer_id) && !IsInUse(buffer_id) && + IsInstructionCurrentlyLive(item)) { + const Buffer& buffer = buffers_.at(buffer_id); + memory_reduced += buffer.size; + + int64 compact_shape_size = size_function_(compact_shape); + // Account for buffers that are compressed after instruction. + memory_reduced -= compact_shape_size; + } + return memory_reduced; +} + int64 MemoryUsageTracker::MemoryReducedIfRematerialized(Item* item) const { CHECK_NE(in_progress_item_, nullptr); if (!item->placed || item == in_progress_item_) { @@ -736,6 +876,56 @@ int64 MemoryUsageTracker::MemoryReducedIfRematerialized( return memory_reduced; } +Status MemoryUsageTracker::AddCompressInstructions(Item* original_item, + Item* compressed_item, + Item* uncompressed_item) { + // Original buffer is now dead. + memory_usage_ -= size_function_(original_item->instruction->shape()); + // Compressed buffer is now alive. 
+ memory_usage_ += size_function_(compressed_item->instruction->shape()); + + ItemList placed_users; + ItemList unplaced_users; + CHECK_EQ(original_item->buffers_output.size(), 1); + BufferId original_buffer_id = original_item->buffers_output[0]; + Buffer& original_buffer = buffers_.at(original_buffer_id); + for (Item* user : original_buffer.users) { + if (user->placed) { + CHECK(IsFinished(user)) << user->instruction->name(); + placed_users.push_back(user); + } else { + unplaced_users.push_back(user); + } + } + original_buffer.users = std::move(placed_users); + original_buffer.unfinished_user_count = 0; + original_buffer.users.push_back(compressed_item); + Buffer& compressed_buffer = + NewBuffer(compressed_item, compressed_item->instruction->shape(), + {uncompressed_item}, /*live_out=*/false, + /*has_indirect_uses=*/false); + compressed_item->buffers_used = original_item->buffers_output; + compressed_item->buffers_output = {compressed_buffer.id}; + compressed_item->buffers_defined.push_back(compressed_buffer.id); + + Buffer& uncompressed_buffer = + NewBuffer(uncompressed_item, uncompressed_item->instruction->shape(), + std::move(unplaced_users), /*live_out=*/false, + /*has_indirect_uses=*/false); + + uncompressed_item->buffers_used = {compressed_item->buffers_output[0]}; + uncompressed_item->buffers_output = {uncompressed_buffer.id}; + uncompressed_item->buffers_defined = {uncompressed_buffer.id}; + + for (Item* user : uncompressed_buffer.users) { + BufferIdList& buffers_used = user->buffers_used; + std::replace(buffers_used.begin(), buffers_used.end(), original_buffer_id, + uncompressed_buffer.id); + } + + return Status::OK(); +} + Status MemoryUsageTracker::AddRematerializedInstruction(Item* original_item, Item* remat_item) { VLOG(3) << "AddRematerializedInstruction: original_instruction = " @@ -831,6 +1021,17 @@ string MemoryUsageTracker::ToString() const { return output; } +StatusOr MemoryUsageTracker::GetCompactShape(const HloInstruction* hlo) { + auto it = compact_shape_.find(hlo); + if (it != compact_shape_.end()) { + return it->second; + } + const Shape& original_shape = hlo->shape(); + TF_ASSIGN_OR_RETURN(Shape min_shape, compact_shape_function_(original_shape)); + compact_shape_[hlo] = min_shape; + return min_shape; +} + bool MemoryUsageTracker::Check() const { auto elements_are_unique = [](const BufferIdList& vec) { return vec.size() == std::set(vec.begin(), vec.end()).size(); @@ -917,12 +1118,15 @@ int64 RematerializationCost(const HloInstruction* instruction, // candidate which reduce memory use at the program point of the current // instruction as indicated by memory_tracker. nullptr is returned if no // candidate can be found. -Item* PickRematerializationCandidate( - const MemoryUsageTracker& memory_tracker, +std::pair +MemoryUsageTracker::PickRematerializationCandidate( const InstructionList& instruction_list, int64 memory_limit_bytes, absl::flat_hash_map* remat_able) { Item* best_item = nullptr; int64 best_cost = 0; + RematStrategy best_strategy; + + VLOG(5) << "Picking candidate"; // TODO(b/35244891): This is currently quadratic in the number of HLO // instructions. @@ -947,44 +1151,215 @@ Item* PickRematerializationCandidate( if (!CanBeRematerialized(candidate, remat_able)) { VLOG(5) << "candidate " << candidate->name() << " not viable: is not rematerializable"; + continue; } - // If any of the candidate's control successor has been placed, we need to - // skip this candidate. Otherwise we will violate control dependency. 
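The kCompress path above trades a full-size buffer for a copy into a compact layout plus a copy back, and PickRematerializationCandidate scores it as memory_limit_bytes / memory_reduced, where memory_reduced is the byte size of the original shape minus that of the compact shape. A minimal sketch of that arithmetic follows, assuming a padded size function like the ShapeSizePadMinorTo64 helper used by the tests later in this patch; PaddedBytesF32 and the concrete dimensions are illustrative only, not the pass's API.

```c++
// Sketch of the saving MemoryReducedIfCompressed credits for one buffer.
#include <cstdint>
#include <iostream>

// Bytes of an f32[d0,d1]{1,0} buffer when the most-minor dimension is padded
// up to a multiple of 64, mirroring ShapeSizePadMinorTo64 in the tests.
int64_t PaddedBytesF32(int64_t d0, int64_t d1) {
  int64_t padded_minor = ((d1 + 63) / 64) * 64;
  return 4 * d0 * padded_minor;
}

int main() {
  // f32[64,2]{1,0}: padding the minor dimension 2 up to 64 wastes most bytes.
  int64_t original = PaddedBytesF32(64, 2);     // 64 * 64 * 4 = 16384
  // Compact form swaps the two minor dimensions: effectively f32[2,64]{1,0}.
  int64_t compact = PaddedBytesF32(2, 64);      // 2 * 64 * 4 = 512
  int64_t memory_reduced = original - compact;  // what the tracker credits
  int64_t memory_limit_bytes = 30 * 1024;
  // Lower cost == better candidate, as in PickRematerializationCandidate.
  int64_t cost = memory_limit_bytes / memory_reduced;
  std::cout << "saved " << memory_reduced << " bytes, cost " << cost << "\n";
  return 0;
}
```

Under these sizes, a live pair of f32[64,2] buffers already exceeds the 30 * 1024 limit, which is consistent with the SingleRemat test below expecting reduce.1 to read broadcast.0 through a copy/copy pair.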
- bool control_successor_placed = - std::any_of(candidate->control_successors().begin(), - candidate->control_successors().end(), - [&memory_tracker](const HloInstruction* inst) { - return memory_tracker.IsPlaced(inst); - }); + if (item->buffers_output.size() == 1) { + // Only consider compressing single output instruction. + const Buffer& output_buffer = buffers_.at(item->buffers_output[0]); + + if (item->placed && item != in_progress_item_ && + !output_buffer.live_out) { + const Shape& original_shape = item->instruction->shape(); + if (original_shape.IsArray()) { + Shape compact_shape = GetCompactShape(item->instruction).ValueOrDie(); + const int64 memory_reduced = + MemoryReducedIfCompressed(item, compact_shape); + if (memory_reduced > 0) { + const int64 cost = memory_limit_bytes / memory_reduced; + if (best_item == nullptr || cost < best_cost) { + VLOG(3) << "candidate " << candidate->name() << "(" + << candidate->ToShortString() << ")" + << " now best when compressed into " + << compact_shape.ToString(true); + RematStrategy strategy; + strategy.kind = RematStrategy::kCompress; + best_strategy = strategy; + best_strategy.compact_shape = compact_shape; + best_item = item; + best_cost = cost; + } + } + } + } + } + + // If any of the candidate's control successor has been placed, we need + // to skip this candidate. Otherwise we will violate control dependency. + bool control_successor_placed = std::any_of( + candidate->control_successors().begin(), + candidate->control_successors().end(), + [this](const HloInstruction* inst) { return IsPlaced(inst); }); if (control_successor_placed) { continue; } - const int64 memory_reduced = - memory_tracker.MemoryReducedIfRematerialized(item); + const int64 memory_reduced = MemoryReducedIfRematerialized(item); - if (memory_reduced <= 0) { - VLOG(5) << "candidate " << candidate->name() - << " memory reduced = " << memory_reduced << " <= 0"; - continue; - } + if (memory_reduced > 0) { + const int cost = + RematerializationCost(candidate, memory_reduced, memory_limit_bytes); - const int cost = RematerializationCost(candidate, memory_tracker, - memory_reduced, memory_limit_bytes); + VLOG(5) << "candidate " << candidate->name() << ", memory reduced " + << memory_reduced << ", cost per byte " << cost; - VLOG(5) << "candidate " << candidate->name() << ", memory reduced " - << memory_reduced << ", cost per byte " << cost; - - if (best_item == nullptr || cost < best_cost) { - VLOG(5) << "candidate " << candidate->name() << " now best"; - best_item = item; - best_cost = cost; + if (best_item == nullptr || cost < best_cost) { + VLOG(5) << "candidate " << candidate->name() << " now best"; + best_strategy.kind = RematStrategy::kRecompute; + best_item = item; + best_cost = cost; + } } } - return best_item; + return {best_item, best_strategy}; +} + +StatusOr RematerializeInstruction( + MemoryUsageTracker* memory_tracker, Item* best_item, + absl::flat_hash_set* remat_move_instructions, + InstructionList* instruction_list) { + HloInstruction* best = best_item->instruction; + VLOG(1) << "Rematerializing instruction " << best->name() << " (saving " + << HumanReadableNumBytes( + memory_tracker->MemoryReducedIfRematerialized(best_item)) + << ")"; + + int64 net_instructions_added = 0; + + HloComputation* computation = best->parent(); + + HloInstruction* remat = + computation->AddInstruction(best->Clone(/*suffix=*/"remat")); + + // Add control dependencies to the new operation. 
+ for (auto successor : best->control_successors()) { + TF_RETURN_IF_ERROR(remat->AddControlDependencyTo(successor)); + } + for (auto predecessor : best->control_predecessors()) { + TF_RETURN_IF_ERROR(predecessor->AddControlDependencyTo(remat)); + } + + Item* remat_item = instruction_list->CreateItem(remat); + + // Replace each remaining use of 'best' with the rematerialization. + std::vector best_users_copy = best->users(); + for (HloInstruction* user : best_users_copy) { + if (!memory_tracker->IsPlaced(user)) { + VLOG(2) << " Replacing use of " << best->name() << " in " << user->name() + << " with " << remat->name(); + TF_RETURN_IF_ERROR(best->ReplaceUseWith(user, remat)); + } + } + + // Account for the rematerialization in the memory tracker. + TF_RETURN_IF_ERROR( + memory_tracker->AddRematerializedInstruction(best_item, remat_item)); + + // Insert rematerialized instruction right before the earliest unplaced + // use of the instruction *and* the earliest unplaced last use of any + // operands of remat. Unplaced uses of the remat's operands are included + // because we don't want to extend the live range of remat's operands as + // this could increase memory usage. + ItemList place_before; + for (auto user : remat->users()) { + place_before.push_back(instruction_list->GetItem(user)); + } + for (auto* operand : remat->operands()) { + for (auto* operand_user : operand->users()) { + if (operand_user != remat) { + Item* operand_user_item = instruction_list->GetItem(operand_user); + if (!operand_user_item->placed) { + place_before.push_back(operand_user_item); + } + } + } + } + // Insert rematerialized instruction before any of its successors to + // preserve ordering regarding control dependency. + for (auto successor : remat->control_successors()) { + Item* successor_item = instruction_list->GetItem(successor); + // Assert to make sure we never remat an operation with control + // successor already placed. + CHECK(!successor_item->placed) << successor_item->instruction->name(); + place_before.push_back(successor_item); + } + instruction_list->InsertBeforeInstructions(remat_item, place_before); + + // If the rematerialized instruction is dead then rematerialization is + // essentially a move. Don't delete the instruction now because we don't + // want duplicate HloInstruction* values during the course of the + // transformation because we keep maps with HloInstruction* values as + // keys. + if (best->users().empty()) { + VLOG(2) << best->name() << " is now dead"; + if (ContainsKey(*remat_move_instructions, best)) { + // Previously, 'best' was a rematerialization which killed the + // instruction it was a copying of. Now 'remat' is a rematerialization + // of 'best' and kills 'best'. Stop rematerializing this instruction + // to avoid an infinite loop. 
+ instruction_list->Blacklist(remat); + } + remat_move_instructions->insert(remat); + + } else { + net_instructions_added++; + } + return net_instructions_added; +} + +StatusOr CompressInstruction(MemoryUsageTracker* memory_tracker, + Item* best_item, const Shape& compact_shape, + InstructionList* instruction_list) { + HloInstruction* best = best_item->instruction; + VLOG(5) << "Transposing instruction " << best->name() << " (saving " + << HumanReadableNumBytes(memory_tracker->MemoryReducedIfCompressed( + best_item, compact_shape)) + << ") to" << compact_shape.ToString(true); + + HloComputation* computation = best->parent(); + + HloInstruction* compressed = computation->AddInstruction( + HloInstruction::CreateUnary(compact_shape, HloOpcode::kCopy, best)); + + HloInstruction* uncompressed = computation->AddInstruction( + HloInstruction::CreateUnary(best->shape(), HloOpcode::kCopy, compressed)); + + Item* compressed_item = instruction_list->CreateItem(compressed); + compressed_item->placed = true; + + Item* uncompressed_item = instruction_list->CreateItem(uncompressed); + + // Replace each remaining use of 'best' with the uncompressed. + std::vector best_users_copy = best->users(); + for (HloInstruction* user : best_users_copy) { + if (!memory_tracker->IsPlaced(user)) { + VLOG(5) << " Replacing use of " << best->name() << " in " << user->name() + << " with " << uncompressed->name(); + TF_RETURN_IF_ERROR(best->ReplaceUseWith(user, uncompressed)); + } + } + + // Account for the rematerialization in the memory tracker. + TF_RETURN_IF_ERROR(memory_tracker->AddCompressInstructions( + best_item, compressed_item, uncompressed_item)); + + // Insert rematerialized instruction right before the earliest unplaced + // use of the instruction. + ItemList place_before; + for (auto user : uncompressed->users()) { + place_before.push_back(instruction_list->GetItem(user)); + } + + instruction_list->Blacklist(compressed_item->instruction); + instruction_list->Blacklist(uncompressed_item->instruction); + + instruction_list->InsertBeforeInstructions(uncompressed_item, place_before); + + instruction_list->InsertAfterInstructions(compressed_item, {best_item}); + + return 2; } } // namespace @@ -993,7 +1368,8 @@ StatusOr HloRematerialization::ComputePeakMemory( const HloComputation* computation, const HloInstructionSequence& order) const { InstructionList instruction_list(order); - MemoryUsageTracker tracker(computation, size_function_, *points_to_analysis_, + MemoryUsageTracker tracker(computation, size_function_, + compact_shape_function_, *points_to_analysis_, instruction_list); int64 peak_memory = tracker.memory_usage(); for (auto* item = instruction_list.first(); item != nullptr; @@ -1037,6 +1413,7 @@ StatusOr HloRematerialization::RematerializeComputation( InstructionList instruction_list(schedule->sequence(computation)); MemoryUsageTracker memory_tracker(computation, size_function_, + compact_shape_function_, *points_to_analysis_, instruction_list); bool changed = false; @@ -1086,8 +1463,11 @@ StatusOr HloRematerialization::RematerializeComputation( callee_usage) << ", limit is " << HumanReadableNumBytes(memory_limit_bytes); - Item* best_item = PickRematerializationCandidate( - memory_tracker, instruction_list, memory_limit_bytes, &remat_able); + Item* best_item; + RematStrategy best_strategy; + std::tie(best_item, best_strategy) = + memory_tracker.PickRematerializationCandidate( + instruction_list, memory_limit_bytes, &remat_able); if (best_item == nullptr) { VLOG(3) << "Unable to find rematerialization 
candidate at program " @@ -1099,88 +1479,33 @@ StatusOr HloRematerialization::RematerializeComputation( } HloInstruction* best = best_item->instruction; - VLOG(1) << "Rematerializing instruction " << best->name() << " (saving " - << HumanReadableNumBytes( - memory_tracker.MemoryReducedIfRematerialized(best_item)) - << ")"; changed = true; remat_count++; - HloInstruction* remat = - computation->AddInstruction(best->Clone(/*suffix=*/"remat")); + int64 added_instruction = 0; + if (best_strategy.kind == RematStrategy::kCompress) { + VLOG(1) << "Compressing instruction " << best->name() << " (saving " + << HumanReadableNumBytes( + memory_tracker.MemoryReducedIfCompressed( + best_item, best_strategy.compact_shape)) + << ")"; - // Add control dependencies to the new operation. - for (auto successor : best->control_successors()) { - TF_RETURN_IF_ERROR(remat->AddControlDependencyTo(successor)); - } - for (auto predecessor : best->control_predecessors()) { - TF_RETURN_IF_ERROR(predecessor->AddControlDependencyTo(remat)); - } - - Item* remat_item = instruction_list.CreateItem(remat); - - // Replace each remaining use of 'best' with the rematerialization. - std::vector best_users_copy = best->users(); - for (HloInstruction* user : best_users_copy) { - if (!memory_tracker.IsPlaced(user)) { - VLOG(2) << " Replacing use of " << best->name() << " in " - << user->name() << " with " << remat->name(); - TF_RETURN_IF_ERROR(best->ReplaceUseWith(user, remat)); - } - } - - // Account for the rematerialization in the memory tracker. - TF_RETURN_IF_ERROR( - memory_tracker.AddRematerializedInstruction(best_item, remat_item)); - - // Insert rematerialized instruction right before the earliest unplaced - // use of the instruction *and* the earliest unplaced last use of any - // operands of remat. Unplaced uses of the remat's operands are included - // because we don't want to extend the live range of remat's operands as - // this could increase memory usage. - ItemList place_before; - for (auto user : remat->users()) { - place_before.push_back(instruction_list.GetItem(user)); - } - for (auto* operand : remat->operands()) { - for (auto* operand_user : operand->users()) { - if (operand_user != remat) { - Item* operand_user_item = instruction_list.GetItem(operand_user); - if (!operand_user_item->placed) { - place_before.push_back(operand_user_item); - } - } - } - } - // Insert rematerialized instruction before any of its successors to - // preserve ordering regarding control dependency. - for (auto successor : remat->control_successors()) { - Item* successor_item = instruction_list.GetItem(successor); - // Assert to make sure we never remat an operation with control - // successor already placed. - CHECK(!successor_item->placed) << successor_item->instruction->name(); - place_before.push_back(successor_item); - } - instruction_list.InsertBeforeInstructions(remat_item, place_before); - - // If the rematerialized instruction is dead then rematerialization is - // essentially a move. Don't delete the instruction now because we don't - // want duplicate HloInstruction* values during the course of the - // transformation because we keep maps with HloInstruction* values as - // keys. - if (best->users().empty()) { - VLOG(2) << best->name() << " is now dead"; - if (ContainsKey(remat_move_instructions, best)) { - // Previously, 'best' was a rematerialization which killed the - // instruction it was a copying of. Now 'remat' is a rematerialization - // of 'best' and kills 'best'. 
Stop rematerializing this instruction - // to avoid an infinite loop. - instruction_list.Blacklist(remat); - } - remat_move_instructions.insert(remat); + TF_ASSIGN_OR_RETURN(added_instruction, + CompressInstruction(&memory_tracker, best_item, + best_strategy.compact_shape, + &instruction_list)); } else { - net_instructions_added++; + VLOG(1) << "Rematerializing instruction " << best->name() << " (saving " + << HumanReadableNumBytes( + memory_tracker.MemoryReducedIfRematerialized(best_item)) + << ")"; + + TF_ASSIGN_OR_RETURN(added_instruction, + RematerializeInstruction(&memory_tracker, best_item, + &remat_move_instructions, + &instruction_list)); } + net_instructions_added += added_instruction; VLOG(1) << "memory_usage after rematerialization = " << HumanReadableNumBytes(memory_tracker.memory_usage()); @@ -1226,7 +1551,6 @@ StatusOr HloRematerialization::RematerializeComputation( } // Verify some invariants on the memory tracker. - CHECK_EQ(memory_tracker.memory_usage(), 0); for (auto* instruction : computation->instructions()) { CHECK(memory_tracker.IsPlaced(instruction)) << instruction->name(); } @@ -1281,11 +1605,7 @@ StatusOr HloRematerialization::Run(HloModule* module) { module->result_shape(), [&module_output_size, module, this](const Shape& subshape, const ShapeIndex& output_index) { - if (!module->input_output_alias_config().OutputHasAlias(output_index)) { - // Only account for non-aliased outputs to avoid double counting a - // parameter buffer twice. - module_output_size += size_function_(subshape); - } + module_output_size += size_function_(subshape); }); const int64 adjusted_memory_limit_bytes = @@ -1361,7 +1681,7 @@ StatusOr HloRematerialization::Run(HloModule* module) { sizes_->after_bytes = current_peak_memory; } - XLA_VLOG_LINES(3, "After HloRematerialization:\n" + module->ToString()); + XLA_VLOG_LINES(5, "After HloRematerialization:\n" + module->ToString()); if (current_peak_memory > memory_limit_bytes_) { LOG(WARNING) << absl::StrFormat( diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.h b/tensorflow/compiler/xla/service/hlo_rematerialization.h index 350cf0f8e8f..9ab34b4862d 100644 --- a/tensorflow/compiler/xla/service/hlo_rematerialization.h +++ b/tensorflow/compiler/xla/service/hlo_rematerialization.h @@ -24,6 +24,8 @@ #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_schedule.h" #include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h" +#include "tensorflow/compiler/xla/shape.h" +#include "tensorflow/compiler/xla/statusor.h" namespace xla { @@ -38,6 +40,8 @@ class HloRematerialization : public HloModulePass { public: using ShapeSizeFunction = std::function; + using CompactShapeFunction = std::function(const Shape&)>; + // Helper struct that communicates the before / after sizes for the // rematerialization process. struct RematerializationSizes { @@ -45,23 +49,34 @@ class HloRematerialization : public HloModulePass { int64 after_bytes; }; + static Shape DefaultCompactShapeFunction(const Shape& shape) { return shape; } + // Constructor parameters: // // size_function: Function which returns the size in bytes of the top-level // buffer of the given shape. // // memory_limit_bytes: The threshold number of bytes to reduce memory use to - // via rematerialization. + // via rematerialization. Size of aliased outputs should be subtracted + // from this. // // sizes: Pointer to data structure which records the peak memory usage of // the HLO module before/after rematerialization. 
Values are set during // Run(). Can be nullptr. - HloRematerialization(const ShapeSizeFunction& size_function, - int64 memory_limit_bytes, RematerializationSizes* sizes) + // + // compact_shape_function: Function which returns the compact form of a + // shape. If nullptr is provided, a default identity function is used. + explicit HloRematerialization( + const ShapeSizeFunction& size_function, int64 memory_limit_bytes, + RematerializationSizes* sizes, + CompactShapeFunction compact_shape_function = nullptr) : size_function_(size_function), memory_limit_bytes_(memory_limit_bytes), - sizes_(sizes) {} - ~HloRematerialization() {} + sizes_(sizes), + compact_shape_function_(compact_shape_function == nullptr + ? DefaultCompactShapeFunction + : std::move(compact_shape_function)) {} + ~HloRematerialization() override = default; absl::string_view name() const override { return "rematerialization"; } @@ -108,6 +123,10 @@ class HloRematerialization : public HloModulePass { // module before/after rematerialization RematerializationSizes* sizes_; + // Converts a shape into compact form; returns the same shape if the shape is + // already considered compact. + const CompactShapeFunction compact_shape_function_; + // Call graph of the hlo_module. std::unique_ptr<CallGraph> call_graph_; diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc b/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc index 987177e40b8..dabd9d20f64 100644 --- a/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc +++ b/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc @@ -27,7 +27,6 @@ limitations under the License. #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/types.h" -#include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/status_test_util.h" namespace xla { @@ -534,6 +533,142 @@ TEST_P(IndirectUseTest, IndirectUseNotRematerialized) { INSTANTIATE_TEST_SUITE_P(IndirectUseTestInstantiation, IndirectUseTest, ::testing::Values(true, false)); +class CompressingRematerializationTest : public RematerializationTestBase { + protected: + // A special shape size function, which pads the most minor dimension to 64. + static int64 ShapeSizePadMinorTo64(const Shape& shape) { + if (shape.IsTuple()) { + // Size of a tuple is 4 bytes. + return 4; + } + Shape descending_shape = + ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(shape); + int64 size = + ShapeUtil::ByteSizeOfPrimitiveType(descending_shape.element_type()); + for (int64 i = 0; i < descending_shape.rank(); ++i) { + int64 dim = shape.dimensions(i); + if (i == descending_shape.rank() - 1) { + dim = RoundUpToNearest(dim, 64); + } + size *= dim; + } + return size; + } + + // Swap the two most-minor dimensions if the second-minor dimension is bigger + // than the most-minor dimension. 
+ static StatusOr ChooseCompactLayoutForShape(const Shape& shape) { + Shape result = shape; + Layout layout = result.layout(); + int64 most_minor_index = layout.minor_to_major()[0]; + int64 second_minor_index = layout.minor_to_major()[1]; + int64 most_minor = result.dimensions(most_minor_index); + int64 second_minor = result.dimensions(second_minor_index); + if (most_minor < second_minor) { + result.set_dimensions(most_minor_index, second_minor); + result.set_dimensions(second_minor_index, most_minor); + } + return result; + } + + StatusOr RunHloRematerialization(int64 memory_limit_bytes, + HloModule* module) { + TF_EXPECT_OK(verifier().Run(module).status()); + HloRematerialization remat(ShapeSizePadMinorTo64, memory_limit_bytes, + /*sizes=*/nullptr, ChooseCompactLayoutForShape); + return remat.Run(module); + } +}; + +// Test rematerialization of a single instruction. +TEST_F(CompressingRematerializationTest, SingleRemat) { + const string& hlo_string = R"( +HloModule fusion, is_scheduled=true + +%add_float { + %x = f32[] parameter(0) + %y = f32[] parameter(1) + ROOT %add = f32[] add(f32[] %x, f32[] %y) +} + +ENTRY %entry { + %param.0 = f32[] parameter(0) + %constant = f32[] constant(0) + %broadcast.0 = f32[64,2]{1,0} broadcast(f32[] %param.0), dimensions={} + %negate = f32[64,2]{1,0} negate(f32[64,2]{1,0} broadcast.0) + %reduce.0 = f32[] reduce(f32[64,2]{1,0} %negate, f32[] %constant), dimensions={1, 0}, to_apply=%add_float + %reduce.1 = f32[] reduce(f32[64,2]{1,0} %broadcast.0, f32[] %constant), dimensions={1, 0}, to_apply=%add_float + %add = f32[] add(f32[] %reduce.0, f32[] %reduce.1) +} +)"; + + TF_ASSERT_OK_AND_ASSIGN( + auto module, + HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest())); + + TF_ASSERT_OK_AND_ASSIGN(bool changed, + RunHloRematerialization( + /*memory_limit_bytes=*/30 * 1024, module.get())); + EXPECT_TRUE(changed); + HloInstruction* broadcast = + module->entry_computation()->GetInstructionWithName("broadcast.0"); + HloInstruction* reduce = + module->entry_computation()->GetInstructionWithName("reduce.1"); + EXPECT_THAT(reduce, + op::Reduce(op::Copy(op::Copy(broadcast)), op::Constant())); +} + +TEST_F(CompressingRematerializationTest, AllUsersUseSameCopy) { + const string& hlo_string = R"( +HloModule fusion, is_scheduled=true + +%add_float { + %x = f32[] parameter(0) + %y = f32[] parameter(1) + ROOT %add = f32[] add(f32[] %x, f32[] %y) +} + +ENTRY %entry { + %param.0 = f32[] parameter(0) + %constant = f32[] constant(0) + %broadcast.0 = f32[64,2]{1,0} broadcast(f32[] %param.0), dimensions={} + %negate = f32[64,2]{1,0} negate(f32[64,2]{1,0} broadcast.0) + %reduce.0 = f32[] reduce(f32[64,2]{1,0} %negate, f32[] %constant), dimensions={1, 0}, to_apply=%add_float + %reduce.1 = f32[] reduce(f32[64,2]{1,0} %negate, f32[] %constant), dimensions={1, 0}, to_apply=%add_float + %reduce.2 = f32[] reduce(f32[64,2]{1,0} %broadcast.0, f32[] %constant), dimensions={1, 0}, to_apply=%add_float + %add = f32[] add(f32[] %reduce.0, f32[] %reduce.1) + %reduce.3 = f32[] reduce(f32[64,2]{1,0} %broadcast.0, f32[] %constant), dimensions={1, 0}, to_apply=%add_float + %add.2 = f32[] add(f32[] %reduce.2, f32[] %reduce.3) + ROOT %tuple = (f32[], f32[]) tuple (f32[] add, f32[] add.2) +} +)"; + + TF_ASSERT_OK_AND_ASSIGN( + auto module, + HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest())); + + TF_ASSERT_OK_AND_ASSIGN(bool changed, + RunHloRematerialization( + /*memory_limit_bytes=*/30 * 1024, module.get())); + EXPECT_TRUE(changed); + + HloInstruction* broadcast = 
+ module->entry_computation()->GetInstructionWithName("broadcast.0"); + + // Both reduces reuse the same copy instruction. + HloInstruction* reduce_2 = + module->entry_computation()->GetInstructionWithName("reduce.2"); + + HloInstruction* reduce_3 = + module->entry_computation()->GetInstructionWithName("reduce.3"); + + EXPECT_THAT(reduce_2, + op::Reduce(op::Copy(op::Copy(broadcast)), op::Constant())); + + EXPECT_THAT(reduce_3, + op::Reduce(op::Copy(op::Copy(broadcast)), op::Constant())); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_runner.cc b/tensorflow/compiler/xla/service/hlo_runner.cc index 154cf7fc44f..daeb5943fda 100644 --- a/tensorflow/compiler/xla/service/hlo_runner.cc +++ b/tensorflow/compiler/xla/service/hlo_runner.cc @@ -208,13 +208,13 @@ StatusOr HloRunner::ExecuteWithDeviceBuffers( ServiceExecutableRunOptions service_run_options = GetServiceRunOptionsForDevice(backend().default_device_ordinal(), &stream, nullptr, RunId()); + service_run_options.mutable_run_options()->set_execution_profile(profile); TF_ASSIGN_OR_RETURN(std::unique_ptr executable, CreateExecutable(std::move(module), run_hlo_passes)); TF_ASSIGN_OR_RETURN( ScopedShapedBuffer retval, - executable->ExecuteOnStreamWrapper(&service_run_options, - /*profile=*/profile, arguments)); + executable->ExecuteOnStreamWrapper(&service_run_options, arguments)); TF_RETURN_IF_ERROR(stream.BlockHostUntilDone()); return std::move(retval); } @@ -244,11 +244,11 @@ StatusOr HloRunner::ExecuteWithDeviceBuffers( ServiceExecutableRunOptions service_run_options = GetServiceRunOptionsForDevice(backend().default_device_ordinal(), &stream, nullptr, RunId()); + service_run_options.mutable_run_options()->set_execution_profile(profile); TF_ASSIGN_OR_RETURN( ScopedShapedBuffer retval, - executable->ExecuteOnStreamWrapper(&service_run_options, - /*profile=*/profile, arguments)); + executable->ExecuteOnStreamWrapper(&service_run_options, arguments)); TF_RETURN_IF_ERROR(stream.BlockHostUntilDone()); return std::move(retval); } diff --git a/tensorflow/compiler/xla/service/interpreter/BUILD b/tensorflow/compiler/xla/service/interpreter/BUILD index ae7ccadbf97..1551870f734 100644 --- a/tensorflow/compiler/xla/service/interpreter/BUILD +++ b/tensorflow/compiler/xla/service/interpreter/BUILD @@ -1,5 +1,5 @@ load( - "//tensorflow/core:platform/default/build_config_root.bzl", + "//tensorflow/core/platform:default/build_config_root.bzl", "if_static", ) diff --git a/tensorflow/compiler/xla/service/interpreter/compiler.cc b/tensorflow/compiler/xla/service/interpreter/compiler.cc index 80a3ebccff1..85768225892 100644 --- a/tensorflow/compiler/xla/service/interpreter/compiler.cc +++ b/tensorflow/compiler/xla/service/interpreter/compiler.cc @@ -102,13 +102,6 @@ StatusOr> InterpreterCompiler::RunHloPasses( return std::move(hlo_module); } -Status InterpreterCompiler::RunHloPassesOnModuleGroup( - HloModuleGroup* module_group, - absl::Span executors, - se::DeviceMemoryAllocator* device_allocator) { - return Unimplemented("Module group compilation not supported on Interpreter"); -} - StatusOr> InterpreterCompiler::RunBackend( std::unique_ptr hlo_module, se::StreamExecutor* stream_exec, se::DeviceMemoryAllocator* /*device_allocator*/) { @@ -133,15 +126,6 @@ StatusOr> InterpreterCompiler::RunBackend( return std::move(executable); } -StatusOr>> -InterpreterCompiler::RunBackendOnModuleGroup( - std::unique_ptr module_group, - std::vector> stream_exec, - se::DeviceMemoryAllocator* device_allocator) { - return Unimplemented( - 
"Module group compilation is not supported on Interpreter."); -} - StatusOr>> InterpreterCompiler::Compile( std::unique_ptr module_group, std::vector> stream_exec, diff --git a/tensorflow/compiler/xla/service/interpreter/compiler.h b/tensorflow/compiler/xla/service/interpreter/compiler.h index dc83295b527..824594dfd84 100644 --- a/tensorflow/compiler/xla/service/interpreter/compiler.h +++ b/tensorflow/compiler/xla/service/interpreter/compiler.h @@ -46,19 +46,9 @@ class InterpreterCompiler : public Compiler { StatusOr> RunHloPasses( std::unique_ptr hlo_module, se::StreamExecutor* stream_exec, se::DeviceMemoryAllocator* device_allocator) override; - Status RunHloPassesOnModuleGroup( - HloModuleGroup* module_group, - absl::Span executors, - se::DeviceMemoryAllocator* device_allocator) override; - StatusOr> RunBackend( std::unique_ptr hlo_module, se::StreamExecutor* stream_exec, se::DeviceMemoryAllocator* device_allocator) override; - StatusOr>> RunBackendOnModuleGroup( - std::unique_ptr module_group, - std::vector> stream_exec, - se::DeviceMemoryAllocator* device_allocator) override; - StatusOr>> Compile( std::unique_ptr module_group, std::vector> stream_exec, diff --git a/tensorflow/compiler/xla/service/interpreter/executable.cc b/tensorflow/compiler/xla/service/interpreter/executable.cc index 167a013408b..0dab86d986c 100644 --- a/tensorflow/compiler/xla/service/interpreter/executable.cc +++ b/tensorflow/compiler/xla/service/interpreter/executable.cc @@ -45,7 +45,7 @@ InterpreterExecutable::InterpreterExecutable( InterpreterExecutable::~InterpreterExecutable() {} -StatusOr InterpreterExecutable::ExecuteOnStream( +StatusOr InterpreterExecutable::ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, absl::Span arguments, HloExecutionProfile* hlo_execution_profile) { @@ -113,22 +113,15 @@ StatusOr InterpreterExecutable::ExecuteOnStream( uint64 end_micros = tensorflow::Env::Default()->NowMicros(); - { - tensorflow::mutex_lock lock(mutex_); + ExecutionProfile* profile = run_options->run_options().execution_profile(); + if (profile) { const double nanoseconds = (end_micros - start_micros) * 1000.0; - execution_profile_.set_compute_time_ns(std::max(nanoseconds, 1.0)); + profile->set_compute_time_ns(std::max(nanoseconds, 1.0)); } return std::move(result); } -StatusOr InterpreterExecutable::ExecuteAsyncOnStream( - const ServiceExecutableRunOptions* run_options, - absl::Span arguments) { - return tensorflow::errors::Unimplemented( - "ExecuteAsyncOnStream is not yet supported on Interpreter."); -} - /*static*/ int64 InterpreterExecutable::ShapeSizeBytes(const Shape& shape) { if (shape.IsOpaque()) { return sizeof(void*); diff --git a/tensorflow/compiler/xla/service/interpreter/executable.h b/tensorflow/compiler/xla/service/interpreter/executable.h index bda13d37636..ba010de76bd 100644 --- a/tensorflow/compiler/xla/service/interpreter/executable.h +++ b/tensorflow/compiler/xla/service/interpreter/executable.h @@ -46,16 +46,12 @@ class InterpreterExecutable : public Executable { std::unique_ptr evaluator); ~InterpreterExecutable() override; - StatusOr ExecuteOnStream( + StatusOr ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, absl::Span arguments, HloExecutionProfile* hlo_execution_profile) override LOCKS_EXCLUDED(evaluator_lock_); - StatusOr ExecuteAsyncOnStream( - const ServiceExecutableRunOptions* run_options, - absl::Span arguments) override; - static int64 ShapeSizeBytes(const Shape& shape); protected: diff --git 
a/tensorflow/compiler/xla/service/interpreter/executor.h b/tensorflow/compiler/xla/service/interpreter/executor.h index 6d337688a94..43493b6e154 100644 --- a/tensorflow/compiler/xla/service/interpreter/executor.h +++ b/tensorflow/compiler/xla/service/interpreter/executor.h @@ -58,14 +58,14 @@ class XlaInterpreterExecutor : public internal::StreamExecutorInterface { return port::Status::OK(); } - bool GetKernel(const MultiKernelLoaderSpec &spec, - KernelBase *kernel) override { - return false; + port::Status GetKernel(const MultiKernelLoaderSpec &spec, + KernelBase *kernel) override { + return port::UnimplementedError("Not Implemented"); } - bool Launch(Stream *stream, const ThreadDim &thread_dims, - const BlockDim &block_dims, const KernelBase &kernel, - const KernelArgsArrayBase &args) override { - return false; + port::Status Launch(Stream *stream, const ThreadDim &thread_dims, + const BlockDim &block_dims, const KernelBase &kernel, + const KernelArgsArrayBase &args) override { + return port::UnimplementedError("Not Implemented"); } void *Allocate(uint64 size) override; diff --git a/tensorflow/compiler/xla/service/layout_assignment.cc b/tensorflow/compiler/xla/service/layout_assignment.cc index 72ffcd26a72..bf1df58f0b8 100644 --- a/tensorflow/compiler/xla/service/layout_assignment.cc +++ b/tensorflow/compiler/xla/service/layout_assignment.cc @@ -619,8 +619,9 @@ Status LayoutAssignment::AddMandatoryConstraints( TF_RET_CHECK(instruction->branch_computation(j)->num_parameters() == 1); ComputationLayout& branch_computation_layout = FindOrDie(computation_layouts_, instruction->branch_computation(k)); - if (branch_computation_layout.result_layout() != - best_branch_computation_layout.result_layout()) { + if (!branch_computation_layout.result_layout().MatchesLayoutInShape( + best_branch_computation_layout.result_layout().shape(), + /*minor_to_major_only=*/true)) { computation_layouts_.erase(instruction->branch_computation(k)); InsertOrDie(&conditional_mismatch_, instruction->branch_computation(k), @@ -715,8 +716,10 @@ Status CheckConditionalLayout( absl::Span branch_computation_layouts) { for (int j = 0; j < instruction->branch_count(); ++j) { const HloInstruction* branch_operand = instruction->operand(j + 1); - TF_RET_CHECK(branch_computation_layouts[0].result_layout() == - branch_computation_layouts[j].result_layout()); + TF_RET_CHECK( + branch_computation_layouts[0].result_layout().MatchesLayoutInShape( + branch_computation_layouts[j].result_layout().shape(), + /*minor_to_major_only=*/true)); TF_RET_CHECK( branch_computation_layouts[j].result_layout().MatchesLayoutInShape( instruction->shape(), /*minor_to_major_only=*/true)); @@ -853,6 +856,30 @@ Status LayoutAssignment::CopyOperandIfLayoutsDiffer( VLOG(4) << "Operand " << operand->ToString() << " layout does not match " << operand_layout.ToString() << " in " << instruction->ToString(); + // If the operand is only used by a conditional, do the copy inside the branch + // to avoid overhead for other branches. 
+ if (instruction->opcode() == HloOpcode::kConditional && operand_no > 0 && + instruction->operand(operand_no)->user_count() == 1) { + auto branch_comp = instruction->branch_computation(operand_no - 1); + auto param = branch_comp->parameter_instruction(0); + *param->mutable_shape() = operand->shape(); + auto param_users = param->users(); + TF_ASSIGN_OR_RETURN(HloInstruction * param_copy, + CreateCopyWithNewLayout(operand_layout.shape(), param)); + for (auto user : param_users) { + TF_RETURN_IF_ERROR(param->ReplaceUseWithDifferentShape(user, param_copy)); + } + VLOG(4) << "New copy of " << operand->ToString() << " is " + << param_copy->ToString(); + if (param == branch_comp->root_instruction()) { + branch_comp->set_root_instruction(param_copy, + /*accept_different_shape=*/true); + } + *FindOrDie(computation_layouts_, branch_comp).mutable_parameter_layout(0) = + ShapeLayout(operand->shape()); + return Status::OK(); + } + TF_ASSIGN_OR_RETURN(HloInstruction * operand_copy, CreateCopyWithNewLayout(operand_layout.shape(), operand)); diff --git a/tensorflow/compiler/xla/service/layout_assignment_test.cc b/tensorflow/compiler/xla/service/layout_assignment_test.cc index 046ffde7616..7d5a3b6623f 100644 --- a/tensorflow/compiler/xla/service/layout_assignment_test.cc +++ b/tensorflow/compiler/xla/service/layout_assignment_test.cc @@ -819,8 +819,8 @@ TEST_F(LayoutAssignmentTest, InternalErrorOnBitcast) { auto constant0 = builder.AddInstruction( HloInstruction::CreateConstant(LiteralUtil::CreateR2WithLayout( {{1.0, 2.0}, {3.0, 4.0}}, LayoutUtil::MakeLayout({0, 1})))); - builder.AddInstruction(HloInstruction::CreateUnary( - constant0->shape(), HloOpcode::kBitcast, constant0)); + builder.AddInstruction( + HloInstruction::CreateBitcast(constant0->shape(), constant0)); auto m = CreateNewVerifiedModule(); m->AddEntryComputation(builder.Build()); diff --git a/tensorflow/compiler/xla/service/llvm_compiler.cc b/tensorflow/compiler/xla/service/llvm_compiler.cc index 82e955c818e..aa759b26226 100644 --- a/tensorflow/compiler/xla/service/llvm_compiler.cc +++ b/tensorflow/compiler/xla/service/llvm_compiler.cc @@ -21,23 +21,6 @@ limitations under the License. 
#endif namespace xla { -Status LLVMCompiler::RunHloPassesOnModuleGroup( - HloModuleGroup* module_group, - absl::Span executors, - se::DeviceMemoryAllocator* device_allocator) { - return Unimplemented( - "Model partitioning not implemented for the CPU/GPU compilers!"); -} - -StatusOr>> -LLVMCompiler::RunBackendOnModuleGroup( - std::unique_ptr module_group, - std::vector> stream_exec, - se::DeviceMemoryAllocator* device_allocator) { - return Unimplemented( - "Model partitioning not implemented for the CPU/GPU compilers!"); -} - StatusOr>> LLVMCompiler::Compile( std::unique_ptr module_group, std::vector> stream_execs, diff --git a/tensorflow/compiler/xla/service/llvm_compiler.h b/tensorflow/compiler/xla/service/llvm_compiler.h index 888815bea3d..bddda50d3e1 100644 --- a/tensorflow/compiler/xla/service/llvm_compiler.h +++ b/tensorflow/compiler/xla/service/llvm_compiler.h @@ -69,16 +69,6 @@ class LLVMCompiler : public Compiler { using Compiler::RunBackend; using Compiler::RunHloPasses; - Status RunHloPassesOnModuleGroup( - HloModuleGroup* module_group, - absl::Span executors, - se::DeviceMemoryAllocator* device_allocator) override; - - StatusOr>> RunBackendOnModuleGroup( - std::unique_ptr module_group, - std::vector> stream_exec, - se::DeviceMemoryAllocator* device_allocator) override; - StatusOr>> Compile( std::unique_ptr module_group, std::vector> stream_execs, diff --git a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc index ffb2df99e9c..9ffb120bb2d 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc @@ -151,10 +151,9 @@ Status FusedIrEmitter::HandleGetTupleElement( Status FusedIrEmitter::HandleParameter(HloInstruction* parameter) { indexed_generators_[parameter] = [=](const IrArray::Index& index) -> llvm::Value* { - if (tiled_parameter_info_) { - if (llvm::Value* param_tile_buffer = - tiled_parameter_info_->GetBufferForParameter( - parameter->parameter_number())) { + int64 param_num = parameter->parameter_number(); + if (param_shmem_buffers_.size() > param_num) { + if (llvm::Value* param_tile_buffer = param_shmem_buffers_[param_num]) { // TODO(jlebar): Add AA metadata to this load. Tile buffers are global // variables, so LLVM's points-to analysis doesn't help us much. And we // want the AA info to be present before address spaces are inferred @@ -162,13 +161,12 @@ Status FusedIrEmitter::HandleParameter(HloInstruction* parameter) { // address-space-based AA in LLVM, it wouldn't help us much here. 
return b_->CreateLoad( b_->CreateGEP(param_tile_buffer, {index.GetConstantWithIndexType(0), - tiled_parameter_info_->x(), - tiled_parameter_info_->y()}), + tile_param_x_, tile_param_y_}), "tiled_buffer"); } } - return GetIrArrayForFusedParameter(parameter->parameter_number()) - .EmitReadArrayElement(index, b_); + return GetIrArrayForFusedParameter(param_num).EmitReadArrayElement(index, + b_); }; return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h index b1aa6d59634..9b027144cd8 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h +++ b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h @@ -60,10 +60,16 @@ class FusedIrEmitter : public DfsHloVisitorWithDefault { std::function()>; FusedIrEmitter(GeneratorForOperandIrArrays operand_arrays_generator, - ElementalIrEmitter* elemental_emitter) + ElementalIrEmitter* elemental_emitter, + llvm::Value* tile_param_x = nullptr, + llvm::Value* tile_param_y = nullptr, + absl::Span param_shmem_buffers = {}) : operand_arrays_(), operand_arrays_generator_(std::move(operand_arrays_generator)), - tiled_parameter_info_(nullptr), + tile_param_x_(tile_param_x), + tile_param_y_(tile_param_y), + param_shmem_buffers_(param_shmem_buffers.begin(), + param_shmem_buffers.end()), elemental_emitter_(elemental_emitter), b_(elemental_emitter->b()), module_(elemental_emitter->module()) {} @@ -87,10 +93,6 @@ class FusedIrEmitter : public DfsHloVisitorWithDefault { // Returns the generator function for the given instruction. IndexedGenerator GetGenerator(const HloInstruction* instruction) const; - void SetTiledParameterInfo(const llvm_ir::TiledParameterInfo* info) { - tiled_parameter_info_ = info; - } - // Evaluates whether fusing 'producer' into 'consumer' might cause exponential // behavior in FusedIrEmitter. We currently can have exponential time/memory // requirements for emitting certain fusion kernels, in which case we don't @@ -118,7 +120,15 @@ class FusedIrEmitter : public DfsHloVisitorWithDefault { absl::optional> operand_arrays_; GeneratorForOperandIrArrays operand_arrays_generator_; - const llvm_ir::TiledParameterInfo* tiled_parameter_info_; + // The x coordinate within a tile. + llvm::Value* tile_param_x_; + + // The y coordinate within a tile. + llvm::Value* tile_param_y_; + + // Param_buffers_[i] stores the tile buffer for the ith parameter or nullptr + // if the parameter is not tiled. + std::vector param_shmem_buffers_; ElementalIrEmitter* elemental_emitter_; diff --git a/tensorflow/compiler/xla/service/llvm_ir/ir_builder_mixin.h b/tensorflow/compiler/xla/service/llvm_ir/ir_builder_mixin.h index 02c719502ee..5014aa9c8ae 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/ir_builder_mixin.h +++ b/tensorflow/compiler/xla/service/llvm_ir/ir_builder_mixin.h @@ -249,11 +249,26 @@ class IrBuilderMixin { return mixin_builder()->CreateFCmpOEQ(std::forward(args)...); } + template + llvm::Value* FCmpOGT(Args&&... args) { + return mixin_builder()->CreateFCmpOGT(std::forward(args)...); + } + + template + llvm::Value* FCmpOGE(Args&&... args) { + return mixin_builder()->CreateFCmpOGE(std::forward(args)...); + } + template llvm::Value* FCmpOLT(Args&&... args) { return mixin_builder()->CreateFCmpOLT(std::forward(args)...); } + template + llvm::Value* FCmpULT(Args&&... args) { + return mixin_builder()->CreateFCmpULT(std::forward(args)...); + } + template llvm::Value* FCmpOLE(Args&&... 
args) { return mixin_builder()->CreateFCmpOLE(std::forward(args)...); diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc index 2ef844ffa62..f586ee4bd4b 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc @@ -54,6 +54,15 @@ Shape MergeDimensions(absl::Span segs, const Shape& shape) { dimensions); } +std::array ElementWiseCeilOfRatio(std::array dividends, + std::array divisors) { + std::array out; + for (int i = 0; i < 3; i++) { + out[i] = CeilOfRatio(dividends.at(i), divisors.at(i)); + } + return out; +} + } // namespace absl::optional > FindTranspose021(const Shape& a, @@ -94,35 +103,36 @@ absl::optional > FindTranspose021(const Shape& a, return absl::nullopt; } -KernelMappingScheme::KernelMappingScheme( - absl::Span dims_in_elems, int64 tile_size_y, int64 tile_size_x, - absl::Span req_block_sizes, int64 num_threads_y, - int64 num_threads_x, llvm::IRBuilder<>* b) +KernelMappingScheme::KernelMappingScheme(absl::Span dims_in_elems, + int64 tile_size_y, int64 tile_size_x, + int64 block_size_z, + int64 num_threads_y, + int64 num_threads_x, bool is_dilated_x, + llvm::IRBuilder<>* b) : b_(b), - dims_in_elems_(dims_in_elems.begin(), dims_in_elems.end()), + dims_in_elems_{dims_in_elems[0], dims_in_elems[1], dims_in_elems[2]}, tile_sizes_{1, tile_size_y, tile_size_x}, + dims_in_tiles_{dims_in_elems[0], + CeilOfRatio(dims_in_elems[1], tile_size_y), + CeilOfRatio(dims_in_elems[2], tile_size_x)}, + block_sizes_{block_size_z, 1, 1}, + dims_in_blocks_{CeilOfRatio(dims_in_elems[0], block_sizes_[0]), + dims_in_tiles_[1], dims_in_tiles_[2]}, num_threads_x_(num_threads_x), num_threads_y_(num_threads_y), - dilated_x_(true) { - DCHECK_EQ(dims_in_elems_.size(), 3); - DCHECK_EQ(req_block_sizes.size(), 3); - + dilated_x_(is_dilated_x) { DCHECK_EQ(tile_size_y % num_threads_y_, 0); DCHECK_EQ(tile_size_x % num_threads_x_, 0); - - dims_in_tiles_ = ElementWiseCeilOfRatio(dims_in_elems_, tile_sizes_); - block_sizes_.reserve(req_block_sizes.size()); - absl::c_transform(req_block_sizes, dims_in_tiles_, - std::back_inserter(block_sizes_), - [](const int64 requested_size, const int64 max_size) { - return std::min(requested_size, max_size); - }); - dims_in_blocks_ = ElementWiseCeilOfRatio(dims_in_tiles_, block_sizes_); - + CHECK_EQ((dims_in_elems[0] % block_size_z), 0); VLOG(10) << "dims_in_elems_ = [" << absl::StrJoin(dims_in_elems_, ",") << "]"; VLOG(10) << "dims_in_tiles_ = [" << absl::StrJoin(dims_in_tiles_, ",") << "]"; VLOG(10) << "dims_in_blocks_ = [" << absl::StrJoin(dims_in_blocks_, ",") << "]"; + if (!dilated_x_) { + // dilated_x_=false is for the purpose of vectorization, which requires + // GetTileSizeForDimension(DimX) to be a multiplier of num_threads_x_. + CHECK_EQ(GetTileSizeForDimension(DimX) % num_threads_x_, 0); + } } IrArray::Index KernelMappingScheme::GetUnnormalizedIndex( diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h index f802cc27d51..46561dd3252 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h +++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h @@ -90,23 +90,24 @@ class KernelMappingScheme { enum { DimZ = 0, DimY, DimX, DimTot }; public: - KernelMappingScheme() {} // dims_in_elems: the normalized tensor dimensions. - // req_block_sizes: the requested block size in number of tiles for each - // dimension. 
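As a minimal standalone sketch of the tiling arithmetic in the new KernelMappingScheme constructor above: the tile and block counts are just repeated ceiling divisions. CeilOfRatio is re-implemented locally here, and the shape/tile values in the comment are illustrative only, not taken from the patch.

#include <array>
#include <cstdint>

// Local stand-in for the CeilOfRatio helper used by the constructor above.
int64_t CeilOfRatio(int64_t a, int64_t b) { return (a + b - 1) / b; }

// For a hypothetical normalized {Z, Y, X} shape of {4, 100, 130} with 32x32
// tiles and block_size_z = 2:
//   dims_in_tiles  = {4, ceil(100/32), ceil(130/32)} = {4, 4, 5}
//   dims_in_blocks = {ceil(4/2), 4, 5}               = {2, 4, 5}
std::array<int64_t, 3> DimsInTiles(const std::array<int64_t, 3>& dims_in_elems,
                                   int64_t tile_size_y, int64_t tile_size_x) {
  return {dims_in_elems[0], CeilOfRatio(dims_in_elems[1], tile_size_y),
          CeilOfRatio(dims_in_elems[2], tile_size_x)};
}

std::array<int64_t, 3> DimsInBlocks(const std::array<int64_t, 3>& dims_in_tiles,
                                    int64_t dim_z_in_elems,
                                    int64_t block_size_z) {
  return {CeilOfRatio(dim_z_in_elems, block_size_z), dims_in_tiles[1],
          dims_in_tiles[2]};
}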
The actual block size is set to min(req_block_size, - // dims_in_number_of_blocks). KernelMappingScheme(absl::Span dims_in_elems, int64 tile_size_y, - int64 tile_size_x, - absl::Span req_block_sizes, + int64 tile_size_x, int64 block_size_z, int64 num_threads_y, int64 num_threads_x, - llvm::IRBuilder<>* b); + bool is_dilated_x, llvm::IRBuilder<>* b); + // Number of elements in each dimension (Z/Y/X respectively). absl::Span GetDimensionsInElements() const { return dims_in_elems_; } + + // Ratio of elements in each dimension over tile sizes for Z/Y/X + // respectively. absl::Span GetDimensionsInTiles() const { return dims_in_tiles_; } + + // Ratio of dimensions per tile over block sizes. absl::Span GetDimensionsInBlocks() const { return dims_in_blocks_; } @@ -125,10 +126,7 @@ class KernelMappingScheme { return absl::c_accumulate(dims_in_blocks_, 1, std::multiplies()); } - int64 GetTileSizeForDimension(int d) const { - DCHECK(d >= DimZ && d <= DimX); - return tile_sizes_[d]; - } + int64 GetTileSizeForDimension(int d) const { return tile_sizes_.at(d); } int64 GetTileSizeForDimensionX() const { return GetTileSizeForDimension(DimX); } @@ -138,8 +136,7 @@ class KernelMappingScheme { absl::Span GetBlockSizes() const { return block_sizes_; } int64 GetTileBlockSizeForDimension(int d) const { - DCHECK(d >= DimZ && d <= DimX); - return dims_in_blocks_[d]; + return dims_in_blocks_.at(d); } int64 GetNumberOfThreadsForDimensionX() const { return num_threads_x_; } @@ -151,14 +148,6 @@ class KernelMappingScheme { } bool DilatedX() const { return dilated_x_; } - void SetDilatedX(bool v) { - dilated_x_ = v; - if (!dilated_x_) { - // dilated_x_=false is for the purpose of vectorization, which requires - // GetTileSizeForDimension(DimX) to be a multiplier of num_threads_x_. - CHECK_EQ(GetTileSizeForDimension(DimX) % num_threads_x_, 0); - } - } IrArray::Index EmitBlockIndex(llvm::Type* index_ty); // Returns the index for the first tile in the block with the given block @@ -181,19 +170,19 @@ class KernelMappingScheme { private: llvm::IRBuilder<>* b_; // The number of elements in each dimension. - std::vector dims_in_elems_; + std::array dims_in_elems_; // The number of elements for each dimension of a tile. - std::vector tile_sizes_; + std::array tile_sizes_; // The number of tiles in each dimension. It is computed from dims_in_elem_ // and tile_sizes_. - std::vector dims_in_tiles_; + std::array dims_in_tiles_; // The number of tiles for each dimension of a tile block. - std::vector block_sizes_; + std::array block_sizes_; // The number of blocks in each dimension of a tile block. It is computed from // dims_in_tile_ and block_sizes_. - std::vector dims_in_blocks_; + std::array dims_in_blocks_; // Number of threads used to process elements in the X direction of a tile. int64 num_threads_x_; @@ -208,34 +197,6 @@ class KernelMappingScheme { bool dilated_x_; }; -// A class to represent information for tiled parameters to support IR emission -// for 021 transpose. -class TiledParameterInfo { - public: - TiledParameterInfo(absl::Span param_buffers, - llvm::Value* y, llvm::Value* x) - : param_buffers_(param_buffers), y_(y), x_(x) {} - - llvm::Value* x() const { return x_; } - llvm::Value* y() const { return y_; } - - void set_x(llvm::Value* x) { x_ = x; } - void set_y(llvm::Value* y) { y_ = y; } - - llvm::Value* GetBufferForParameter(int64 index) const { - return param_buffers_[index]; - } - - private: - // Param_buffers_[i] stores the tile buffer for the ith parameter or nullptr - // if the parameter is not tiled. 
- absl::Span param_buffers_; - // The y coordinate within a tile. - llvm::Value* y_; - // The x coordinate within a tile. - llvm::Value* x_; -}; - } // namespace llvm_ir } // namespace xla diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc index aa07bed443a..c9d86f059b4 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc @@ -513,6 +513,7 @@ llvm::FastMathFlags GetCpuFastMathFlags(const HloModuleConfig& module_config) { flags.setNoNaNs(!options.xla_cpu_fast_math_honor_nans()); flags.setNoInfs(!options.xla_cpu_fast_math_honor_infs()); flags.setAllowReciprocal(!options.xla_cpu_fast_math_honor_division()); + flags.setApproxFunc(!options.xla_cpu_fast_math_honor_functions()); return flags; } diff --git a/tensorflow/compiler/xla/service/memory_space_assignment.cc b/tensorflow/compiler/xla/service/memory_space_assignment.cc new file mode 100644 index 00000000000..7dd6686bcea --- /dev/null +++ b/tensorflow/compiler/xla/service/memory_space_assignment.cc @@ -0,0 +1,719 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/memory_space_assignment.h" + +namespace xla { + +namespace { +// Define a dummy chunk for chunks that will be allocated in the default memory +// space and for keeping track of number of asynchronous copies. +const HeapSimulator::Chunk kDummyChunk{-1, -1}; +} // namespace + +std::vector +AlternateMemoryBestFitHeap::GetSortedColocatedIntervals( + const GlobalDecreasingSizeBestFitHeap::BufferInterval& interval) const { + std::vector colocated_intervals; + std::vector worklist = {&interval}; + while (!worklist.empty()) { + const BufferInterval* item = worklist.back(); + worklist.pop_back(); + colocated_intervals.push_back(item); + for (const HloValue* buffer_colocated : item->colocations) { + worklist.push_back(&buffer_intervals_.at(buffer_colocated)); + } + } + + absl::c_sort(colocated_intervals, [&](const BufferInterval* x, + const BufferInterval* y) { + return std::make_pair(x->start, x->end) < std::make_pair(y->start, y->end); + }); + return colocated_intervals; +} + +HeapSimulator::Result AlternateMemoryBestFitHeap::Finish() { + std::vector sorted_buffer_intervals = + GetSortedBufferIntervals(); + + VLOG(1) << "Assigning buffers to alternate memory. Max heap size = " + << max_size_in_bytes_ + << ", min prefetch interval = " << min_prefetch_interval_ + << ", max prefetch interval = " << max_prefetch_interval_; + + for (auto& interval : sorted_buffer_intervals) { + if (!interval.need_allocation) { + continue; + } + + // Skip if we have already allocated for this buffer. 
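A minimal standalone sketch of the (start, end) ordering used by GetSortedColocatedIntervals above; the Interval struct here is a hypothetical stand-in for BufferInterval.

#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

struct Interval {
  int64_t start;
  int64_t end;
};

// Sorts colocated intervals by start time, breaking ties by end time, which is
// what the absl::c_sort comparator above expresses via std::make_pair.
void SortByScheduleTime(std::vector<Interval>* intervals) {
  std::sort(intervals->begin(), intervals->end(),
            [](const Interval& x, const Interval& y) {
              return std::make_pair(x.start, x.end) <
                     std::make_pair(y.start, y.end);
            });
}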
+ const HloBuffer& buffer = + alias_analysis_.GetBufferContainingValue(*interval.buffer); + if (allocation_map_->contains(&buffer)) { + continue; + } + + // If the buffer is a tuple, don't use this algorithm for now. The buffers + // that are pointed to by the tuple will still use this algorithm. + // TODO(berkin): Because tuples are cheap to place in the alternate memory + // (they are just pointers) we don't need to use prefetch/evict logic. + if (buffer.values()[0]->shape().IsTuple()) { + VLOG(4) << "Keeping buffer " << buffer.ToString() + << " in default mem because it is a tuple."; + continue; + } + + auto colocated_intervals = GetSortedColocatedIntervals(interval); + bool keep_in_default_memory = false; + for (const BufferInterval* colocated_interval : colocated_intervals) { + const HloValue* value = colocated_interval->buffer; + // If any of the colocated values are phi buffers, we keep them in the + // default memory for now. + if (value->is_phi()) { + keep_in_default_memory = true; + VLOG(4) << "Keeping value " << value->ToShortString() + << " because it contains a phi node."; + break; + } + } + + MemorySpaceAssignment::AllocationSequence* allocation_sequence = + &(*allocation_map_)[&buffer]; + + // At this point, none of the colocated buffers contain any phi buffers. + for (const BufferInterval* colocated_interval : colocated_intervals) { + if (keep_in_default_memory) { + break; + } + const HloValue* value = colocated_interval->buffer; + int64 definition_time = + instruction_schedule_->at(value->defining_instruction()); + // Sort the uses by the use time. + std::vector uses = value->uses(); + absl::c_sort(uses, [&](HloUse use1, HloUse use2) { + return instruction_schedule_->at(use1.instruction) < + instruction_schedule_->at(use2.instruction); + }); + // Iterate over the uses. + for (HloUse use : uses) { + int64 use_time = instruction_schedule_->at(use.instruction); + + // Bitcasts don't define buffers and don't directly consume buffers. + // Skip allocating buffers for bitcast uses. The uses that feed from + // bitcasts will be handled specially. + if (use.instruction->opcode() != HloOpcode::kBitcast) { + if (!FindAllocation(definition_time, use_time, + value->defining_position(), use, value, + colocated_interval->size, allocation_sequence)) { + // If the allocation finding failed (e.g., due to running out of + // asynchronous copies), then fall back to allocating the buffer + // entirely in the default memory. + pending_chunks_.clear(); + pending_async_copies_.clear(); + allocation_sequence->clear(); + keep_in_default_memory = true; + break; + } + + // If there are multiple uses, they can try using the memory + // allocation already at the alternate memory. 
+ definition_time = use_time; + } + } + } + + CommitPendingChunks(); + } + + if (VLOG_IS_ON(3)) { + for (const auto& alloc_pair : *allocation_map_) { + VLOG(3) << "Allocation for " << alloc_pair.first->ToString(); + for (const auto& alloc : alloc_pair.second) { + std::string addr_str = ": default"; + if (alloc->memory_space() == MemorySpace::kAlternate) { + addr_str = absl::StrCat(": alt ", alloc->chunk().offset); + } + + VLOG(3) << " " << alloc->start_time() << "-" << alloc->end_time() + << addr_str << ", " << alloc->uses().size() << " uses"; + } + } + } + + return result_; +} + +HloInstruction* AlternateMemoryBestFitHeap::GetInstructionAt(int64 time) const { + return flattened_instruction_sequence_->instructions()[time]; +} + +void AlternateMemoryBestFitHeap::CommitPendingChunks() { + for (auto interval_and_chunk : pending_chunks_) { + VLOG(3) << "Committing chunk: " << interval_and_chunk.first.start << "-" + << interval_and_chunk.first.end << " : [" + << interval_and_chunk.second.chunk.offset << ", " + << interval_and_chunk.second.chunk.size << "]"; + CommitChunk(interval_and_chunk.first, interval_and_chunk.second); + } + pending_chunks_.clear(); + // Also add the pending async copies to the interval tree. + if (max_outstanding_async_copies_ >= 0) { + for (auto interval : pending_async_copies_) { + async_copy_interval_tree_.Add(interval.first, interval.second, + kDummyChunk); + } + } + pending_async_copies_.clear(); +} + +void AlternateMemoryBestFitHeap::AddToPendingChunks( + const BufferInterval& buffer_interval, + const ChunkCandidate& chunk_candidate) { + pending_chunks_.emplace_back(buffer_interval, chunk_candidate); +} + +bool AlternateMemoryBestFitHeap::FindAllocation( + int64 start_time, int64 end_time, HloPosition defining_position, HloUse use, + const HloValue* buffer, int64 size, + MemorySpaceAssignment::AllocationSequence* allocations) { + HloInstruction* operand = + use.instruction->mutable_operand(use.operand_number); + // If the operand is a bitcast, we look at bitcast's operand until we find a + // non-bitcast operand. + HloInstruction* non_bitcast_operand = operand; + while (non_bitcast_operand->opcode() == HloOpcode::kBitcast) { + non_bitcast_operand = non_bitcast_operand->mutable_operand(0); + } + // Create an alternate memory interval that starts at the earliest + // possible position, given by max_prefetch_interval. + BufferInterval alternate_mem_interval; + alternate_mem_interval.buffer = buffer; + alternate_mem_interval.size = size; + alternate_mem_interval.start = + std::max(start_time, end_time - max_prefetch_interval_); + alternate_mem_interval.end = end_time; + + VLOG(2) << "Finding allocation for " << buffer->ToShortString() << " (" + << start_time << ", " << end_time << "). Size = " << size + << ", def pos = " << defining_position.ToString() + << ", operand = " << operand->ToString() + << (non_bitcast_operand != operand + ? ", non_bitcast_operand = " + non_bitcast_operand->ToString() + : ""); + CHECK_LT(start_time, end_time); + + // First try keeping the allocation entirely in the alternate memory. + if (TryAllocatingInAlternateMemoryNoCopy( + start_time, end_time, defining_position, use, alternate_mem_interval, + non_bitcast_operand, allocations)) { + return true; + } + + MemorySpaceAssignment::Allocation* prev_allocation = nullptr; + if (!allocations->empty()) { + prev_allocation = allocations->back().get(); + } + + // Since copies couldn't be removed, create an allocation in the default + // memory space. 
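A minimal sketch of the bitcast peeling performed at the top of FindAllocation above, using a hypothetical node type in place of HloInstruction.

// Hypothetical stand-in for an HLO node; only what the peeling loop needs.
struct Node {
  bool is_bitcast;
  Node* operand;  // The single operand followed when peeling bitcasts.
};

// Walks through chained bitcasts until a non-bitcast producer is found, as
// FindAllocation does before deciding where the buffer should live.
Node* PeelBitcasts(Node* node) {
  while (node->is_bitcast) {
    node = node->operand;
  }
  return node;
}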
+ if (prev_allocation != nullptr && + prev_allocation->memory_space() == MemorySpace::kAlternate && + prev_allocation->instruction() == non_bitcast_operand) { + // If there was an allocation for this HloValue that was in the alternate + // memory space, we also need to perform an eviction. + // TODO(berkin): For now evictions happen relative to the most recent + // allocation in the alternate memory. We can potentially start evictions + // earlier and end later. + VLOG(3) << "Evicting buffer at " << prev_allocation->chunk().offset << " (" + << prev_allocation->start_time() << ", " + << prev_allocation->end_time() << ")"; + + // See if this interval would violate the asynchronous copy limit. + if (!ViolatesMaximumOutstandingAsyncCopies(prev_allocation->start_time(), + prev_allocation->end_time())) { + AddAsyncCopy(*prev_allocation, MemorySpace::kDefault, kDummyChunk, + prev_allocation->start_time(), prev_allocation->end_time(), + allocations); + + } else { + VLOG(3) << "This violates the maximum async copies."; + // If the original interval violated the limit, try sub-intervals within + // this interval. + bool eviction_scheduled = false; + for (int64 time = prev_allocation->start_time(); + time <= prev_allocation->end_time(); ++time) { + VLOG(3) << "Try evicting (" << time << ", " << time << ")"; + if (!ViolatesMaximumOutstandingAsyncCopies(time, time)) { + VLOG(3) << "Eviction successful."; + AddAsyncCopy(*prev_allocation, MemorySpace::kDefault, kDummyChunk, + time, time, allocations); + eviction_scheduled = true; + break; + } + } + + if (!eviction_scheduled) { + // If the eviction couldn't be scheduled, then fail. This buffer will be + // kept in the default memory. + VLOG(3) << "Bailing: Could not evict " << use.ToString() + << " because we hit the limit of maximum asynchronous copies " + << "between " + << GetInstructionAt(prev_allocation->start_time())->ToString() + << " and " + << GetInstructionAt(prev_allocation->end_time())->ToString(); + return false; + } + } + } else if (prev_allocation != nullptr && + prev_allocation->memory_space() == MemorySpace::kDefault && + prev_allocation->instruction() == non_bitcast_operand) { + // If the previous allocation was in the default memory space and was + // defined by the same instruction, extend that. Otherwise, create a new + // allocation. + prev_allocation->Extend(end_time); + } else { + allocations->push_back(absl::make_unique( + non_bitcast_operand, defining_position, MemorySpace::kDefault, + kDummyChunk, start_time, end_time)); + } + + // Try partially placing the buffer in the alternate space. The time that is + // overlapped will be used to asynchronously copy the buffer from the + // default memory to the alternate memory. + // + // start end + // time time + // X---------------------X + // Alternate: +------+ + // Default: +---------------------+ + // ^ ^ + // Copy Copy + // Start Done + for (alternate_mem_interval.start = + std::max(start_time, end_time - max_prefetch_interval_); + alternate_mem_interval.end - alternate_mem_interval.start > + min_prefetch_interval_; + ++alternate_mem_interval.start) { + VLOG(4) << "Trying alternate memory allocation (" + << alternate_mem_interval.start << ", " + << alternate_mem_interval.end << ")"; + // If this additional asynchronous copy would violate the limit, try a + // different interval. 
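A standalone sketch of the prefetch-start search in the loop above: start as early as max_prefetch_interval allows and slide the CopyStart later until a feasible slot is found or the window shrinks below min_prefetch_interval. The `fits` callback is a hypothetical stand-in bundling the heap-size and async-copy checks.

#include <algorithm>
#include <cstdint>
#include <functional>

int64_t FindPrefetchStart(int64_t start_time, int64_t end_time,
                          int64_t min_prefetch_interval,
                          int64_t max_prefetch_interval,
                          const std::function<bool(int64_t, int64_t)>& fits) {
  for (int64_t start = std::max(start_time, end_time - max_prefetch_interval);
       end_time - start > min_prefetch_interval; ++start) {
    if (fits(start, end_time)) {
      return start;  // The prefetch copy will live over [start, end_time].
    }
  }
  return -1;  // No copy inserted; the use keeps reading from default memory.
}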
+ if (ViolatesMaximumOutstandingAsyncCopies(alternate_mem_interval.start, + alternate_mem_interval.end)) { + VLOG(4) << "This would violate the outstanding async copy limit."; + continue; + } + ChunkCandidate chunk_candidate = FindChunkCandidate(alternate_mem_interval); + // Check if the new heap size fits within limits. + if (chunk_candidate.heap_size < max_size_in_bytes_) { + VLOG(3) << "Move the buffer to alternate memory at " + << alternate_mem_interval.start + << ". Offset = " << chunk_candidate.chunk.offset + << ", size = " << chunk_candidate.chunk.size + << ", heap_size = " << chunk_candidate.heap_size; + AddToPendingChunks(alternate_mem_interval, chunk_candidate); + + AddAsyncCopy(*allocations->back().get(), MemorySpace::kAlternate, + chunk_candidate.chunk, alternate_mem_interval.start, + end_time, allocations); + + allocations->back()->AddUse(use); + return true; + } + } + + // If a copy wasn't inserted, then add this use to the latest allocation. + allocations->back()->AddUse(use); + return true; +} + +void AlternateMemoryBestFitHeap::AddAsyncCopy( + const MemorySpaceAssignment::Allocation& prev_allocation, + MemorySpace memory_space, Chunk chunk, int64 start_time, int64 end_time, + MemorySpaceAssignment::AllocationSequence* allocations) { + HloInstruction* earliest_instruction = GetInstructionAt(start_time); + HloInstruction* latest_instruction = GetInstructionAt(end_time); + + VLOG(3) << "Copy to " + << (memory_space == MemorySpaceAssignment::MemorySpace::kDefault + ? "default" + : "alternate") + << " memory between instructions " << earliest_instruction->ToString() + << " - " << latest_instruction->ToString(); + + allocations->push_back( + absl::make_unique( + prev_allocation, memory_space, chunk, start_time, end_time, + earliest_instruction, latest_instruction)); + + // Register the additional async copy with the interval tree to keep track of + // the limit at any given time. + pending_async_copies_.emplace_back(start_time, end_time); +} + +bool AlternateMemoryBestFitHeap::ViolatesMaximumOutstandingAsyncCopies( + int64 start_time, int64 end_time) const { + if (max_outstanding_async_copies_ < 0) { + return false; + } + + // Count both the asynchronous copies in the interval tree as well as the + // pending asynchronous copies belonging to this buffer. + int64 num_async_copies = + async_copy_interval_tree_.ChunksOverlappingInTime(start_time, end_time) + .size(); + + for (auto interval : pending_async_copies_) { + if (interval.second > start_time && interval.first < end_time) { + num_async_copies++; + } + } + // Add one because we are checking if adding an additional asynchronous copy + // would violate the limit. + return num_async_copies + 1 > max_outstanding_async_copies_; +} + +bool AlternateMemoryBestFitHeap::TryAllocatingInAlternateMemoryNoCopy( + int64 start_time, int64 end_time, HloPosition defining_position, HloUse use, + BufferInterval alternate_mem_interval, HloInstruction* non_bitcast_operand, + MemorySpaceAssignment::AllocationSequence* allocations) { + MemorySpaceAssignment::Allocation* prev_allocation = nullptr; + bool can_eliminate_copy = false; + if (allocations->empty()) { + // There hasn't been any allocations for this interval so far. We can + // eliminate copy if the value can be placed in the alternate memory. + can_eliminate_copy = + is_allowed_in_alternate_mem_(*alternate_mem_interval.buffer); + } else { + // If there has been a previous allocation, we can eliminate the copy if the + // previous allocation was also in the alternate memory. 
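A minimal sketch of the overlap test in ViolatesMaximumOutstandingAsyncCopies above, with pending copies kept as (start, end) pairs as in pending_async_copies_.

#include <cstdint>
#include <utility>
#include <vector>

// A pending copy over [first, second] overlaps the query window iff it starts
// before the window ends and ends after the window starts.
int64_t CountOverlappingCopies(
    const std::vector<std::pair<int64_t, int64_t>>& pending_copies,
    int64_t start_time, int64_t end_time) {
  int64_t num = 0;
  for (const auto& interval : pending_copies) {
    if (interval.second > start_time && interval.first < end_time) {
      ++num;
    }
  }
  return num;
}

// Adding one more copy violates the limit when num + 1 > max_outstanding
// (a negative max_outstanding means unlimited).
bool WouldViolateLimit(int64_t num, int64_t max_outstanding) {
  return max_outstanding >= 0 && num + 1 > max_outstanding;
}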
+ prev_allocation = allocations->back().get(); + can_eliminate_copy = + (prev_allocation->memory_space() == MemorySpace::kAlternate); + } + + if (!can_eliminate_copy) { + return false; + } + + if (alternate_mem_interval.start != start_time) { + return false; + } + + // Prefer the offset that was previously used for the previous allocation. + int64 preferred_offset = -1; + if (prev_allocation != nullptr) { + preferred_offset = prev_allocation->chunk().offset; + // If there is a previous allocation, set the start time one after the end + // of the previous allocation's end. + alternate_mem_interval.start = prev_allocation->end_time() + 1; + } + + VLOG(4) << "We can eliminate copy to alternate memory. Preferred offset = " + << preferred_offset; + ChunkCandidate chunk_candidate = + FindChunkCandidate(alternate_mem_interval, preferred_offset); + // Check if the new heap size fits within limits. Also ensure if a + // preferred offset was provided, that offset was used. + if (chunk_candidate.heap_size < max_size_in_bytes_ && + (preferred_offset == -1 || + preferred_offset == chunk_candidate.chunk.offset)) { + VLOG(3) << "Keep the buffer in alternate memory. Offset = " + << chunk_candidate.chunk.offset + << ", size = " << chunk_candidate.chunk.size + << ", heap_size = " << chunk_candidate.heap_size; + AddToPendingChunks(alternate_mem_interval, chunk_candidate); + + // If there was a previous allocation, the buffer location is the + // same as the previous. Otherwise, it is the operand. + if (prev_allocation != nullptr && + prev_allocation->instruction() == non_bitcast_operand) { + prev_allocation->Extend(end_time); + } else { + allocations->push_back( + absl::make_unique( + non_bitcast_operand, defining_position, MemorySpace::kAlternate, + chunk_candidate.chunk, start_time, end_time)); + } + allocations->back()->AddUse(use); + return true; + } + return false; +} + +/*static*/ int64 MemorySpaceAssignment::CountMaximumOutstandingAsyncCopies( + const HloModule& module) { + int64 max_copies = 0; + int64 current_copies = 0; + for (HloInstruction* instruction : + module.schedule().sequence(module.entry_computation()).instructions()) { + if (instruction->opcode() == HloOpcode::kCopyStart) { + current_copies++; + } else if (instruction->opcode() == HloOpcode::kCopyDone) { + current_copies--; + } + max_copies = std::max(max_copies, current_copies); + } + return max_copies; +} + +/*static*/ StatusOr> +MemorySpaceAssignment::Run( + HloModule* module, int64 alternate_memory_space, int64 max_size_in_bytes, + int64 min_prefetch_interval, int64 max_prefetch_interval, + int64 alternate_memory_space_alignment_in_bytes, + BufferValue::SizeFunction size_fn, + AlternateMemoryBestFitHeap::IsAllowedInAlternateMemoryFunction + is_allowed_in_alternate_mem, + int64 max_outstanding_async_copies) { + CHECK(module->has_schedule()); + VLOG(4) << "Module before memory space assignment: "; + XLA_VLOG_LINES(4, module->ToString()); + VLOG(4) << "Schedule: " << module->schedule().ToString(); + TF_ASSIGN_OR_RETURN(auto alias_analysis, HloAliasAnalysis::Run(module)); + + MemorySpaceAssignment memory_space_assignment(module, alternate_memory_space); + // TODO(berkin): Explore heap algorithms other than kSpatial. 
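A standalone sketch of the running-counter scan in CountMaximumOutstandingAsyncCopies above, over a hypothetical flattened opcode sequence.

#include <algorithm>
#include <cstdint>
#include <vector>

enum class Op { kCopyStart, kCopyDone, kOther };

// Walk the schedule, +1 at each CopyStart, -1 at each CopyDone, and track the
// peak number of copies in flight.
int64_t MaxOutstandingCopies(const std::vector<Op>& schedule) {
  int64_t max_copies = 0;
  int64_t current = 0;
  for (Op op : schedule) {
    if (op == Op::kCopyStart) {
      ++current;
    } else if (op == Op::kCopyDone) {
      --current;
    }
    max_copies = std::max(max_copies, current);
  }
  return max_copies;
}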
+ auto algorithm = absl::make_unique( + &memory_space_assignment.allocation_map_, max_size_in_bytes, + min_prefetch_interval, max_prefetch_interval, *alias_analysis, + alternate_memory_space_alignment_in_bytes, + GlobalDecreasingSizeBestFitHeap::Type::kSpatial, + is_allowed_in_alternate_mem, max_outstanding_async_copies); + + TF_RETURN_IF_ERROR(HeapSimulator::Run(std::move(algorithm), *module, + module->schedule(), + *alias_analysis.get(), size_fn) + .status()); + + TF_RETURN_IF_ERROR(memory_space_assignment.Process()); + TF_RETURN_IF_ERROR(memory_space_assignment.FixSchedule()); + + VLOG(4) << "Module after memory space assignment: "; + XLA_VLOG_LINES(4, module->ToString()); + TF_CHECK_OK(module->schedule().Verify()); + VLOG(1) << "Maximum number of outstanding async copies: " + << CountMaximumOutstandingAsyncCopies(*module); + + return std::move(memory_space_assignment.preset_assignments_); +} + +void MemorySpaceAssignment::Allocation::AddUse(HloUse use) { + HloInstruction* operand = + use.instruction->mutable_operand(use.operand_number); + // When the operand of a use is a bitcast, we place the bitcast in a separate + // data structure. + if (operand->opcode() == HloOpcode::kBitcast) { + bitcasts_.push_back(operand); + } else { + uses_.push_back(use); + } +} + +Status MemorySpaceAssignment::Allocation::PropagateMemorySpaceToBitcasts( + const MemorySpaceAssignment& memory_space_assignment) { + for (HloInstruction* bitcast : bitcasts_) { + if (memory_space_ == MemorySpace::kAlternate) { + Layout* bitcast_layout = bitcast->mutable_shape()->mutable_layout(); + bitcast_layout->set_memory_space( + memory_space_assignment.alternate_memory_space_); + } + } + return Status::OK(); +} + +Status MemorySpaceAssignment::Allocation::Process( + MemorySpaceAssignment* memory_space_assignment) { + // For non-copy allocations, all we need to do is to update the output memory + // space if placed in the alternate memory. + if (memory_space_ == MemorySpace::kAlternate) { + Layout* layout = instruction_->mutable_shape()->mutable_layout(); + layout->set_memory_space(memory_space_assignment->alternate_memory_space_); + } + TF_RETURN_IF_ERROR(PropagateMemorySpaceToBitcasts(*memory_space_assignment)); + return Status::OK(); +} + +Status MemorySpaceAssignment::CopyAllocation::Process( + MemorySpaceAssignment* memory_space_assignment) { + // Copy allocations need to insert asynchronous copy nodes. + HloInstruction* producing_instruction = instruction(); + CHECK_NE(producing_instruction, nullptr); + + Shape shape = producing_instruction->shape(); + HloComputation* computation = producing_instruction->parent(); + + // Set the layout to include the memory space. + Layout* layout = shape.mutable_layout(); + if (memory_space_ == MemorySpace::kAlternate) { + layout->set_memory_space(memory_space_assignment->alternate_memory_space_); + } else { + layout->set_memory_space(0); + } + + HloInstruction* copy_start = + computation->AddInstruction(HloInstruction::CreateUnary( + ShapeUtil::MakeTupleShape({shape, ShapeUtil::MakeShape(U32, {})}), + HloOpcode::kCopyStart, producing_instruction)); + HloInstruction* copy_done = computation->AddInstruction( + HloInstruction::CreateUnary(shape, HloOpcode::kCopyDone, copy_start)); + // Update the allocation with the copy done instruction so that if there + // are further copies from it, it can find the correct instruction. + instruction_ = copy_done; + // Also update the defining position. Note that the output of CopyDone is + // actually defined in the item {0} of CopyStart. 
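A minimal sketch of the use rewiring performed just below, with hypothetical node/use types: every recorded use of the original producer is redirected to the newly created CopyDone.

#include <vector>

// Hypothetical mini-IR: a use is (consumer, operand index).
struct MiniInst {
  std::vector<MiniInst*> operands;
};
struct MiniUse {
  MiniInst* instruction;
  int operand_number;
};

// Redirects each recorded use to the new producer (the CopyDone above).
void RedirectUses(const std::vector<MiniUse>& uses, MiniInst* new_producer) {
  for (const MiniUse& use : uses) {
    use.instruction->operands[use.operand_number] = new_producer;
  }
}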
+ defining_position_ = HloPosition{copy_start, {0}}; + + // Replace all the uses with the new copy instruction. + for (HloUse use : uses_) { + TF_RETURN_IF_ERROR( + use.instruction->ReplaceOperandWith(use.operand_number, copy_done)); + } + + // Replace all the bitcasts with the new copy instruction. Note that if there + // is a chain of bitcasts, their operands will be replaced with copy done. + // For example: + // + // a = Foo() + // b = Bitcast(a) + // c = Bitcast(b) + // + // If a is moved to the alternate memory asynchronously, the graph will be + // changed into: + // + // a = Foo() + // cs = CopyStart(a) + // cd = CopyDone(cs) + // b = Bitcast(cd) + // c = Bitcast(cd) + // + // Because of the potential shape change in the operand (b -> cd), we use + // ReplaceOperandWithDifferentShape. + for (HloInstruction* bitcast : bitcasts_) { + TF_RETURN_IF_ERROR(bitcast->ReplaceOperandWithDifferentShape( + /*operand_num=*/0, instruction_)); + } + + // Propagate the memory space to all bitcasts. + TF_RETURN_IF_ERROR(PropagateMemorySpaceToBitcasts(*memory_space_assignment)); + + // Insert the new instructions at the appropriate places in the schedule. + // FixSchedule will process the maps to actually insert them. + memory_space_assignment->ScheduleAsynchronousCopy( + copy_start, copy_start_schedule_after_, copy_done, + copy_done_schedule_before_); + return Status::OK(); +} + +Status MemorySpaceAssignment::Process() { + // Insert CopyStart/CopyDone pairs. + int64 alternate_memory_size = 0; + for (auto& buffer_and_sequence : allocation_map_) { + for (auto& allocation : buffer_and_sequence.second) { + TF_RETURN_IF_ERROR(allocation->Process(this)); + // Add the offset and size of the allocation in the alternate memory to + // the output map. Special case for bitcast: since bitcast doesn't define + // its own buffer, that shouldn't be exported as a preset chunk. 
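A minimal sketch of the size bookkeeping just below, assuming (as with the heap simulator's chunks) that a chunk's end is its offset plus its size; the Chunk struct here is a local stand-in.

#include <algorithm>
#include <cstdint>
#include <vector>

struct Chunk {
  int64_t offset;
  int64_t size;
  int64_t chunk_end() const { return offset + size; }
};

// The reserved size of the alternate memory is the maximum chunk end over all
// chunks placed there.
int64_t AlternateMemorySize(const std::vector<Chunk>& chunks) {
  int64_t max_end = 0;
  for (const Chunk& c : chunks) {
    max_end = std::max(max_end, c.chunk_end());
  }
  return max_end;
}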
+ if (allocation->memory_space() == MemorySpace::kAlternate && + allocation->instruction()->opcode() != HloOpcode::kBitcast) { + preset_assignments_->add_chunk(allocation->defining_position(), + allocation->chunk()); + alternate_memory_size = + std::max(alternate_memory_size, allocation->chunk().chunk_end()); + } + } + } + + if (!preset_assignments_->chunks().empty()) { + preset_assignments_->add_size(alternate_memory_space_, + alternate_memory_size); + } + + if (VLOG_IS_ON(3)) { + VLOG(3) << "Exported alternate memory allocations:"; + for (auto& pair : preset_assignments_->chunks()) { + VLOG(3) << " [" << pair.second.offset << ", " << pair.second.size + << "] : " << pair.first.ToString(); + } + VLOG(3) << "Exported alternate memory sizes:"; + for (auto& pair : preset_assignments_->sizes()) { + VLOG(3) << " space: " << pair.first << ", size: " << pair.second; + } + } + return Status::OK(); +} + +void MemorySpaceAssignment::ScheduleAsynchronousCopy( + HloInstruction* copy_start, HloInstruction* copy_start_schedule_after, + HloInstruction* copy_done, HloInstruction* copy_done_schedule_before) { + schedule_after_[copy_start_schedule_after].push_back(copy_start); + schedule_before_[copy_done_schedule_before].push_back(copy_done); +} + +void MemorySpaceAssignment::EnsureInstructionAndOperandsInserted( + HloInstruction* new_instruction, HloInstructionSequence* new_sequence, + absl::flat_hash_set* inserted_instructions) const { + if (inserted_instructions->contains(new_instruction)) { + return; + } + for (HloInstruction* operand : new_instruction->operands()) { + EnsureInstructionAndOperandsInserted(operand, new_sequence, + inserted_instructions); + } + VLOG(4) << "inserting: " << new_instruction->ToString(); + new_sequence->push_back(new_instruction); + inserted_instructions->insert(new_instruction); +} + +Status MemorySpaceAssignment::FixSchedule() { + CHECK(module_->has_schedule()); + HloSchedule& schedule = module_->schedule(); + for (const HloComputation* computation : + module_->MakeNonfusionComputations()) { + CHECK(schedule.is_computation_scheduled(computation)); + const HloInstructionSequence& sequence = schedule.sequence(computation); + HloInstructionSequence new_sequence; + + absl::flat_hash_set inserted_instructions; + + for (HloInstruction* instruction : sequence.instructions()) { + auto insts_before_iter = schedule_before_.find(instruction); + if (insts_before_iter != schedule_before_.end()) { + for (HloInstruction* new_instruction : insts_before_iter->second) { + EnsureInstructionAndOperandsInserted(new_instruction, &new_sequence, + &inserted_instructions); + } + } + // Insert only if not previously inserted. + if (!inserted_instructions.contains(instruction)) { + EnsureInstructionAndOperandsInserted(instruction, &new_sequence, + &inserted_instructions); + } + auto insts_after_iter = schedule_after_.find(instruction); + if (insts_after_iter != schedule_after_.end()) { + for (HloInstruction* new_instruction : insts_after_iter->second) { + EnsureInstructionAndOperandsInserted(new_instruction, &new_sequence, + &inserted_instructions); + } + } + } + schedule.set_sequence(computation, new_sequence); + } + + return Status::OK(); +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/service/memory_space_assignment.h b/tensorflow/compiler/xla/service/memory_space_assignment.h new file mode 100644 index 00000000000..71ed39ded04 --- /dev/null +++ b/tensorflow/compiler/xla/service/memory_space_assignment.h @@ -0,0 +1,367 @@ +/* Copyright 2019 The TensorFlow Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_H_ + +#include "tensorflow/compiler/xla/service/heap_simulator.h" +#include "tensorflow/compiler/xla/service/hlo_pass_interface.h" + +namespace xla { + +// This class contains pre-set assignments determined by memory space +// assignment. It contains two data structures: (1) a chunks vector that maps a +// defining HloPosition to a Chunk (offset and size), and (2) a sizes vector +// that maps the memory space to its size. If there is only one alternate memory +// space like there is currently, there will be one entry in sizes. +class PresetAssignments { + public: + PresetAssignments() = default; + + void add_chunk(const HloPosition& position, + const HeapSimulator::Chunk& chunk) { + chunks_.emplace_back(position, chunk); + } + + void add_size(int64 memory_space, int64 size) { + sizes_.emplace_back(memory_space, size); + } + + absl::Span> + chunks() const { + return chunks_; + } + + absl::Span> sizes() const { return sizes_; } + + private: + std::vector> chunks_; + std::vector> sizes_; +}; + +// MemorySpaceAssignment assigns memory spaces (default or alternate) to each +// instruction in the module. It will greedily try placing as as many values in +// the alternate memory space as possible. It uses the heap simulator to +// determine the actual allocation offsets of values in the alternate memory +// space to account for fragmentation. The default memory space is assumed to be +// large enough to hold the values that could not be placed in the alternate +// memory space. +class MemorySpaceAssignment { + public: + using Chunk = HeapSimulator::Chunk; + + // MemorySpaceAssignment uses a notion of a slow and large default memory + // space and a fast and small alternate memory space. + enum class MemorySpace { kDefault, kAlternate }; + + // This class represents an allocation that might either be in the default or + // alternate memory. An HloValue might live in multiple different allocations + // over its lifetime. The lifetimes of the allocations are defined using + // start_time and end_time, which corresponds to the instruction indexes in + // the flattened schedule. Each of these allocations might partially overlap + // with each other. CopyAllocation defined below represents asynchronous + // copies between Allocations. + // + // Consider an instruction Foo, and its users Bar and Baz, and the times given + // in terms of the flattened schedule of the entire module: + // + // Foo:10 + // / \ + // Bar:14 \ + // Baz:25 + // + // A valid memory space assignment could be like the following: + // + // Time: 10 ... 14 ... 
25 + // Foo Bar Baz + // Alternate +-------+ +-----+ + // Default +---------------------+ + // ^ ^ ^ ^ + // | | | | + // evict evict prefetch prefetch + // start end start end + // + // This would be represented with: + // - Allocation(memory_space=kAlternate, start_time=10, end_time=14) + // - CopyAllocation(memory_space=kDefault, start_time=12, end_time=25) + // - CopyAllocation(memory_space=kAlternate, start_time=22, end_time=25) + class Allocation { + public: + Allocation(HloInstruction* instruction, HloPosition defining_position, + MemorySpace memory_space, Chunk chunk, int64 start_time, + int64 end_time) + : instruction_(instruction), + defining_position_(defining_position), + memory_space_(memory_space), + chunk_(chunk), + start_time_(start_time), + end_time_(end_time) {} + virtual ~Allocation() = default; + + // Adds a use to this allocation. + void AddUse(HloUse use); + + // Extends the end time of this allocation. + void Extend(int64 end_time) { end_time_ = end_time; } + + // After all of the time ranges for the allocations have been assigned, + // Process morphs the instructions affected to assign the memory spaces and + // insert asynchronous copy instructions if necessary. + virtual Status Process(MemorySpaceAssignment* memory_space_assignment); + + // Returns the instruction that produces this allocation. It might be + // different than the instruction in defining_position (e.g., a + // GetTupleElement instruction does not define the buffer). + virtual HloInstruction* instruction() const { return instruction_; } + + // Returns the defining position for this allocation. + HloPosition defining_position() const { return defining_position_; } + + const std::vector& uses() const { return uses_; } + MemorySpace memory_space() const { return memory_space_; } + Chunk chunk() const { return chunk_; } + int64 start_time() const { return start_time_; } + int64 end_time() const { return end_time_; } + + protected: + // Bitcasts are treated specially because they do not define buffers. This + // method propagates the memory space for the bitcasts of this allocation. + Status PropagateMemorySpaceToBitcasts( + const MemorySpaceAssignment& memory_space_assignment); + + HloInstruction* instruction_; + HloPosition defining_position_; + std::vector uses_; + std::vector bitcasts_; + MemorySpace memory_space_; + Chunk chunk_; + int64 start_time_; + int64 end_time_; + }; + + // This class represents an allocation as a result of an asynchronous copy. + class CopyAllocation : public Allocation { + public: + CopyAllocation(const Allocation& prev_allocation, MemorySpace memory_space, + Chunk chunk, int64 start_time, int64 end_time, + HloInstruction* copy_start_schedule_after, + HloInstruction* copy_done_schedule_before) + : Allocation(/*instruction=*/nullptr, + /*defining_position=*/{nullptr, {}}, memory_space, chunk, + start_time, end_time), + prev_allocation_(prev_allocation), + copy_start_schedule_after_(copy_start_schedule_after), + copy_done_schedule_before_(copy_done_schedule_before) {} + + Status Process(MemorySpaceAssignment* memory_space_assignment) override; + + HloInstruction* instruction() const override { + // Unless explicitly set, the instruction of a copy allocation in + // retrieved from the previous allocation. + if (instruction_ != nullptr) { + return instruction_; + } else { + return prev_allocation_.instruction(); + } + } + + private: + const Allocation& prev_allocation_; + // These variables define the scheduling boundaries where CopyStart and + // CopyDone can be scheduled. 
The earliest CopyStart can be scheduled is + // after copy_start_schedule_after_ and the latest CopyDone can be scheduled + // is before copy_done_schedule_before_. + HloInstruction* copy_start_schedule_after_; + HloInstruction* copy_done_schedule_before_; + }; + + using AllocationSequence = std::list>; + using AllocationMap = + absl::flat_hash_map; + + // Runs the MemorySpaceAssignment pass. alternate_memory_space is the + // architecture-specific integer value that describes the alternate memory. + // max_size_in_bytes is the maximum size of the alternate memory. + // min/max_prefetch_interval define min/max number of independent instructions + // that can be overlapped while prefetching to decide how early can prefetch + // begin. alternate_memory_space_alignment_in_bytes is the alignment required + // in the alternate memory space, size_fn is the size function for buffer + // values, and is_allowed_in_alternate_mem can be used to prevent certain + // HloValues (e.g., based on the opcode) to be placed on the alternate memory. + // max_outstanding_async_copies specifies the upper bound for number of + // outstanding asynchronous copies, -1 for unlimited. + // TODO(berkin): Use the cost model instead of using number of instructions to + // decide how early to prefetch. + static StatusOr> Run( + HloModule* module, int64 alternate_memory_space, int64 max_size_in_bytes, + int64 min_prefetch_interval, int64 max_prefetch_interval, + int64 alternate_memory_space_alignment_in_bytes, + BufferValue::SizeFunction size_fn, + std::function is_allowed_in_alternate_mem, + int64 max_outstanding_async_copies = -1); + + // Returns the maximum number of outstanding asynchronous copies in the + // module. + static int64 CountMaximumOutstandingAsyncCopies(const HloModule& module); + + private: + MemorySpaceAssignment(HloModule* module, int64 alternate_memory_space) + : module_(module), + alternate_memory_space_(alternate_memory_space), + preset_assignments_(absl::make_unique()) {} + + // Process calls Process methods of the allocations after the allocations have + // been finalized. + Status Process(); + + // FixSchedule inserts asynchronous copies in the schedule. + Status FixSchedule(); + + // Insert an instruction to the schedule, and make sure its dependencies + // (operands) are already in the schedule. If not, insert these operands + // before the instruction. + void EnsureInstructionAndOperandsInserted( + HloInstruction* new_instruction, HloInstructionSequence* new_sequence, + absl::flat_hash_set* inserted_instructions) const; + + // Schedules a pair of asynchronous copy instructions (copy_start and + // copy_done) where copy_start will be scheduled after the instruction in + // copy_start_schedule_after and copy_done will be scheduled before the + // instruction in copy_done_schedule_before. + void ScheduleAsynchronousCopy(HloInstruction* copy_start, + HloInstruction* copy_start_schedule_after, + HloInstruction* copy_done, + HloInstruction* copy_done_schedule_before); + + HloModule* module_; + int64 alternate_memory_space_; + AllocationMap allocation_map_; + std::unique_ptr preset_assignments_; + + // These maps hold vectors of new instructions that need to be scheduled after + // (or before) the instruction in the key. FixSchedule uses these maps to + // modify and fix the schedule. + absl::flat_hash_map> + schedule_after_; + absl::flat_hash_map> + schedule_before_; +}; + +// This class inherits from GlobalDecreasingSizeBestFitHeap with a notion of +// maximum size. 
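A standalone sketch of the operand-first insertion described above for EnsureInstructionAndOperandsInserted, with a hypothetical instruction type: before appending an instruction to the new sequence, recursively append any operand not already present, so definitions always precede uses.

#include <unordered_set>
#include <vector>

struct Inst {
  std::vector<Inst*> operands;
};

void InsertWithOperands(Inst* inst, std::vector<Inst*>* sequence,
                        std::unordered_set<Inst*>* inserted) {
  if (inserted->count(inst)) {
    return;
  }
  for (Inst* operand : inst->operands) {
    InsertWithOperands(operand, sequence, inserted);
  }
  sequence->push_back(inst);
  inserted->insert(inst);
}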
+class AlternateMemoryBestFitHeap : public GlobalDecreasingSizeBestFitHeap { + public: + using IsAllowedInAlternateMemoryFunction = + std::function; + using MemorySpace = MemorySpaceAssignment::MemorySpace; + + AlternateMemoryBestFitHeap( + MemorySpaceAssignment::AllocationMap* allocation_map, + int64 max_size_in_bytes, int64 min_prefetch_interval, + int64 max_prefetch_interval, const HloAliasAnalysis& alias_analysis, + int64 alignment, GlobalDecreasingSizeBestFitHeap::Type type, + IsAllowedInAlternateMemoryFunction is_allowed_in_alternate_mem, + int64 max_outstanding_async_copies) + : GlobalDecreasingSizeBestFitHeap(alignment, type), + allocation_map_(allocation_map), + max_size_in_bytes_(max_size_in_bytes), + min_prefetch_interval_(min_prefetch_interval), + max_prefetch_interval_(max_prefetch_interval), + alias_analysis_(alias_analysis), + is_allowed_in_alternate_mem_(is_allowed_in_alternate_mem), + max_outstanding_async_copies_(max_outstanding_async_copies) {} + + HeapSimulator::Result Finish() override; + + private: + // Finds an allocation for the given interval. Internally, it will attempt to + // find a suitable chunk candidate within the heap size and prefetch interval + // limits, and append the new allocation(s) to allocations. The new + // allocations can be in default or alternate memory spaces, or can be + // prefetches or evictions. Returns true if successful. + bool FindAllocation(int64 start_time, int64 end_time, + HloPosition defining_position, HloUse use, + const HloValue* buffer, int64 size, + MemorySpaceAssignment::AllocationSequence* allocations); + + // Try allocating in alternate memory without any copies. Returns true if + // successful. + bool TryAllocatingInAlternateMemoryNoCopy( + int64 start_time, int64 end_time, HloPosition defining_position, + HloUse use, BufferInterval alternate_mem_interval, + HloInstruction* non_bitcast_operand, + MemorySpaceAssignment::AllocationSequence* allocations); + + // Returns the instruction at a particular time in the flattened instruction + // schedule. + HloInstruction* GetInstructionAt(int64 time) const; + + // Given a buffer interval, returns the colocated intervals. Unlike the + // similar GlobalDecreasingSizeBestFitHeap::GetTransitiveColocations, it + // returns the colocated intervals sorted by scheduled time. + std::vector GetSortedColocatedIntervals( + const BufferInterval& interval) const; + + // Since the allocations are recorded to the AllocationMap, we don't maintain + // result_ in GlobalDecreasingSizeBestFitHeap. Override AddToChunkMap to avoid + // unnecessarily adding the chunk to the chunk map. + void AddToChunkMap(const HloValue* buffer, Chunk chunk) override {} + + // Returns true if the addition of an asynchronous copy in the given time + // interval would violate the maximum number of asynchronous copies. + bool ViolatesMaximumOutstandingAsyncCopies(int64 start_time, + int64 end_time) const; + + // Adds an asynchronous copy to the allocations. + void AddAsyncCopy(const MemorySpaceAssignment::Allocation& prev_allocation, + MemorySpace memory_space, Chunk chunk, int64 start_time, + int64 end_time, + MemorySpaceAssignment::AllocationSequence* allocations); + + // These methods are used for delaying committing the chunk candidate until + // the entire live range of the buffer has been considered. 
+ void AddToPendingChunks(const BufferInterval& buffer_interval, + const ChunkCandidate& chunk_candidate); + void CommitPendingChunks(); + + MemorySpaceAssignment::AllocationMap* allocation_map_; + int64 max_size_in_bytes_; + // The min and max prefetch intervals decribe the number of independent HLOs + // overlapped while a value is being prefetched into the alternate memory + // (between CopyStart and CopyDone HLO instructions). max_prefetch_interval + // attempts to prevent bringing tensors into the alternate memory too eagerly + // and hence occupying the space for other tensors which might use it. + // min_prefetch_interval attempts to prevent cases where tensors are + // prefetched into the alternate memory without sufficient time for the copy + // to take place. In those cases, it's just better to keep the tensor in the + // default memory instead of hurting the critical path with this copy that + // likely won't finish in time. + // TODO(berkin): Explore heuristics that take into account the cost of copying + // tensors between alternate and default memories. + int64 min_prefetch_interval_; + int64 max_prefetch_interval_; + const HloAliasAnalysis& alias_analysis_; + IsAllowedInAlternateMemoryFunction is_allowed_in_alternate_mem_; + // We use a interval tree to keep track of the number of outstanding + // asynchronous copies. + BufferIntervalTree async_copy_interval_tree_; + int64 max_outstanding_async_copies_; + std::vector> pending_chunks_; + std::vector> pending_async_copies_; +}; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_H_ diff --git a/tensorflow/compiler/xla/service/memory_space_assignment_test.cc b/tensorflow/compiler/xla/service/memory_space_assignment_test.cc new file mode 100644 index 00000000000..99ce46c0799 --- /dev/null +++ b/tensorflow/compiler/xla/service/memory_space_assignment_test.cc @@ -0,0 +1,583 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/memory_space_assignment.h" + +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_matchers.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" + +namespace xla { +namespace { + +namespace op = xla::testing::opcode_matchers; + +class MemorySpaceAssignmentTest : public HloTestBase { + protected: + // We use the following two memory space values to describe the default (slow + // and large) and alternate (fast and small) memory spaces. 
+ const int64 kDefaultMemorySpace = 0; + const int64 kAlternateMemorySpace = 1; + + std::unique_ptr AssignMemorySpace( + HloModule* module, int64 max_outstanding_async_copies = -1) { + auto size_fn = [](const BufferValue& buffer) { + return ShapeUtil::ByteSizeOf(buffer.shape(), /*pointer_size=*/8); + }; + + auto is_allowed_in_alternate_mem = [](const HloValue& value) { + // Check if the value belongs to the entry computation. + HloInstruction* instruction = value.instruction(); + HloComputation* computation = instruction->parent(); + bool in_entry_computation = + (computation == computation->parent()->entry_computation()); + if (in_entry_computation && + instruction->opcode() == HloOpcode::kParameter) { + return false; + } + return true; + }; + + std::unique_ptr preset_assignments = + MemorySpaceAssignment::Run( + module, kAlternateMemorySpace, + /*max_size_in_bytes=*/128, + /*min_prefetch_interval=*/2, + /*max_prefetch_interval=*/10, + /*alternate_memory_space_alignment_in_bytes=*/8, size_fn, + is_allowed_in_alternate_mem, max_outstanding_async_copies) + .ValueOrDie(); + CheckPresetAssignments(preset_assignments.get()); + return preset_assignments; + } + + void CheckPresetAssignments(const PresetAssignments* preset_assignments) { + // Ensure that the exported preset assignments point to layouts in the + // alternate memory. Also ensure that the positions are unique. Note that + // we're using a std::set instead of absl::flat_hash_set because we can make + // use of HloPosition's comparator logic instead of providing a hasher. + std::set positions_in_preset_assignments; + for (auto& position_and_chunk : preset_assignments->chunks()) { + HloPosition position = position_and_chunk.first; + EXPECT_EQ(positions_in_preset_assignments.find(position), + positions_in_preset_assignments.end()); + positions_in_preset_assignments.insert(position); + const Shape& subshape = + ShapeUtil::GetSubshape(position.instruction->shape(), position.index); + EXPECT_EQ(subshape.layout().memory_space(), kAlternateMemorySpace) + << "Exported position is not in alternate mem: " + << position.ToString(); + } + } + + std::unique_ptr CreateEvictAndPrefetchModule() { + HloComputation::Builder builder(TestName()); + Shape shape = ShapeUtil::MakeShape(F32, {2, 3}); + HloInstruction* p0 = + builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "p0")); + HloInstruction* p1 = + builder.AddInstruction(HloInstruction::CreateParameter(1, shape, "p1")); + HloInstruction* tanh = builder.AddInstruction( + HloInstruction::CreateUnary(shape, HloOpcode::kTanh, p0)); + // tanh should be placed in the alternate memory since there isn't much + // contention in the beginning. However, tanh has another consumer at the + // end. So it should be kicked out to default memory and prefetched back in. + // The graph below is meant to increase the contention to force + // eviction/prefetch behavior. 
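As a worked check of the test configuration above: each dense f32[2,3] buffer is 2 * 3 * 4 = 24 bytes, which is already a multiple of the 8-byte alignment, so at most five such buffers can be resident in the 128-byte alternate memory at once. A minimal sketch of that arithmetic:

#include <cstdint>

// Byte size of a dense f32[2,3] buffer as used throughout these tests.
constexpr int64_t kF32Bytes = 4;
constexpr int64_t kTestBufferBytes = 2 * 3 * kF32Bytes;  // 24 bytes.

// With the 128-byte alternate memory and 8-byte alignment configured above,
// at most five such 24-byte buffers fit at the same time.
constexpr int64_t kMaxResidentTestBuffers = 128 / kTestBufferBytes;

static_assert(kTestBufferBytes % 8 == 0, "already aligned to 8 bytes");
static_assert(kMaxResidentTestBuffers == 5, "128 / 24 == 5");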
+ HloInstruction* a = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kAdd, p0, tanh)); + HloInstruction* b = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kSubtract, p0, p1)); + HloInstruction* c = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kMultiply, p0, p1)); + HloInstruction* d = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kSubtract, p0, p1)); + HloInstruction* e = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kMultiply, a, b)); + HloInstruction* f = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kMultiply, a, c)); + HloInstruction* g = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kMultiply, a, d)); + HloInstruction* h = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kMultiply, b, c)); + HloInstruction* i = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kMultiply, b, d)); + HloInstruction* j = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kMultiply, c, d)); + HloInstruction* k = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kAdd, e, f)); + HloInstruction* l = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kAdd, g, h)); + HloInstruction* m = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kAdd, i, j)); + HloInstruction* n = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kAdd, k, l)); + HloInstruction* o = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kAdd, n, m)); + // tanh is being used at the root instruction, and this should be + // prefetched. + HloInstruction* add = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kAdd, o, tanh)); + + auto module = CreateNewVerifiedModule(); + HloComputation* computation = module->AddEntryComputation(builder.Build()); + + HloSchedule schedule(module.get()); + schedule.set_sequence(computation, {p0, p1, tanh, a, b, c, d, e, f, g, h, i, + j, k, l, m, n, o, add}); + TF_CHECK_OK(module->set_schedule(schedule)); + return module; + } +}; + +TEST_F(MemorySpaceAssignmentTest, ParameterOnly) { + // A module consisting of a single parameter. Inputs/outputs are currently + // excluded from memory space assignment. + HloComputation::Builder builder(TestName()); + Shape shape = ShapeUtil::MakeShape(F32, {2, 3}); + HloInstruction* p0 = + builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "p0")); + + auto module = CreateNewVerifiedModule(); + HloComputation* computation = module->AddEntryComputation(builder.Build()); + + HloSchedule schedule(module.get()); + schedule.set_sequence(computation, {p0}); + TF_CHECK_OK(module->set_schedule(schedule)); + + AssignMemorySpace(module.get()); + + EXPECT_THAT(p0, op::ShapeWithLayout(shape)); +} + +TEST_F(MemorySpaceAssignmentTest, Simple) { + // A simple module with a few simple instructions. Expect this to be + // transformed with CopyStart and CopyDone instructions inserted after inputs + // and before outputs. 
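+  // Besides layouts, the test checks the exported preset assignments: every
+  // value placed in the alternate memory gets a chunk (an offset within the
+  // 128-byte space), so add and sub below should end up with two distinct
+  // offsets.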
+ HloComputation::Builder builder(TestName()); + Shape shape = ShapeUtil::MakeShape(F32, {2, 3}); + HloInstruction* p0 = + builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "p0")); + HloInstruction* p1 = + builder.AddInstruction(HloInstruction::CreateParameter(1, shape, "p1")); + HloInstruction* add = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kAdd, p0, p1)); + HloInstruction* sub = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kSubtract, p0, p1)); + HloInstruction* mul = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kMultiply, add, sub)); + + auto module = CreateNewVerifiedModule(); + HloComputation* computation = module->AddEntryComputation(builder.Build()); + + HloSchedule schedule(module.get()); + schedule.set_sequence(computation, {p0, p1, add, sub, mul}); + TF_CHECK_OK(module->set_schedule(schedule)); + + auto preset_assignments = AssignMemorySpace(module.get()); + + // Inputs and outputs are currently placed in the default memory. Everything + // else should be in the alternate memory. + Shape shape_in_alternate_mem = ShapeUtil::MakeShapeWithLayout( + F32, {2, 3}, + /*minor_to_major=*/{1, 0}, /*tiles=*/{}, /*element_size_in_bits=*/0, + kAlternateMemorySpace); + EXPECT_THAT(p0, op::ShapeWithLayout(shape)); + EXPECT_THAT(p1, op::ShapeWithLayout(shape)); + EXPECT_THAT(mul, op::ShapeWithLayout(shape)); + EXPECT_THAT(add, op::ShapeWithLayout(shape_in_alternate_mem)); + EXPECT_THAT(sub, op::ShapeWithLayout(shape_in_alternate_mem)); + + // Make sure the preset assignments is sane. + EXPECT_EQ(preset_assignments->chunks().size(), 2); + EXPECT_EQ(preset_assignments->sizes().size(), 1); + // Ensure the offset assigned to add and sub are different. + EXPECT_NE(preset_assignments->chunks()[0].second.offset, + preset_assignments->chunks()[1].second.offset); +} + +TEST_F(MemorySpaceAssignmentTest, NegateChain) { + // The negate chain is long enough for asynchronous copy to be inserted + // between p1 and add. 
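+  // For reference: the asynchronous copy inserted by the pass is a
+  // CopyStart/CopyDone pair, and the op::AsyncCopy(dest_space, src_space,
+  // operand) matcher used below matches a CopyDone fed by such a CopyStart.
+  // The seven negates that the copy overlaps put it comfortably inside the
+  // [min_prefetch_interval=2, max_prefetch_interval=10] window configured in
+  // AssignMemorySpace above.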
+ HloComputation::Builder builder(TestName()); + Shape shape = ShapeUtil::MakeShape(F32, {2, 3}); + HloInstruction* p0 = + builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "p0")); + HloInstruction* p1 = + builder.AddInstruction(HloInstruction::CreateParameter(1, shape, "p1")); + HloInstruction* negate0 = builder.AddInstruction( + HloInstruction::CreateUnary(shape, HloOpcode::kNegate, p0)); + HloInstruction* negate1 = builder.AddInstruction( + HloInstruction::CreateUnary(shape, HloOpcode::kNegate, negate0)); + HloInstruction* negate2 = builder.AddInstruction( + HloInstruction::CreateUnary(shape, HloOpcode::kNegate, negate1)); + HloInstruction* negate3 = builder.AddInstruction( + HloInstruction::CreateUnary(shape, HloOpcode::kNegate, negate2)); + HloInstruction* negate4 = builder.AddInstruction( + HloInstruction::CreateUnary(shape, HloOpcode::kNegate, negate3)); + HloInstruction* negate5 = builder.AddInstruction( + HloInstruction::CreateUnary(shape, HloOpcode::kNegate, negate4)); + HloInstruction* negate6 = builder.AddInstruction( + HloInstruction::CreateUnary(shape, HloOpcode::kNegate, negate5)); + HloInstruction* add = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kAdd, negate6, p1)); + + auto module = CreateNewVerifiedModule(); + HloComputation* computation = module->AddEntryComputation(builder.Build()); + + HloSchedule schedule(module.get()); + schedule.set_sequence(computation, {p0, p1, negate0, negate1, negate2, + negate3, negate4, negate5, negate6, add}); + TF_CHECK_OK(module->set_schedule(schedule)); + + AssignMemorySpace(module.get()); + + EXPECT_THAT(add, op::Add(op::Negate(), op::AsyncCopy(kAlternateMemorySpace, + kDefaultMemorySpace, + op::Parameter(1)))); + // Parameters are in the default memory space. + EXPECT_THAT(p0, op::ShapeWithLayout(shape)); + EXPECT_THAT(p1, op::ShapeWithLayout(shape)); + // Negate instructions are in the alternate memory space (1). + Shape shape_in_alternate_mem = ShapeUtil::MakeShapeWithLayout( + F32, {2, 3}, + /*minor_to_major=*/{1, 0}, /*tiles=*/{}, /*element_size_in_bits=*/0, + kAlternateMemorySpace); + EXPECT_THAT(negate0, op::ShapeWithLayout(shape_in_alternate_mem)); + EXPECT_THAT(negate1, op::ShapeWithLayout(shape_in_alternate_mem)); + EXPECT_THAT(negate2, op::ShapeWithLayout(shape_in_alternate_mem)); + EXPECT_THAT(negate3, op::ShapeWithLayout(shape_in_alternate_mem)); + EXPECT_THAT(negate4, op::ShapeWithLayout(shape_in_alternate_mem)); + EXPECT_THAT(negate5, op::ShapeWithLayout(shape_in_alternate_mem)); + EXPECT_THAT(negate6, op::ShapeWithLayout(shape_in_alternate_mem)); + // Ensure the CopyStart/CopyDone schedules. 
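+  // The copy should overlap the whole negate chain: CopyStart is placed right
+  // after the two parameters (index 2) and CopyDone right before its use at
+  // add (index 10).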
+ const HloInstructionSequence& sequence = + module->schedule().sequence(computation); + EXPECT_THAT(sequence.instructions()[0], op::Parameter(0)); + EXPECT_THAT(sequence.instructions()[1], op::Parameter(1)); + EXPECT_THAT(sequence.instructions()[2], op::CopyStart()); + EXPECT_THAT(sequence.instructions()[10], op::CopyDone()); +} + +TEST_F(MemorySpaceAssignmentTest, EvictAndPrefetch) { + std::unique_ptr module = CreateEvictAndPrefetchModule(); + + AssignMemorySpace(module.get()); + + EXPECT_THAT( + module->entry_computation()->root_instruction(), + op::Add(op::Add(), + op::AsyncCopy(kAlternateMemorySpace, kDefaultMemorySpace, + op::AsyncCopy(kDefaultMemorySpace, + kAlternateMemorySpace, op::Tanh())))); + + EXPECT_EQ(MemorySpaceAssignment::CountMaximumOutstandingAsyncCopies(*module), + 2); +} + +TEST_F(MemorySpaceAssignmentTest, EvictAndPrefetchLimitAsyncCopies0) { + std::unique_ptr module = CreateEvictAndPrefetchModule(); + + AssignMemorySpace(module.get(), /*max_outstanding_async_copies=*/0); + + EXPECT_EQ(MemorySpaceAssignment::CountMaximumOutstandingAsyncCopies(*module), + 0); +} + +TEST_F(MemorySpaceAssignmentTest, EvictAndPrefetchLimitAsyncCopies1) { + std::unique_ptr module = CreateEvictAndPrefetchModule(); + + AssignMemorySpace(module.get(), /*max_outstanding_async_copies=*/1); + + EXPECT_EQ(MemorySpaceAssignment::CountMaximumOutstandingAsyncCopies(*module), + 1); +} + +TEST_F(MemorySpaceAssignmentTest, While) { + auto module = CreateNewVerifiedModule(); + Shape shape = ShapeUtil::MakeShape(xla::F32, {2, 3}); + Shape scalar_shape = ShapeUtil::MakeShape(xla::F32, {}); + Shape tuple_shape = ShapeUtil::MakeTupleShape({shape, scalar_shape}); + + auto cond_builder = HloComputation::Builder("WhileCond"); + // Tuple param: 24 bytes (each elem has 8 byte pointer, 4 byte element) + HloInstruction* cond_param = cond_builder.AddInstruction( + HloInstruction::CreateParameter(0, tuple_shape, "cond_param")); + HloInstruction* cond_iter = cond_builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape, cond_param, 1)); + HloInstruction* cond_limit = cond_builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(50.f))); + // Free cond_param[] (16 bytes), Alloc PRED[] (1 byte) + HloInstruction* cond_lt = cond_builder.AddInstruction( + HloInstruction::CreateCompare(ShapeUtil::MakeShape(PRED, {}), cond_iter, + cond_limit, ComparisonDirection::kLt)); + HloComputation* cond_computation = + module->AddEmbeddedComputation(cond_builder.Build()); + + auto body_builder = HloComputation::Builder("WhileBody"); + // Tuple param: 24 bytes (each elem has 8 byte pointer, 4 byte element) + HloInstruction* body_param = body_builder.AddInstruction( + HloInstruction::CreateParameter(0, tuple_shape, "body_param")); + HloInstruction* body_iter = body_builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape, body_param, 1)); + HloInstruction* body_data = body_builder.AddInstruction( + HloInstruction::CreateGetTupleElement(shape, body_param, 0)); + HloInstruction* body_iter_increment = body_builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.f))); + HloInstruction* body_iter_next = + body_builder.AddInstruction(HloInstruction::CreateBinary( + scalar_shape, HloOpcode::kAdd, body_iter, body_iter_increment)); + HloInstruction* body_data_increment = + body_builder.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR2({{1.f, 2.f, 3.f}, {4.f, 5.f, 6.f}}))); + HloInstruction* body_data_mul = + 
body_builder.AddInstruction(HloInstruction::CreateBinary( + shape, HloOpcode::kMultiply, body_data, body_data)); + HloInstruction* body_data_add = + body_builder.AddInstruction(HloInstruction::CreateBinary( + shape, HloOpcode::kAdd, body_data, body_data_increment)); + HloInstruction* body_data_next = + body_builder.AddInstruction(HloInstruction::CreateBinary( + shape, HloOpcode::kAdd, body_data_add, body_data_mul)); + HloInstruction* body_out = body_builder.AddInstruction( + HloInstruction::CreateTuple({body_data_next, body_iter_next})); + HloComputation* body_computation = + module->AddEmbeddedComputation(body_builder.Build()); + + auto builder = HloComputation::Builder(TestName()); + HloInstruction* data = builder.AddInstruction( + HloInstruction::CreateParameter(0, shape, "param_iter")); + HloInstruction* iter = builder.AddInstruction( + HloInstruction::CreateParameter(1, scalar_shape, "param_data")); + HloInstruction* tuple = + builder.AddInstruction(HloInstruction::CreateTuple({data, iter})); + HloInstruction* while_op = builder.AddInstruction(HloInstruction::CreateWhile( + tuple_shape, cond_computation, body_computation, tuple)); + HloComputation* entry_computation = + module->AddEntryComputation(builder.Build()); + + HloSchedule schedule(module.get()); + schedule.set_sequence(cond_computation, + {cond_param, cond_iter, cond_limit, cond_lt}); + schedule.set_sequence(body_computation, + {body_param, body_iter, body_data, body_iter_increment, + body_iter_next, body_data_increment, body_data_mul, + body_data_add, body_data_next, body_out}); + schedule.set_sequence(entry_computation, {iter, data, tuple, while_op}); + TF_CHECK_OK(module->set_schedule(schedule)); + + AssignMemorySpace(module.get()); + + // Ensure the tuple value and buffers used in the while instruction are + // exempted from using the alternate memory. However, body_data_mul is + // independent and can be safely be placed in the alternate memory. 
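+  // The exemption is decided by the pass itself. A caller could additionally
+  // veto such values through the is_allowed_in_alternate_mem hook passed to
+  // MemorySpaceAssignment::Run; a hypothetical filter (not used by this test)
+  // might look like:
+  //   auto no_while_operands = [](const HloValue& value) {
+  //     for (const HloUse& use : value.uses()) {
+  //       if (use.instruction->opcode() == HloOpcode::kWhile) return false;
+  //     }
+  //     return true;
+  //   };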
+ EXPECT_THAT(tuple, op::ShapeWithLayout(tuple_shape)); + EXPECT_THAT(data, op::ShapeWithLayout(shape)); + EXPECT_THAT(iter, op::ShapeWithLayout(scalar_shape)); + EXPECT_THAT(body_data, op::ShapeWithLayout(shape)); + EXPECT_THAT(body_iter, op::ShapeWithLayout(scalar_shape)); + EXPECT_THAT(cond_iter, op::ShapeWithLayout(scalar_shape)); + Shape shape_in_alternate_mem = ShapeUtil::MakeShapeWithLayout( + F32, {2, 3}, + /*minor_to_major=*/{1, 0}, /*tiles=*/{}, /*element_size_in_bits=*/0, + kAlternateMemorySpace); + EXPECT_THAT(body_data_mul, op::ShapeWithLayout(shape_in_alternate_mem)); +} + +TEST_F(MemorySpaceAssignmentTest, Tuple) { + HloComputation::Builder builder(TestName()); + Shape shape = ShapeUtil::MakeShape(F32, {2, 3}); + Shape inner_tuple_shape = ShapeUtil::MakeTupleShape({shape}); + Shape tuple_shape = + ShapeUtil::MakeTupleShape({shape, shape, inner_tuple_shape}); + HloInstruction* p = builder.AddInstruction( + HloInstruction::CreateParameter(0, tuple_shape, "p")); + HloInstruction* p0 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(shape, p, 0)); + HloInstruction* negate0 = builder.AddInstruction( + HloInstruction::CreateUnary(shape, HloOpcode::kNegate, p0)); + HloInstruction* negate1 = builder.AddInstruction( + HloInstruction::CreateUnary(shape, HloOpcode::kNegate, negate0)); + HloInstruction* negate2 = builder.AddInstruction( + HloInstruction::CreateUnary(shape, HloOpcode::kNegate, negate1)); + HloInstruction* negate3 = builder.AddInstruction( + HloInstruction::CreateUnary(shape, HloOpcode::kNegate, negate2)); + HloInstruction* negate4 = builder.AddInstruction( + HloInstruction::CreateUnary(shape, HloOpcode::kNegate, negate3)); + HloInstruction* negate5 = builder.AddInstruction( + HloInstruction::CreateUnary(shape, HloOpcode::kNegate, negate4)); + HloInstruction* negate6 = builder.AddInstruction( + HloInstruction::CreateUnary(shape, HloOpcode::kNegate, negate5)); + HloInstruction* p1 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(shape, p, 1)); + HloInstruction* add = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kAdd, negate6, p1)); + HloInstruction* p2 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(inner_tuple_shape, p, 2)); + HloInstruction* p2_0 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(shape, p2, 0)); + HloInstruction* mul = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kMultiply, add, p2_0)); + + auto module = CreateNewVerifiedModule(); + HloComputation* computation = module->AddEntryComputation(builder.Build()); + + HloSchedule schedule(module.get()); + schedule.set_sequence( + computation, {p, p0, negate0, negate1, negate2, negate3, negate4, negate5, + negate6, p1, add, p2, p2_0, mul}); + TF_CHECK_OK(module->set_schedule(schedule)); + + AssignMemorySpace(module.get()); + + EXPECT_THAT( + mul, + op::Multiply(op::Add(op::Negate(), op::AsyncCopy(kAlternateMemorySpace, + kDefaultMemorySpace, + op::GetTupleElement())), + op::AsyncCopy(kAlternateMemorySpace, kDefaultMemorySpace, + op::GetTupleElement(op::GetTupleElement())))); +} + +TEST_F(MemorySpaceAssignmentTest, Bitcast) { + // Bitcasts can cause the position in the alternate memory to appear multiple + // times in the preset assignments. This test ensure the preset assignments + // refer to unique positions. 
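+  // A bitcast does not get a buffer of its own; it aliases its operand, so
+  // the same chunk in the alternate memory can show up both at the operand's
+  // position and at the bitcast's position. CheckPresetAssignments (in the
+  // fixture above) verifies that no position is exported twice.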
+ HloComputation::Builder builder(TestName()); + Shape shape = ShapeUtil::MakeShape(F32, {2, 3}); + HloInstruction* p0 = + builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "p0")); + HloInstruction* p1 = + builder.AddInstruction(HloInstruction::CreateParameter(1, shape, "p1")); + HloInstruction* negate = builder.AddInstruction( + HloInstruction::CreateUnary(shape, HloOpcode::kNegate, p0)); + HloInstruction* bitcast = + builder.AddInstruction(HloInstruction::CreateBitcast(shape, negate)); + HloInstruction* add = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kAdd, bitcast, p1)); + + auto module = CreateNewVerifiedModule(); + HloComputation* computation = module->AddEntryComputation(builder.Build()); + + HloSchedule schedule(module.get()); + schedule.set_sequence(computation, {p0, p1, negate, bitcast, add}); + TF_CHECK_OK(module->set_schedule(schedule)); + + AssignMemorySpace(module.get()); + + EXPECT_EQ(bitcast->shape().layout().memory_space(), kAlternateMemorySpace); +} + +TEST_F(MemorySpaceAssignmentTest, Bitcast2) { + HloComputation::Builder builder(TestName()); + Shape shape = ShapeUtil::MakeShape(F32, {2, 3}); + Shape param_shape = ShapeUtil::MakeShape(F32, {6}); + HloInstruction* p0 = + builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "p0")); + HloInstruction* p1 = builder.AddInstruction( + HloInstruction::CreateParameter(1, param_shape, "p1")); + HloInstruction* negate0 = builder.AddInstruction( + HloInstruction::CreateUnary(shape, HloOpcode::kNegate, p0)); + HloInstruction* negate1 = builder.AddInstruction( + HloInstruction::CreateUnary(shape, HloOpcode::kNegate, negate0)); + HloInstruction* negate2 = builder.AddInstruction( + HloInstruction::CreateUnary(shape, HloOpcode::kNegate, negate1)); + HloInstruction* negate3 = builder.AddInstruction( + HloInstruction::CreateUnary(shape, HloOpcode::kNegate, negate2)); + HloInstruction* negate4 = builder.AddInstruction( + HloInstruction::CreateUnary(shape, HloOpcode::kNegate, negate3)); + HloInstruction* bitcast = + builder.AddInstruction(HloInstruction::CreateBitcast(shape, p1)); + HloInstruction* add = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kAdd, bitcast, negate4)); + + auto module = CreateNewVerifiedModule(); + HloComputation* computation = module->AddEntryComputation(builder.Build()); + + HloSchedule schedule(module.get()); + schedule.set_sequence(computation, {p0, p1, negate0, negate1, negate2, + negate3, negate4, bitcast, add}); + TF_CHECK_OK(module->set_schedule(schedule)); + + AssignMemorySpace(module.get()); + + EXPECT_EQ(bitcast->shape().layout().memory_space(), kAlternateMemorySpace); +} + +TEST_F(MemorySpaceAssignmentTest, Bitcast3) { + HloComputation::Builder builder(TestName()); + Shape shape1 = ShapeUtil::MakeShape(F32, {2, 3}); + Shape shape2 = ShapeUtil::MakeShape(F32, {3, 2}); + Shape shape3 = ShapeUtil::MakeShape(F32, {1, 6}); + Shape param_shape = ShapeUtil::MakeShape(F32, {6}); + HloInstruction* p0 = + builder.AddInstruction(HloInstruction::CreateParameter(0, shape1, "p0")); + HloInstruction* p1 = builder.AddInstruction( + HloInstruction::CreateParameter(1, param_shape, "p1")); + HloInstruction* negate0 = builder.AddInstruction( + HloInstruction::CreateUnary(shape1, HloOpcode::kNegate, p0)); + HloInstruction* negate1 = builder.AddInstruction( + HloInstruction::CreateUnary(shape1, HloOpcode::kNegate, negate0)); + HloInstruction* negate2 = builder.AddInstruction( + HloInstruction::CreateUnary(shape1, HloOpcode::kNegate, negate1)); + 
HloInstruction* negate3 = builder.AddInstruction( + HloInstruction::CreateUnary(shape1, HloOpcode::kNegate, negate2)); + HloInstruction* negate4 = builder.AddInstruction( + HloInstruction::CreateUnary(shape1, HloOpcode::kNegate, negate3)); + HloInstruction* bitcast1 = + builder.AddInstruction(HloInstruction::CreateBitcast(shape1, p1)); + HloInstruction* add = builder.AddInstruction( + HloInstruction::CreateBinary(shape1, HloOpcode::kAdd, bitcast1, negate4)); + HloInstruction* bitcast2 = + builder.AddInstruction(HloInstruction::CreateBitcast(shape3, p1)); + HloInstruction* bitcast3 = + builder.AddInstruction(HloInstruction::CreateBitcast(shape2, bitcast2)); + HloInstruction* bitcast4 = + builder.AddInstruction(HloInstruction::CreateBitcast(shape2, add)); + HloInstruction* mul = builder.AddInstruction(HloInstruction::CreateBinary( + shape2, HloOpcode::kMultiply, bitcast3, bitcast4)); + + auto module = CreateNewVerifiedModule(); + HloComputation* computation = module->AddEntryComputation(builder.Build()); + + HloSchedule schedule(module.get()); + schedule.set_sequence(computation, + {p0, p1, negate0, negate1, negate2, negate3, negate4, + bitcast1, add, bitcast2, bitcast3, bitcast4, mul}); + TF_CHECK_OK(module->set_schedule(schedule)); + + AssignMemorySpace(module.get()); + + // We expect one bitcast on the LHS of multiply since bitcast(bitcast(foo)) is + // converted to bitcast(foo). + EXPECT_THAT( + mul, + op::Multiply( + op::Bitcast(op::AsyncCopy(kAlternateMemorySpace, kDefaultMemorySpace, + op::Parameter(1))), + op::Bitcast(op::Add( + op::Bitcast(op::AsyncCopy(kAlternateMemorySpace, + kDefaultMemorySpace, op::Parameter(1))), + op::Negate())))); + EXPECT_EQ(bitcast1->shape().layout().memory_space(), kAlternateMemorySpace); + EXPECT_EQ(add->shape().layout().memory_space(), kAlternateMemorySpace); + // bitcast2 will no longer have a consumer and should get DCE'd, so we don't + // care about its memory space. 
+ EXPECT_EQ(bitcast3->shape().layout().memory_space(), kAlternateMemorySpace); + EXPECT_EQ(bitcast4->shape().layout().memory_space(), kAlternateMemorySpace); +} + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/service/mlir_gpu/BUILD b/tensorflow/compiler/xla/service/mlir_gpu/BUILD index 72ca402427e..5a26ea1be22 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/BUILD +++ b/tensorflow/compiler/xla/service/mlir_gpu/BUILD @@ -38,14 +38,59 @@ cc_library( hdrs = ["mlir_compiler.h"], deps = [ ":failover_compiler", + ":lhlo_dialect_emitter", "//tensorflow/compiler/xla/service:compiler", + "//tensorflow/compiler/xla/service:dump", + "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service/gpu:gpu_constants", + "//tensorflow/compiler/xla/service/gpu:gpu_executable", + "//tensorflow/compiler/xla/service/gpu:gpu_hlo_schedule", "//tensorflow/compiler/xla/service/gpu:ir_emission_utils", "//tensorflow/compiler/xla/service/gpu:nvptx_compiler_impl", + "//tensorflow/compiler/xla/service/gpu:stream_assignment", "//tensorflow/compiler/xla/service/gpu:target_constants", "//tensorflow/core:lib", + "//tensorflow/stream_executor:stream_executor_headers", "@local_config_mlir//:IR", "@local_config_mlir//:LLVMDialect", ], alwayslink = True, # Contains compiler registration ) + +cc_library( + name = "lhlo_dialect_emitter", + srcs = ["lhlo_dialect_emitter.cc"], + hdrs = ["lhlo_dialect_emitter.h"], + deps = [ + "//tensorflow/compiler/mlir/xla:lhlo", + "//tensorflow/compiler/xla:status", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla/service:buffer_assignment", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service/gpu:thunk", + "//tensorflow/compiler/xla/service/gpu:thunk_emitter", + "//tensorflow/core:lib", + "//tensorflow/stream_executor:stream_executor_headers", + "@com_google_absl//absl/container:flat_hash_map", + "@local_config_mlir//:IR", + "@local_config_mlir//:LLVMDialect", + "@local_config_mlir//:StandardOps", + ], +) + +cc_library( + name = "mlir_irgen_test_base", + testonly = True, + srcs = ["mlir_irgen_test_base.cc"], + hdrs = ["mlir_irgen_test_base.h"], + deps = [ + ":failover_compiler", + ":mlir_compiler", + "//tensorflow/compiler/xla/service:hlo_parser", + "//tensorflow/compiler/xla/tests:codegen_test_base", + "//tensorflow/compiler/xla/tests:filecheck", + "//tensorflow/core:test", + "@llvm//:support", + "@local_config_mlir//:IR", + ], +) diff --git a/tensorflow/compiler/xla/service/mlir_gpu/failover_compiler.cc b/tensorflow/compiler/xla/service/mlir_gpu/failover_compiler.cc index f225e92bd30..4107d92da7e 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/failover_compiler.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/failover_compiler.cc @@ -50,25 +50,6 @@ StatusOr> FailoverCompiler::RunBackend( return result; } -Status FailoverCompiler::RunHloPassesOnModuleGroup( - HloModuleGroup* module_group, - absl::Span executors, - se::DeviceMemoryAllocator* device_allocator) { - // This is not supported by GPU compiler anyway. - return Unimplemented( - "Model partitioning not implemented for the failover compiler!"); -} - -StatusOr>> -FailoverCompiler::RunBackendOnModuleGroup( - std::unique_ptr module_group, - std::vector> stream_exec, - se::DeviceMemoryAllocator* device_allocator) { - // This is not supported by GPU compiler anyway. 
- return Unimplemented( - "Model partitioning not implemented for the failover compiler!"); -} - StatusOr>> FailoverCompiler::Compile( std::unique_ptr module_group, std::vector> stream_execs, diff --git a/tensorflow/compiler/xla/service/mlir_gpu/failover_compiler.h b/tensorflow/compiler/xla/service/mlir_gpu/failover_compiler.h index cfa542f2e38..05badaa98e1 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/failover_compiler.h +++ b/tensorflow/compiler/xla/service/mlir_gpu/failover_compiler.h @@ -57,16 +57,6 @@ class FailoverCompiler final : public Compiler { std::unique_ptr module, se::StreamExecutor* stream_exec, se::DeviceMemoryAllocator* device_allocator) override; - Status RunHloPassesOnModuleGroup( - HloModuleGroup* module_group, - absl::Span executors, - se::DeviceMemoryAllocator* device_allocator) override; - - StatusOr>> RunBackendOnModuleGroup( - std::unique_ptr module_group, - std::vector> stream_exec, - se::DeviceMemoryAllocator* device_allocator) override; - StatusOr>> Compile( std::unique_ptr module_group, std::vector> stream_execs, @@ -78,6 +68,9 @@ class FailoverCompiler final : public Compiler { HloCostAnalysis::ShapeSizeFunction ShapeSizeBytesFunction() const override; + Compiler* GetPrimary() const { return primary_.get(); } + Compiler* GetSecondary() const { return secondary_.get(); } + private: std::unique_ptr primary_; std::unique_ptr secondary_; diff --git a/tensorflow/compiler/xla/service/mlir_gpu/lhlo_dialect_emitter.cc b/tensorflow/compiler/xla/service/mlir_gpu/lhlo_dialect_emitter.cc new file mode 100644 index 00000000000..1f8241aeda3 --- /dev/null +++ b/tensorflow/compiler/xla/service/mlir_gpu/lhlo_dialect_emitter.cc @@ -0,0 +1,223 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/mlir_gpu/lhlo_dialect_emitter.h" + +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" // TF:local_config_mlir +#include "mlir/Dialect/StandardOps/Ops.h" // TF:local_config_mlir +#include "mlir/IR/Attributes.h" // TF:local_config_mlir +#include "mlir/IR/Builders.h" // TF:local_config_mlir +#include "mlir/IR/Function.h" // TF:local_config_mlir +#include "mlir/IR/Identifier.h" // TF:local_config_mlir +#include "mlir/IR/StandardTypes.h" // TF:local_config_mlir +#include "mlir/IR/Types.h" // TF:local_config_mlir +#include "tensorflow/compiler/mlir/xla/ir/lhlo_ops.h" +#include "tensorflow/compiler/xla/service/gpu/thunk.h" +#include "tensorflow/compiler/xla/service/gpu/thunk_emitter.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/core/lib/core/errors.h" + +namespace xla { +namespace mlir_gpu { +namespace { + +using ::mlir::ArrayRef; +using ::mlir::Attribute; +using ::mlir::Builder; +using ::mlir::FuncOp; +using ::mlir::Identifier; +using ::mlir::Location; +using ::mlir::ModuleOp; +using ::mlir::NamedAttribute; +using ::mlir::OpBuilder; +using ::mlir::Type; +using ::mlir::Value; +using ::mlir::LLVM::LLVMDialect; +using ::xla::gpu::Thunk; +using ::xla::gpu::ThunkEmitter; +using ::xla::gpu::ThunkSequence; + +Status InsertMlirOp(HloOpcode opcode, OpBuilder func_builder, Location loc, + ArrayRef rets, ArrayRef args, + ArrayRef> attrs) { + switch (opcode) { + case HloOpcode::kAdd: + func_builder.create<::mlir::xla_lhlo::AddOp>(loc, rets, args, attrs); + break; + case HloOpcode::kMultiply: + func_builder.create<::mlir::xla_lhlo::MulOp>(loc, rets, args, attrs); + break; + case HloOpcode::kSubtract: + func_builder.create<::mlir::xla_lhlo::SubOp>(loc, rets, args, attrs); + break; + case HloOpcode::kDivide: + func_builder.create<::mlir::xla_lhlo::DivOp>(loc, rets, args, attrs); + break; + case HloOpcode::kAnd: + func_builder.create<::mlir::xla_lhlo::AndOp>(loc, rets, args, attrs); + break; + case HloOpcode::kMinimum: + func_builder.create<::mlir::xla_lhlo::MinOp>(loc, rets, args, attrs); + break; + case HloOpcode::kMaximum: + func_builder.create<::mlir::xla_lhlo::MaxOp>(loc, rets, args, attrs); + break; + default: + return tensorflow::errors::Internal(absl::StrCat( + "Opcode ", HloOpcodeString(opcode), " is not supported.")); + } + return Status::OK(); +} + +StatusOr<::mlir::MemRefType> ConvertTensorType(const Shape& shape, + Builder builder) { + llvm::SmallVector array; + array.reserve(shape.dimensions_size()); + for (const auto dim : shape.dimensions()) { + array.push_back(dim); + } + switch (shape.element_type()) { + case PrimitiveType::PRED: + return builder.getMemRefType(array, builder.getI1Type()); + case PrimitiveType::F16: + return builder.getMemRefType(array, builder.getF16Type()); + case PrimitiveType::F32: + return builder.getMemRefType(array, builder.getF32Type()); + case PrimitiveType::F64: + return builder.getMemRefType(array, builder.getF64Type()); + case PrimitiveType::S8: + return builder.getMemRefType(array, builder.getIntegerType(8)); + case PrimitiveType::S16: + return builder.getMemRefType(array, builder.getIntegerType(16)); + case PrimitiveType::S32: + return builder.getMemRefType(array, 
builder.getIntegerType(32)); + case PrimitiveType::S64: + return builder.getMemRefType(array, builder.getIntegerType(64)); + default: + return tensorflow::errors::Internal(absl::StrCat( + "Unsupported type: ", PrimitiveType_Name(shape.element_type()))); + } +} + +StatusOr ConvertType(const Shape& shape, Builder builder) { + if (shape.IsTuple()) { + Type mlir_type; + llvm::SmallVector contents; + contents.reserve(shape.tuple_shapes_size()); + for (const auto& subtype : shape.tuple_shapes()) { + TF_ASSIGN_OR_RETURN(auto mlir_subtype, ConvertType(subtype, builder)); + contents.push_back(mlir_subtype); + } + return builder.getTupleType(contents); + } + return ConvertTensorType(shape, builder); +} + +StatusOr> GetInstructionArgTypes( + const HloInstruction& instruction, Builder builder) { + llvm::SmallVector arg_types; + for (auto operand : instruction.operands()) { + TF_ASSIGN_OR_RETURN(auto operand_type, + ConvertType(operand->shape(), builder)); + arg_types.push_back(operand_type); + } + TF_ASSIGN_OR_RETURN(auto operand_type, + ConvertType(instruction.shape(), builder)); + arg_types.push_back(operand_type); + return arg_types; +} + +} // namespace + +LhloDialectEmitter::LhloDialectEmitter(const HloModule& hlo_module, + const BufferAssignment& assignment, + const se::Platform* platform, + ModuleOp mlir_module) + : mlir_module_(mlir_module), + builder_(mlir_module_.getContext()), + buffer_assignment_(assignment), + platform_(platform), + thunk_sequence_(new ThunkSequence()) { + LLVMDialect* llvmDialect = + mlir_module.getContext()->getRegisteredDialect(); + pointer_size_ = llvmDialect->getLLVMModule().getDataLayout().getPointerSize(); +} + +void LhloDialectEmitter::AddThunkToThunkSequence(std::unique_ptr thunk) { + thunk_sequence_->push_back(std::move(thunk)); +} + +StatusOr LhloDialectEmitter::MaybeGetAllocationSlice( + const HloInstruction& hlo, const ShapeIndex& index) const { + return buffer_assignment_.GetUniqueSlice(&hlo, index); +} + +int64 LhloDialectEmitter::ByteSizeOf(const Shape& shape) const { + return ShapeUtil::ByteSizeOf(shape, pointer_size_); +} + +const se::Platform* LhloDialectEmitter::platform() const { return platform_; } + +Status LhloDialectEmitter::EmitComputation(const HloComputation& computation) { + return computation.root_instruction()->Accept(this); +} + +StatusOr LhloDialectEmitter::CreateFunction( + const HloInstruction& instr) { + TF_ASSIGN_OR_RETURN(auto args, GetInstructionArgTypes(instr, builder_)); + auto function_type = builder_.getFunctionType(args, {}); + auto function = + FuncOp::create(builder_.getUnknownLoc(), instr.name(), function_type); + mlir_module_.push_back(function); + function.addEntryBlock(); + instruction_to_mlir_func_[&instr] = function; + return function; +} + +Status LhloDialectEmitter::DefaultAction(HloInstruction* instr) { + TF_ASSIGN_OR_RETURN(auto function, CreateFunction(*instr)); + OpBuilder func_builder(function.getBody()); + llvm::SmallVector arg_values{function.args_begin(), + function.args_end()}; + llvm::SmallVector attributes{ + builder_.getNamedAttr("name", builder_.getStringAttr(instr->name()))}; + TF_RETURN_IF_ERROR(InsertMlirOp(instr->opcode(), func_builder, + builder_.getUnknownLoc(), ArrayRef{}, + arg_values, attributes)); + return Status::OK(); +} + +Status LhloDialectEmitter::HandleFusion(HloInstruction* fusion) { + LOG(FATAL) << "Not implemented yet."; +} + +Status LhloDialectEmitter::HandleCustomCall(HloInstruction* custom_call) { + return ThunkEmitter(this).HandleCustomCall(custom_call); +} + +Status 
LhloDialectEmitter::HandleParameter(HloInstruction* parameter) { + return Status::OK(); +} + +Status LhloDialectEmitter::FinishVisit(HloInstruction* root) { + return Status::OK(); +} + +} // namespace mlir_gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/mlir_gpu/lhlo_dialect_emitter.h b/tensorflow/compiler/xla/service/mlir_gpu/lhlo_dialect_emitter.h new file mode 100644 index 00000000000..7d0c818068a --- /dev/null +++ b/tensorflow/compiler/xla/service/mlir_gpu/lhlo_dialect_emitter.h @@ -0,0 +1,91 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_MLIR_GPU_LHLO_DIALECT_EMITTER_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_MLIR_GPU_LHLO_DIALECT_EMITTER_H_ + +#include "absl/container/flat_hash_map.h" +#include "mlir/IR/Builders.h" // TF:local_config_mlir +#include "mlir/IR/Function.h" // TF:local_config_mlir +#include "mlir/IR/MLIRContext.h" // TF:local_config_mlir +#include "mlir/IR/Module.h" // TF:local_config_mlir +#include "tensorflow/compiler/xla/service/buffer_assignment.h" +#include "tensorflow/compiler/xla/service/gpu/thunk.h" +#include "tensorflow/compiler/xla/service/gpu/thunk_emitter.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/status.h" + +namespace xla { +namespace mlir_gpu { + +// Implementation for the translation of HLO instructions to a ThunkSequence +// via MLIR using the LHLO dialect. +// Implements the DfsHloVisitor interface, emits LHLO computations as MLIR IR +// functions and transforms them into gpu::Thunk. +class LhloDialectEmitter : public DfsHloVisitorWithDefault, + private gpu::ThunkEmitter::EmissionContext { + public: + LhloDialectEmitter(const HloModule& hlo_module, + const BufferAssignment& assignment, + const se::Platform* platform, + ::mlir::ModuleOp mlir_module); + ~LhloDialectEmitter() override = default; + + Status EmitComputation(const HloComputation& computation); + + // The following methods implement the DfsHloVisitor interface. + // + // Default action which emits code for most operations. Operations which are + // special in some way are handled explicitly in HandleFoo methods. + Status DefaultAction(HloInstruction* instr) override; + + Status HandleFusion(HloInstruction* fusion) override; + Status HandleCustomCall(HloInstruction* custom_call) override; + Status HandleParameter(HloInstruction* parameter) override; + + Status FinishVisit(HloInstruction* root) override; + + // Transfers the ownship of thunk_sequence_ out. 
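+  // Typical use, mirroring MlirCompiler::RunBackend in this change: construct
+  // the emitter, call EmitComputation() on the entry computation, then pass
+  // the result of ConsumeThunkSequence() to a ThunkSchedule.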
+ std::unique_ptr ConsumeThunkSequence() { + return std::move(thunk_sequence_); + } + + private: + StatusOr<::mlir::FuncOp> CreateFunction(const HloInstruction& instr); + // Interface required by ThunkEmitter + void AddThunkToThunkSequence(std::unique_ptr thunk) override; + StatusOr MaybeGetAllocationSlice( + const HloInstruction& hlo, const ShapeIndex& index) const override; + int64 ByteSizeOf(const Shape& shape) const override; + const se::Platform* platform() const override; + + ::mlir::ModuleOp mlir_module_; + ::mlir::Builder builder_; + absl::flat_hash_map + instruction_to_mlir_func_; + const BufferAssignment& buffer_assignment_; + const se::Platform* platform_; + // Cached pointer size extracted from the mlir module. + unsigned pointer_size_; + // The thunk sequence this IrEmitter generates for the input computation. + std::unique_ptr thunk_sequence_; + + TF_DISALLOW_COPY_AND_ASSIGN(LhloDialectEmitter); +}; + +} // namespace mlir_gpu +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_MLIR_GPU_LHLO_DIALECT_EMITTER_H_ diff --git a/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.cc b/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.cc index 5421a3ae093..d240003b039 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.cc @@ -15,21 +15,41 @@ limitations under the License. #include "tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.h" -#include "mlir/LLVMIR/LLVMDialect.h" // TF:local_config_mlir +#include + +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" // TF:local_config_mlir +#include "mlir/IR/Location.h" // TF:local_config_mlir +#include "mlir/IR/MLIRContext.h" // TF:local_config_mlir +#include "mlir/IR/Module.h" // TF:local_config_mlir +#include "tensorflow/compiler/xla/service/dump.h" #include "tensorflow/compiler/xla/service/gpu/gpu_constants.h" +#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h" +#include "tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.h" #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" #include "tensorflow/compiler/xla/service/gpu/nvptx_compiler.h" +#include "tensorflow/compiler/xla/service/gpu/stream_assignment.h" #include "tensorflow/compiler/xla/service/gpu/target_constants.h" +#include "tensorflow/compiler/xla/service/gpu/thunk_schedule.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/service/mlir_gpu/failover_compiler.h" +#include "tensorflow/compiler/xla/service/mlir_gpu/lhlo_dialect_emitter.h" #include "tensorflow/core/lib/core/errors.h" namespace xla { -namespace mlir { +namespace mlir_gpu { +namespace { using ::mlir::MLIRContext; +using ::mlir::ModuleOp; +using ::mlir::OwningModuleRef; +using ::mlir::UnknownLoc; using ::mlir::LLVM::LLVMDialect; +using ::xla::gpu::GpuExecutable; +using ::xla::gpu::GpuHloSchedule; +using ::xla::gpu::GpuVersion; +using ::xla::gpu::StreamAssignment; +using ::xla::gpu::ThunkSchedule; -namespace { int64 ConfigureLLVMModuleAndGetPointerSize(MLIRContext* context) { LLVMDialect* dialect = context->getRegisteredDialect(); llvm::Module& module = dialect->getLLVMModule(); @@ -37,6 +57,7 @@ int64 ConfigureLLVMModuleAndGetPointerSize(MLIRContext* context) { module.setDataLayout(gpu::nvptx::kDataLayout); return module.getDataLayout().getPointerSize(); } + } // namespace MlirCompiler::MlirCompiler() @@ -51,34 +72,109 @@ StatusOr> MlirCompiler::RunHloPasses( se::DeviceMemoryAllocator* device_allocator) { // Until we find a reason to do something 
different, run the same passes // that the normal GPU backend runs. - TF_RETURN_IF_ERROR(xla::gpu::impl::OptimizeHloModule( - module.get(), stream_exec, device_allocator)); - - TF_RETURN_IF_ERROR( - xla::gpu::impl::PrepareHloModuleForIrEmitting(module.get())); + gpu::NVPTXCompiler xla_compiler; + TF_RETURN_IF_ERROR(xla_compiler.OptimizeHloModule(module.get(), stream_exec, + device_allocator)); + TF_RETURN_IF_ERROR(xla_compiler.PrepareHloModuleForIrEmitting(module.get())); return std::move(module); } +namespace { + +// TODO(b/137624192): Move this to custom call handling and share. +absl::optional CanShareBufferHint(const HloInstruction* user, + const HloInstruction* operand, + const ShapeIndex& user_index) { + if (user->opcode() == HloOpcode::kCustomCall) { + // Share the bias buffer with the parent instruction. + if (user->custom_call_target() == xla::gpu::kGemmCallTarget) { + if (user->operand_count() == 3 && user->operand(2) == operand) { + return true; + } + } + // The operand of cholesky can be shared with the first output. + if (user->custom_call_target() == xla::gpu::kCusolverCholeskyCallTarget) { + return user_index.size() == 1 && user_index[0] == 0; + } + } + return absl::nullopt; +} + +// TODO(b/137624192): Share this with nvptx backend. +GpuVersion GetGpuVersion(se::StreamExecutor* stream_exec) { + int cc_major, cc_minor; + const auto& device_description = stream_exec->GetDeviceDescription(); + if (!device_description.cuda_compute_capability(&cc_major, &cc_minor)) { + LOG(WARNING) + << "Couldn't get compute capability for device; assuming sm_20."; + cc_major = 2; + cc_minor = 0; + } + return std::make_pair(cc_major, cc_minor); +} + +} // namespace + StatusOr> MlirCompiler::RunBackend( std::unique_ptr module, se::StreamExecutor* stream_exec, se::DeviceMemoryAllocator* device_allocator) { - return Unimplemented("Not yet implemented in MLIR compiler"); -} + // Determine the HLO schedule, which is an ordering of HLO instructions. This + // is used by buffer assignment to enable buffer reuse, and the same ordering + // must also be used to determine the thunk launch schedule. + std::unique_ptr stream_assignment = + xla::gpu::AssignStreams(*module); + TF_ASSIGN_OR_RETURN( + std::unique_ptr hlo_schedule, + GpuHloSchedule::Build(*module, *stream_assignment, pointer_size_)); -Status MlirCompiler::RunHloPassesOnModuleGroup( - HloModuleGroup* module_group, - absl::Span executors, - se::DeviceMemoryAllocator* device_allocator) { - return Unimplemented("Not yet implemented in MLIR compiler"); -} + // Run buffer analysis on the HLO graph. This analysis figures out which + // temporary buffers are required to run the computation. 
+ TF_ASSIGN_OR_RETURN(std::unique_ptr buffer_assignment, + BufferAssigner::Run( + module.get(), hlo_schedule->ConsumeHloOrdering(), + BufferSizeBytesFunction(), + /*color_alignment=*/ + [](LogicalBuffer::Color) { + return xla::gpu::kXlaAllocatedBufferAlignBytes; + }, + /*allocate_buffers_for_constants=*/true, + /*colorer=*/BufferAssigner::DefaultColorer(), + /*must_not_live_out=*/{}, &CanShareBufferHint)); + DumpHloModuleIfEnabled(*module, *buffer_assignment, "after_optimizations"); -StatusOr>> -MlirCompiler::RunBackendOnModuleGroup( - std::unique_ptr module_group, - std::vector> stream_exec, - se::DeviceMemoryAllocator* device_allocator) { - return Unimplemented("Not yet implemented in MLIR compiler"); + MLIRContext mlir_context; + OwningModuleRef mlir_module = + ModuleOp::create(UnknownLoc::get(&mlir_context)); + LhloDialectEmitter lhlo_emitter(*module, *buffer_assignment, + stream_exec->platform(), *mlir_module); + + TF_RETURN_IF_ERROR( + lhlo_emitter.EmitComputation(*module->entry_computation())); + + if (module_hook_.callback && !module_hook_.apply_on_lowered) { + module_hook_.callback(*mlir_module); + } + + // TODO(b/137624192): Emit function per hlo and turn into ptx string and blob. + std::string ptx; + std::vector cubin; + + auto thunk_schedule = absl::make_unique( + lhlo_emitter.ConsumeThunkSequence(), std::move(stream_assignment), + hlo_schedule->ThunkLaunchOrder()); + + if (DumpingEnabledForHloModule(*module)) { + DumpToFileInDirOrStdout(*module, "thunk_schedule", + thunk_schedule->ToString()); + } + + // TODO(b/137624192): Add profiling support. + + return static_cast>( + absl::make_unique( + ptx, cubin, GetGpuVersion(stream_exec), std::move(thunk_schedule), + std::move(module), std::move(buffer_assignment), nullptr, nullptr)); } StatusOr>> MlirCompiler::Compile( @@ -94,14 +190,20 @@ MlirCompiler::CompileAheadOfTime(std::unique_ptr module_group, return Unimplemented("Not yet implemented in MLIR compiler"); } -} // namespace mlir +void MlirCompiler::SetModuleHook(IRHook module_hook) { + module_hook_ = module_hook; +} + +void MlirCompiler::RemoveModuleHook() { module_hook_ = {nullptr, false}; } + +} // namespace mlir_gpu } // namespace xla static bool InitModule() { xla::Compiler::RegisterCompilerFactory( stream_executor::cuda::kCudaPlatformId, []() { return absl::make_unique( - absl::make_unique(), + absl::make_unique(), absl::make_unique()); }); return true; diff --git a/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.h b/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.h index f02164c4d24..fdc71903a06 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.h +++ b/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.h @@ -17,10 +17,11 @@ limitations under the License. 
#define TENSORFLOW_COMPILER_XLA_SERVICE_MLIR_GPU_MLIR_COMPILER_H_ #include "mlir/IR/MLIRContext.h" // TF:local_config_mlir +#include "mlir/IR/Module.h" // TF:local_config_mlir #include "tensorflow/compiler/xla/service/compiler.h" namespace xla { -namespace mlir { +namespace mlir_gpu { // A Compiler implementation that converts XLAs IR to a matching MLIR dialect, // performs all lowering on the MLIR IR and finally converts MLIR to LLVMIR for @@ -39,16 +40,6 @@ class MlirCompiler : public Compiler { std::unique_ptr module, se::StreamExecutor* stream_exec, se::DeviceMemoryAllocator* device_allocator) override; - Status RunHloPassesOnModuleGroup( - HloModuleGroup* module_group, - absl::Span executors, - se::DeviceMemoryAllocator* device_allocator) override; - - StatusOr>> RunBackendOnModuleGroup( - std::unique_ptr module_group, - std::vector> stream_exec, - se::DeviceMemoryAllocator* device_allocator) override; - StatusOr>> Compile( std::unique_ptr module_group, std::vector> stream_execs, @@ -65,12 +56,21 @@ class MlirCompiler : public Compiler { }; } + struct IRHook { + std::function callback; + bool apply_on_lowered; + }; + + void SetModuleHook(IRHook module_hook); + void RemoveModuleHook(); + private: ::mlir::MLIRContext context_; int64 pointer_size_; + IRHook module_hook_; }; -} // namespace mlir +} // namespace mlir_gpu } // namespace xla #endif // TENSORFLOW_COMPILER_XLA_SERVICE_MLIR_GPU_MLIR_COMPILER_H_ diff --git a/tensorflow/compiler/xla/service/mlir_gpu/mlir_irgen_test_base.cc b/tensorflow/compiler/xla/service/mlir_gpu/mlir_irgen_test_base.cc new file mode 100644 index 00000000000..4b6a03270c7 --- /dev/null +++ b/tensorflow/compiler/xla/service/mlir_gpu/mlir_irgen_test_base.cc @@ -0,0 +1,73 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/mlir_gpu/mlir_irgen_test_base.h" + +#include +#include + +#include "llvm/Support/raw_ostream.h" +#include "mlir/IR/Module.h" // TF:local_config_mlir +#include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/service/mlir_gpu/failover_compiler.h" +#include "tensorflow/compiler/xla/tests/filecheck.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/test.h" + +namespace xla { +namespace mlir_gpu { + +void MlirIrGenTestBase::CompileAndVerifyIr( + std::unique_ptr hlo_module, const string& pattern, + bool match_lowered_ir) { + MlirCompiler* compiler = GetMLIRCompiler(); + string ir; + compiler->SetModuleHook({[&ir](mlir::ModuleOp module) -> Status { + std::string buffer_string; + llvm::raw_string_ostream ostream(buffer_string); + module.print(ostream); + ostream.flush(); + ir = buffer_string; + return Status::OK(); + }, + match_lowered_ir}); + Status status = CompileToExecutable(std::move(hlo_module)).status(); + compiler->RemoveModuleHook(); + TF_ASSERT_OK(status); + + StatusOr filecheck_result = RunFileCheck(ir, pattern); + TF_ASSERT_OK(filecheck_result.status()); + EXPECT_TRUE(filecheck_result.ValueOrDie()); +} + +void MlirIrGenTestBase::CompileAndVerifyIr(const string& hlo_text, + const string& expected_llvm_ir, + bool match_lowered_ir) { + HloModuleConfig config; + config.set_debug_options(GetDebugOptionsForTest()); + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnUnverifiedModule(hlo_text, config)); + CompileAndVerifyIr(std::move(module), expected_llvm_ir, match_lowered_ir); +} + +MlirCompiler* MlirIrGenTestBase::GetMLIRCompiler() { + // TODO(b/137624192): Remove failover once no longer in place. + FailoverCompiler* failover = + static_cast(backend().compiler()); + return static_cast(failover->GetPrimary()); +} + +} // namespace mlir_gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/mlir_gpu/mlir_irgen_test_base.h b/tensorflow/compiler/xla/service/mlir_gpu/mlir_irgen_test_base.h new file mode 100644 index 00000000000..613ddc27bf6 --- /dev/null +++ b/tensorflow/compiler/xla/service/mlir_gpu/mlir_irgen_test_base.h @@ -0,0 +1,59 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_MLIR_GPU_MLIR_IRGEN_TEST_BASE_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_MLIR_GPU_MLIR_IRGEN_TEST_BASE_H_ + +#include + +#include "tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.h" +#include "tensorflow/compiler/xla/tests/codegen_test_base.h" + +namespace xla { +namespace mlir_gpu { + +// Tests that verify IR emitted by the CPU/GPU backend is as expected. 
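+// A typical use, patterned after the Add test in
+// tests/mlir_gpu_lhlo_gen_test.cc from this change (the HLO text and
+// FileCheck pattern below are illustrative only):
+//
+//   class MyLhloTest : public MlirIrGenTestBase {};
+//
+//   TEST_F(MyLhloTest, Mul) {
+//     CompileAndVerifyIr(R"(
+// HloModule Mul
+// ENTRY %Mul (x: f32[2,2], y: f32[2,2]) -> f32[2,2] {
+//   %x = f32[2,2]{1,0} parameter(0)
+//   %y = f32[2,2]{1,0} parameter(1)
+//   ROOT %mul = f32[2,2]{1,0} multiply(f32[2,2]{1,0} %x, f32[2,2]{1,0} %y)
+// })",
+//                        R"(;CHECK: "xla_lhlo.mul")");
+//   }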
+class MlirIrGenTestBase : public CodegenTestBase { + protected: + // Compiles the given HLO module to MLIR IR and verifies the IR matches the + // given pattern. `pattern` is in the FileCheck pattern matching syntax + // (http://llvm.org/docs/CommandGuide/FileCheck.html). + // + // This function invokes the JIT compiler. + // + // If `match_lowered_ir` is true, match the version of the IR after lowering + // steps to LLVM IR are applied; otherwise, the IR before lowering is + // matched. + void CompileAndVerifyIr(std::unique_ptr hlo_module, + const string& pattern, bool match_lowered_ir = false); + + // A thin wrapper around CompileAndVerifyIr that parses `hlo_text` to create + // an HLO module. + void CompileAndVerifyIr(const string& hlo_text, + const string& expected_llvm_ir, + bool match_lowered_ir = false); + + // Compiles and returns module with optimizations from a given HLO. + StatusOr> GetOptimizedModule( + absl::string_view hlo); + + private: + MlirCompiler* GetMLIRCompiler(); +}; + +} // namespace mlir_gpu +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_MLIR_GPU_MLIR_IRGEN_TEST_BASE_H_ diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/BUILD b/tensorflow/compiler/xla/service/mlir_gpu/tests/BUILD new file mode 100644 index 00000000000..2e799381c48 --- /dev/null +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/BUILD @@ -0,0 +1,42 @@ +# TODO(herhut): describe this package. + +load("//tensorflow:tensorflow.bzl", "tf_cc_test") +load( + "//tensorflow/core/platform:default/build_config_root.bzl", + "tf_cuda_tests_tags", +) + +package( + default_visibility = [":friends"], + licenses = ["notice"], # Apache 2.0 +) + +package_group( + name = "friends", + includes = [ + "//tensorflow/compiler/xla:friends", + ], +) + +tf_cc_test( + name = "mlir_gpu_lhlo_gen_test", + srcs = ["mlir_gpu_lhlo_gen_test.cc"], + tags = tf_cuda_tests_tags(), + deps = [ + "//tensorflow/compiler/xla:debug_options_flags", + "//tensorflow/compiler/xla:literal", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_parser", + "//tensorflow/compiler/xla/service:mlir_gpu_plugin", + "//tensorflow/compiler/xla/service/gpu:gpu_executable", + "//tensorflow/compiler/xla/service/mlir_gpu:mlir_irgen_test_base", + "//tensorflow/compiler/xla/tests:filecheck", + "//tensorflow/core:lib", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/stream_executor/lib", + "@com_google_absl//absl/memory", + ], +) diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/mlir_gpu_lhlo_gen_test.cc b/tensorflow/compiler/xla/service/mlir_gpu/tests/mlir_gpu_lhlo_gen_test.cc new file mode 100644 index 00000000000..5e9413c1b5e --- /dev/null +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/mlir_gpu_lhlo_gen_test.cc @@ -0,0 +1,53 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include +#include + +#include "absl/memory/memory.h" +#include "tensorflow/compiler/xla/literal.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/mlir_gpu/mlir_irgen_test_base.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/core/platform/test.h" + +namespace xla { +namespace mlir_gpu { + +class LhloGenTest : public MlirIrGenTestBase {}; + +TEST_F(LhloGenTest, Add) { + CompileAndVerifyIr(R"( +HloModule Add + +ENTRY %Add (x: f32[2,2], y: f32[2,2]) -> f32[2,2] { + %x = f32[2,2]{1,0} parameter(0) + %y = f32[2,2]{1,0} parameter(1) + ROOT %add = f32[2,2]{1,0} add(f32[2,2]{1,0} %x, f32[2,2]{1,0} %y) +})", + R"( +;CHECK: module { +;CHECK: func @add(%{{.*}}: memref<2x2xf32>, %{{.*}}: memref<2x2xf32>, %{{.*}}: memref<2x2xf32>) { +;CHECK: "xla_lhlo.add"(%{{.*}}, %{{.*}}, %{{.*}}) {name = "add"} : (memref<2x2xf32>, memref<2x2xf32>, memref<2x2xf32>) -> () +;CHECK: } +;CHECK: } + )"); +} + +} // namespace mlir_gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/multi_output_fusion.cc b/tensorflow/compiler/xla/service/multi_output_fusion.cc index 582e59349e8..6c31f6bdc86 100644 --- a/tensorflow/compiler/xla/service/multi_output_fusion.cc +++ b/tensorflow/compiler/xla/service/multi_output_fusion.cc @@ -123,7 +123,6 @@ HloInstruction* MultiOutputFusion::Fuse(HloInstruction* instr1, if (fused->IsMultiOutputFusion()) { std::swap(remaining, fused); } - if (fused->opcode() == HloOpcode::kFusion) { remaining->MergeFusionInstructionIntoMultiOutput(fused); } else { @@ -249,14 +248,12 @@ bool MultiOutputFusion::LegalToFuse(HloInstruction* instr1, multioutput_user_is_not_gte(instr2)) { return false; } - if (is_connected(instr1, instr2)) { return false; } if (!ShapesCompatibleForFusion(instr1, instr2)) { return false; } - return true; } @@ -339,4 +336,5 @@ bool MultiOutputFusion::Perform() { } bool MultiOutputFusion::DoProducerConsumerMultiOutputFusion() { return false; } + } // namespace xla diff --git a/tensorflow/compiler/xla/service/multi_output_fusion.h b/tensorflow/compiler/xla/service/multi_output_fusion.h index 3d129c4ec50..9000370f6f3 100644 --- a/tensorflow/compiler/xla/service/multi_output_fusion.h +++ b/tensorflow/compiler/xla/service/multi_output_fusion.h @@ -40,8 +40,8 @@ namespace xla { // fused and their fusion profit scores. // // Function Perform() applies the optimization. It picks up the most profitable -// pair in the worklist_, check if it's legal to fuse and fuse the pair. -// After fusion, it updates the associated structure such as reachability_, +// pair in the worklist_, checks if it's legal to fuse and fuses the pair. +// After fusion, it updates the associated structures such as reachability_, // candidates_ and worklist_. // Note that the reachability map is updated based on the original computation. // This works because the reachability is monotonically increasing with @@ -105,13 +105,6 @@ class MultiOutputFusion : public HloModulePass { virtual bool DoProducerConsumerMultiOutputFusion(); private: - // Update the internal data structures after instr1 and instr2 are fused into - // one fusion instruction. - void Update(HloInstruction* instr1, HloInstruction* instr2); - - // Computation for the pass. 
- HloComputation* computation_; - // An internal data structure for each instruction in current computation. // When an instruction is removed, member 'hlo' is set to nullptr. struct FusionCandidate { @@ -119,16 +112,6 @@ class MultiOutputFusion : public HloModulePass { std::list> fusibles; explicit FusionCandidate(HloInstruction* hlo) : hlo(hlo) {} }; - std::vector candidates_; - - // A map that maps an instruction to the index_. - absl::flat_hash_map candidates_index_; - - // The reachability map of current computation. - std::unique_ptr reachability_; - - // This stores all the candidate instructions in current computation. - std::vector all_fusion_candidates_; // The pair of candidates to be fused and the profit score. struct ToBeFused { @@ -139,7 +122,10 @@ class MultiOutputFusion : public HloModulePass { : instr1(instr1), instr2(instr2), score(score) {} bool operator<(const ToBeFused& rhs) const { return score < rhs.score; } }; - std::priority_queue worklist_; + + // Update the internal data structures after instr1 and instr2 are fused into + // one fusion instruction. + void Update(HloInstruction* instr1, HloInstruction* instr2); int64 get_candidate_id(HloInstruction* instr) { return FindOrDie(candidates_index_, instr); @@ -156,6 +142,21 @@ class MultiOutputFusion : public HloModulePass { bool is_connected(HloInstruction* instr1, HloInstruction* instr2) { return reachability_->IsConnected(instr1, instr2); } + + std::vector candidates_; + std::priority_queue worklist_; + + // A map that maps an instruction to the index_. + absl::flat_hash_map candidates_index_; + + // The reachability map of current computation. + std::unique_ptr reachability_; + + // This stores all the candidate instructions in current computation. + std::vector all_fusion_candidates_; + + // Computation for the pass. 
+ HloComputation* computation_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/pattern_matcher.h b/tensorflow/compiler/xla/service/pattern_matcher.h index db2cd28d0c5..32e4c636327 100644 --- a/tensorflow/compiler/xla/service/pattern_matcher.h +++ b/tensorflow/compiler/xla/service/pattern_matcher.h @@ -455,9 +455,9 @@ class LayoutPattern { template auto AppendImpl(NewImpl new_impl) const -> LayoutPattern(std::declval(), - std::move(new_impl)))> { - auto new_allof = AllOf(impl_, std::move(new_impl)); + decltype(AllOf<::xla::Layout>(std::declval(), + std::move(new_impl)))> { + auto new_allof = AllOf<::xla::Layout>(impl_, std::move(new_impl)); return LayoutPattern(std::move(new_allof), matched_layout_); } @@ -869,7 +869,7 @@ class ShapePatternLayoutImpl { layout_.Match(&shape->layout(), option); } - bool Match(Shape* shape, MatchOption option) const { + bool Match(::xla::Shape* shape, MatchOption option) const { if (!LayoutUtil::HasLayout(*shape)) { EXPLAIN << "Shape does not have a layout"; return false; @@ -946,9 +946,10 @@ class ShapePattern { private: template auto AppendImpl(NewImpl new_impl) const - -> ShapePattern(std::declval(), - std::move(new_impl)))> { - auto new_all_of = AllOf(impl_, std::move(new_impl)); + -> ShapePattern(std::declval(), + std::move(new_impl)))> { + auto new_all_of = AllOf<::xla::Shape>(impl_, std::move(new_impl)); return ShapePattern(std::move(new_all_of), matched_shape_); } @@ -1077,7 +1078,7 @@ class ShapePattern { } ShapePattern& op2) : op1_(op1), op2_(op2) {} - bool Match(HloInstruction* inst, MatchOption option) const { + bool Match(::xla::HloInstruction* inst, MatchOption option) const { return MatchImpl(inst, option); } - bool Match(const HloInstruction* inst, MatchOption option) const { + bool Match(const ::xla::HloInstruction* inst, MatchOption option) const { return MatchImpl(inst, option); } @@ -1663,7 +1664,7 @@ class HloInstructionPatternOneUseOrUserImpl { class HloInstructionPatternOneUseImpl : public HloInstructionPatternOneUseOrUserImpl { public: - bool Match(const HloInstruction* inst, MatchOption option) const { + bool Match(const ::xla::HloInstruction* inst, MatchOption option) const { if (!MatchOneUser(inst, option)) { return false; } @@ -1688,7 +1689,7 @@ class HloInstructionPatternOneUseImpl class HloInstructionPatternOneUserImpl : public HloInstructionPatternOneUseOrUserImpl { public: - bool Match(const HloInstruction* inst, MatchOption option) const { + bool Match(const ::xla::HloInstruction* inst, MatchOption option) const { return MatchOneUser(inst, option); } @@ -1779,30 +1780,19 @@ class HloConstantScalarImpl { return true; } - // Check that literal == static_cast(val) and - // val == static_cast(literal). This is sufficient to ensure that - // the two constant scalars are actually "equal". 
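For orientation, a hedged usage sketch of the matcher this hunk rewrites; the m:: helper names are assumed from pattern_matcher.h's public interface rather than introduced by this change.

// Hypothetical usage: returns true if `instr` computes add(x, 0) and binds x.
bool IsAddOfZero(const xla::HloInstruction* instr,
                 const xla::HloInstruction** x) {
  namespace m = xla::match;
  return xla::Match(instr, m::Add(m::Op(x), m::ConstantScalar(0)));
}
// With this change, ConstantScalar(0) reshapes the matched constant's literal
// to an effective scalar and compares it via Literal::IsEqualAt({}, 0) instead
// of round-tripping both sides through Literal conversions.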
- auto val_literal = LiteralUtil::CreateR0(*val_); - auto literal_r0_or = const_inst->literal().Reshape({}); - auto val_as_literal_ty_or = - val_literal.Convert(const_inst->shape().element_type()); - if (!literal_r0_or.ok() || !val_as_literal_ty_or.ok()) { - EXPLAIN << "could not construct relevant Literals (how did this happen?)"; + auto const_inst_scalar_or = const_inst->literal().Reshape({}); + if (!const_inst_scalar_or.ok()) { + EXPLAIN << "could not convert matched literal to effective scalar"; return false; } - auto literal_r0 = std::move(literal_r0_or).ValueOrDie(); - auto val_as_literal_ty = std::move(val_as_literal_ty_or).ValueOrDie(); - auto literal_r0_as_val_ty_or = - literal_r0.Convert(val_literal.shape().element_type()); - bool rv = literal_r0_as_val_ty_or.ok() && // - literal_r0_as_val_ty_or.ValueOrDie() == val_literal && - literal_r0 == val_as_literal_ty; - if (!rv) { + Literal const_inst_scalar = std::move(const_inst_scalar_or).ValueOrDie(); + if (!const_inst_scalar.IsEqualAt({}, *val_)) { EXPLAIN << "HloInstruction's constant value " - << literal_r0.ToStringWithoutShape() + << const_inst_scalar.ToStringWithoutShape() << " did not match expected value " << *val_; + return false; } - return rv; + return true; } absl::optional val_; @@ -1815,9 +1805,9 @@ class HloInstructionPattern { private: template auto AppendImpl(NewImpl new_impl) const -> HloInstructionPattern< - HloInstructionType, decltype(AllOf( + HloInstructionType, decltype(AllOf<::xla::HloInstruction>( std::declval(), std::move(new_impl)))> { - auto new_allof = AllOf(impl_, std::move(new_impl)); + auto new_allof = AllOf<::xla::HloInstruction>(impl_, std::move(new_impl)); return HloInstructionPattern( std::move(new_allof), matched_inst_); } diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc index 5ec45eb491a..e3a7efff0b1 100644 --- a/tensorflow/compiler/xla/service/service.cc +++ b/tensorflow/compiler/xla/service/service.cc @@ -166,8 +166,9 @@ Service::Service(const ServiceOptions& options, << "Requested more replicas than there are devices."; } LOG(INFO) << StrFormat( - "XLA service %p executing computations on platform %s. Devices:", this, - execute_backend_->platform()->Name()); + "XLA service %p initialized for platform %s (this does not guarantee " + "that XLA will be used). Devices:", + this, execute_backend_->platform()->Name()); auto stream_executors = execute_backend_->stream_executors(); for (int i = 0; i < execute_backend_->device_count(); ++i) { se::StreamExecutor* executor = stream_executors.at(i); @@ -351,11 +352,11 @@ StatusOr>> Service::BuildExecutables( VLOG(1) << StrFormat("BuildExecutable on service %p", this); // Dump computation proto state if flag is set. 
- std::vector> hlo_snapshots; + std::vector> hlo_protos; for (int64 i = 0; i < module_protos.size(); ++i) { - auto hlo_snapshot = absl::make_unique(); - *hlo_snapshot->mutable_hlo()->mutable_hlo_module() = *module_protos[i]; - hlo_snapshots.push_back(std::move(hlo_snapshot)); + auto hlo_proto = absl::make_unique(); + *hlo_proto->mutable_hlo_module() = *module_protos[i]; + hlo_protos.push_back(std::move(hlo_proto)); } VLOG(1) << "Computations:"; @@ -383,7 +384,7 @@ StatusOr>> Service::BuildExecutables( const auto& debug_opts = module_configs[i]->debug_options(); if (DumpingEnabledForHloModule(module_protos[i]->name(), debug_opts) && debug_opts.xla_dump_hlo_snapshots()) { - executables[i]->set_hlo_snapshot(std::move(hlo_snapshots[i])); + executables[i]->set_hlo_proto(std::move(hlo_protos[i])); } } @@ -451,13 +452,19 @@ Service::ExecuteParallelAndRegisterResult( options.set_intra_op_thread_pool( backend->eigen_intra_op_thread_pool_device()); options.set_device_assignment(&device_assignment); + // Use run-time profile information from execution_profile on the 0th + // device. + if (i == 0) { + options.set_execution_profile(profile); + } ServiceExecutableRunOptions run_options(options, backend->StreamBorrower()); // Asynchronously launch the computation. TF_ASSIGN_OR_RETURN(ScopedShapedBuffer result, executables[i]->ExecuteAsyncOnStream( - &run_options, arguments[i][replica])); + &run_options, arguments[i][replica], + /*hlo_execution_profile=*/nullptr)); if (replica == 0 && profile != nullptr) { streams.back()->ThenStopTimer(timers.back().get()); @@ -490,10 +497,6 @@ Service::ExecuteParallelAndRegisterResult( uint64 nanoseconds = *std::max_element(timer_nanoseconds.begin(), timer_nanoseconds.end()); - // Merge in run-time profile information from execution_profile on the - // zeroth device. - profile->MergeFrom(executables[0]->execution_profile()); - // Overall execution time (in nanoseconds) from the executor timer. 
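Condensed sketch of the revised profiling flow, assembled from the surrounding hunks rather than new API: the ExecutionProfile is attached to the run options before launch, so merging the executable's execution_profile() after the fact is no longer needed.

// `profile`, `backend`, `executable` and `arguments` stand for the local
// variables already in scope in this function.
ExecutableRunOptions options;
options.set_execution_profile(profile);  // populated during execution
ServiceExecutableRunOptions run_options(options, backend->StreamBorrower());
TF_ASSIGN_OR_RETURN(ScopedShapedBuffer result,
                    executable->ExecuteAsyncOnStream(
                        &run_options, arguments,
                        /*hlo_execution_profile=*/nullptr));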
profile->set_compute_and_transfer_time_ns(nanoseconds); @@ -546,13 +549,13 @@ StatusOr Service::ExecuteAndRegisterResult( options.set_intra_op_thread_pool( backend->eigen_intra_op_thread_pool_device()); options.set_device_assignment(&device_assignment); + options.set_execution_profile(profile); run_options.emplace_back(options, backend->StreamBorrower()); } if (options_.number_of_replicas() == 1) { - TF_ASSIGN_OR_RETURN( - auto result, executable->ExecuteOnStreamWrapper(&run_options[0], - profile, arguments[0])); + TF_ASSIGN_OR_RETURN(auto result, executable->ExecuteOnStreamWrapper( + &run_options[0], arguments[0])); return allocation_tracker_.Register(std::move(result), result_tag); } @@ -692,14 +695,17 @@ Status Service::ExecuteGraphParallel(const ExecuteGraphParallelRequest* arg, executable_ptrs.push_back(executable.get()); } + std::vector snapshots; + snapshots.resize(executable_ptrs.size()); for (int i = 0; i < executable_ptrs.size(); i++) { if (executable_ptrs[i]->dumping_snapshot()) { + *snapshots[i].mutable_hlo() = *executable_ptrs[i]->hlo_proto(); TF_ASSIGN_OR_RETURN(auto stream, execute_backend_->BorrowStream( all_executors[i][0]->device_ordinal())); TF_RETURN_IF_ERROR(RecordArguments(all_arguments[i].front(), stream.get(), execute_backend_->transfer_manager(), - executable_ptrs[i]->hlo_snapshot())); + &snapshots[i])); } } @@ -746,9 +752,8 @@ Status Service::ExecuteGraphParallel(const ExecuteGraphParallelRequest* arg, execute_backend_->BorrowStream(all_executors[i][0])); TF_RETURN_IF_ERROR(RecordResult(*result_buffer, stream.get(), execute_backend_->transfer_manager(), - executable->hlo_snapshot())); - DumpHloSnapshotIfEnabled(executable->module(), - *executable->hlo_snapshot()); + &snapshots[i])); + DumpHloSnapshotIfEnabled(executable->module(), snapshots[i]); } } @@ -803,9 +808,9 @@ StatusOr> Service::BuildExecutable( const auto& debug_opts = module_config->debug_options(); if (DumpingEnabledForHloModule(module_proto.name(), debug_opts) && debug_opts.xla_dump_hlo_snapshots()) { - auto hlo_snapshot = absl::make_unique(); - *hlo_snapshot->mutable_hlo()->mutable_hlo_module() = module_proto; - executable->set_hlo_snapshot(std::move(hlo_snapshot)); + auto hlo_proto = absl::make_unique(); + *hlo_proto->mutable_hlo_module() = module_proto; + executable->set_hlo_proto(std::move(hlo_proto)); } return std::move(executable); @@ -891,12 +896,13 @@ Status Service::Execute(const ExecuteRequest* arg, ExecuteResponse* result) { TF_ASSIGN_OR_RETURN(auto stream, execute_backend_->BorrowStream( execute_backend_->default_stream_executor())); + HloSnapshot snapshot; if (executable->dumping_snapshot()) { - executable->hlo_snapshot()->set_execution_platform( - execute_backend_->platform()->Name()); - TF_RETURN_IF_ERROR(RecordArguments( - replicated_arguments.front(), stream.get(), - execute_backend_->transfer_manager(), executable->hlo_snapshot())); + *snapshot.mutable_hlo() = *executable->hlo_proto(); + snapshot.set_execution_platform(execute_backend_->platform()->Name()); + TF_RETURN_IF_ERROR( + RecordArguments(replicated_arguments.front(), stream.get(), + execute_backend_->transfer_manager(), &snapshot)); } TF_ASSIGN_OR_RETURN( @@ -913,8 +919,8 @@ Status Service::Execute(const ExecuteRequest* arg, ExecuteResponse* result) { allocation_tracker_.ResolveForReplica(result->output(), 0)); TF_RETURN_IF_ERROR(RecordResult(*result_buffer, stream.get(), execute_backend_->transfer_manager(), - executable->hlo_snapshot())); - DumpHloSnapshotIfEnabled(executable->module(), *executable->hlo_snapshot()); + 
&snapshot)); + DumpHloSnapshotIfEnabled(executable->module(), snapshot); } VLOG(1) << "successfully completed 'execute' request"; diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc index 3510e4913f4..30f6faada43 100644 --- a/tensorflow/compiler/xla/service/shape_inference.cc +++ b/tensorflow/compiler/xla/service/shape_inference.cc @@ -1711,15 +1711,13 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, const int64 kernel_output_features = rhs.dimensions(dnums.kernel_output_feature_dimension()); - if (batch_group_count > 1 && input_batch % kernel_output_features != 0) { + if (batch_group_count > 1 && kernel_output_features != batch_group_count) { return InvalidArgument( - "Expected input batch (value %d) to be divisible by output feature " - "dimension size (value %d) for batch group count %d; " - "got (%s, %s)\n" + "Expected output feature dimension size (value %d) to be equal to " + "batch group count %d; got (%s, %s)\n" "Dimension numbers: {%s}.", - input_batch, kernel_output_features, batch_group_count, - ShapeUtil::HumanString(lhs), ShapeUtil::HumanString(rhs), - dnums.DebugString()); + kernel_output_features, batch_group_count, ShapeUtil::HumanString(lhs), + ShapeUtil::HumanString(rhs), dnums.DebugString()); } if (input_features % feature_group_count != 0 || @@ -2119,10 +2117,16 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, /* static */ StatusOr ShapeInference::InferReduceWindowShape( const Shape& operand_shape, const Shape& init_value_shape, const Window& window, const ProgramShape& to_apply_shape) { - TF_RETURN_IF_ERROR(ExpectArray(operand_shape, "operand of reduce-window")); TF_RETURN_IF_ERROR(VerifyReducerShape(to_apply_shape, {&init_value_shape}, {operand_shape.element_type()}, /*inputs=*/1)); + return InferReduceWindowShape(operand_shape, init_value_shape, window); +} + +/* static */ StatusOr ShapeInference::InferReduceWindowShape( + const Shape& operand_shape, const Shape& init_value_shape, + const Window& window) { + TF_RETURN_IF_ERROR(ExpectArray(operand_shape, "operand of reduce-window")); return InferWindowOutputShape(operand_shape, window, init_value_shape.element_type(), /*allow_negative_padding=*/false); @@ -2207,6 +2211,60 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, return ShapeUtil::MakeShape(U32, {}); } +/* static */ StatusOr ShapeInference::InferWindowFromDimensions( + absl::Span window_dimensions, + absl::Span window_strides, + absl::Span> padding, + absl::Span lhs_dilation, + absl::Span rhs_dilation) { + const auto verify_size = [&](const size_t x, const char* x_name) { + if (x == 0 || x == window_dimensions.size()) { + return Status::OK(); + } else { + return InvalidArgument( + "%s", absl::StrCat( + "Window has different number of window dimensions than of ", + x_name, + "\nNumber of window dimensions: ", window_dimensions.size(), + "\nNumber of ", x_name, ": ", x, "\n")); + } + }; + TF_RETURN_IF_ERROR(verify_size(window_strides.size(), "window strides")); + TF_RETURN_IF_ERROR(verify_size(padding.size(), "padding entries")); + TF_RETURN_IF_ERROR(verify_size(lhs_dilation.size(), "lhs dilation factors")); + TF_RETURN_IF_ERROR(verify_size(rhs_dilation.size(), "rhs dilation factors")); + + Window window; + for (size_t i = 0; i < window_dimensions.size(); i++) { + auto dim = window.add_dimensions(); + dim->set_size(window_dimensions[i]); + if (!window_strides.empty()) { + dim->set_stride(window_strides[i]); + } 
else { + dim->set_stride(1); + } + if (!padding.empty()) { + dim->set_padding_low(padding[i].first); + dim->set_padding_high(padding[i].second); + } else { + dim->set_padding_low(0); + dim->set_padding_high(0); + } + if (!lhs_dilation.empty()) { + dim->set_base_dilation(lhs_dilation[i]); + } else { + dim->set_base_dilation(1); + } + if (!rhs_dilation.empty()) { + dim->set_window_dilation(rhs_dilation[i]); + } else { + dim->set_window_dilation(1); + } + dim->set_window_reversal(false); + } + return window; +} + /* static */ StatusOr ShapeInference::InferSliceShape( const Shape& arg, absl::Span starts, absl::Span limits, absl::Span strides) { diff --git a/tensorflow/compiler/xla/service/shape_inference.h b/tensorflow/compiler/xla/service/shape_inference.h index 590a664224e..393b45e5ac3 100644 --- a/tensorflow/compiler/xla/service/shape_inference.h +++ b/tensorflow/compiler/xla/service/shape_inference.h @@ -159,6 +159,10 @@ class ShapeInference { const Shape& operand_shape, const Shape& init_value, const Window& window, const ProgramShape& to_apply_shape); + static StatusOr InferReduceWindowShape(const Shape& operand_shape, + const Shape& init_value, + const Window& window); + // Infers the shape produced by scattering the given source shape to the // selected indices of each window on the operand shape. static StatusOr InferSelectAndScatterShape( @@ -295,6 +299,15 @@ class ShapeInference { static StatusOr InferGetDimensionSizeShape(const Shape& shape, int64 dimension); + // Helper function for creating a Window proto from user-supplied data. + // Returns error if the user-supplied data was invalid. + static StatusOr InferWindowFromDimensions( + absl::Span window_dimensions, + absl::Span window_strides, + absl::Span> padding, + absl::Span lhs_dilation, + absl::Span rhs_dilation); + private: // Helper that infers the shape produced by performing an element-wise binary // operation with the given LHS and RHS shapes. 
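A hedged usage sketch of the new InferWindowFromDimensions helper declared above: spans that are left empty fall back to per-dimension defaults (stride 1, zero padding, base and window dilation 1, no reversal), and mismatched span lengths are rejected by the verify_size check.

// Hypothetical call; the element types of the spans follow the declaration
// above (int64 dimensions and strides, pair<int64, int64> padding).
StatusOr<Window> window_or = ShapeInference::InferWindowFromDimensions(
    /*window_dimensions=*/{4, 4}, /*window_strides=*/{},
    /*padding=*/{}, /*lhs_dilation=*/{}, /*rhs_dilation=*/{});
// On success this describes a 4x4 window in which every dimension has
// stride 1, padding 0/0, base_dilation 1, window_dilation 1 and no reversal.
// Passing, say, three strides for two window dimensions would instead return
// InvalidArgument.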
diff --git a/tensorflow/compiler/xla/service/shape_inference_test.cc b/tensorflow/compiler/xla/service/shape_inference_test.cc index 3bfa971f857..c241a4ac2ce 100644 --- a/tensorflow/compiler/xla/service/shape_inference_test.cc +++ b/tensorflow/compiler/xla/service/shape_inference_test.cc @@ -573,6 +573,43 @@ TEST_F(ShapeInferenceTest, ConvolveDimensionNumbersOverlapError) { HasSubstr("each dimension exactly once")); } +TEST_F(ShapeInferenceTest, ConvolveBatchGroupCountUnequalOutputFeature) { + ConvolutionDimensionNumbers dnums; + dnums.set_input_batch_dimension(0); + dnums.set_input_feature_dimension(1); + dnums.add_input_spatial_dimensions(2); + dnums.add_input_spatial_dimensions(3); + dnums.set_kernel_input_feature_dimension(0); + dnums.set_kernel_output_feature_dimension(1); + dnums.add_kernel_spatial_dimensions(2); + dnums.add_kernel_spatial_dimensions(3); + dnums.set_output_batch_dimension(0); + dnums.set_output_feature_dimension(1); + dnums.add_output_spatial_dimensions(2); + dnums.add_output_spatial_dimensions(3); + Shape lhs_shape = ShapeUtil::MakeShape(F32, {60, 38, 17, 13}); + Shape rhs_shape = ShapeUtil::MakeShape(F32, {38, 10, 4, 4}); + Window window; + auto dim0 = window.add_dimensions(); + auto dim1 = window.add_dimensions(); + dim0->set_size(4); + dim1->set_size(4); + dim0->set_padding_low(0); + dim0->set_padding_high(2); + dim1->set_padding_low(2); + dim1->set_padding_high(1); + dim0->set_stride(1); + dim1->set_stride(1); + dim0->set_window_dilation(3); + dim1->set_window_dilation(2); + auto inferred_status = ShapeInference::InferConvolveShape( + lhs_shape, rhs_shape, /*feature_group_count=*/1, /*batch_group_count=*/6, + window, dnums); + ASSERT_FALSE(inferred_status.ok()); + ASSERT_THAT(inferred_status.status().error_message(), + HasSubstr("to be equal to batch group count")); +} + namespace fft { static const char* unsupported_rank = "only supports ranks 1-3"; diff --git a/tensorflow/compiler/xla/service/slow_operation_alarm.cc b/tensorflow/compiler/xla/service/slow_operation_alarm.cc new file mode 100644 index 00000000000..3a0bd830d30 --- /dev/null +++ b/tensorflow/compiler/xla/service/slow_operation_alarm.cc @@ -0,0 +1,136 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/slow_operation_alarm.h" + +#include +#include // NOLINT (for std::call_once, not std::mutex) + +#include "absl/algorithm/container.h" +#include "absl/base/thread_annotations.h" +#include "absl/memory/memory.h" +#include "absl/synchronization/mutex.h" +#include "tensorflow/core/platform/env.h" + +namespace xla { +namespace { + +absl::Mutex mu(absl::kConstInit); +absl::CondVar* ready; +std::once_flag init_flag; +std::list* outstanding_alarms ABSL_PT_GUARDED_BY(mu) = + nullptr; + +void AlarmLoop() { + while (true) { + absl::MutexLock lock(&mu); + + // Fire any alarms which are ready. 
+ absl::Time now = absl::Now(); + for (auto it = outstanding_alarms->begin(); + it != outstanding_alarms->end();) { + auto next = std::next(it); + auto* alarm = *it; + // Fire the alarm if applicable. + if (alarm->deadline() <= now) { + outstanding_alarms->erase(it); + int64 count = + alarm->counter() == nullptr ? 0 : alarm->counter()->fetch_add(1); + // If the alarm has a counter, only fire if the count is a power of 2. + if (count == 0 || (count & (count - 1)) == 0) { + // We fire alarms with LOG(ERROR) because otherwise it might not show + // up without --logtostderr. + LOG(ERROR) << alarm->msg(); + } + } + it = next; + } + + if (outstanding_alarms->empty()) { + ready->Wait(&mu); + continue; + } + + SlowOperationAlarm* next_alarm = *absl::c_min_element( + *outstanding_alarms, + [](const SlowOperationAlarm* a, const SlowOperationAlarm* b) { + return a->deadline() < b->deadline(); + }); + ready->WaitWithDeadline(&mu, next_alarm->deadline()); + } +} + +void ScheduleAlarm(SlowOperationAlarm* alarm) { + std::call_once(init_flag, [] { + ready = new absl::CondVar(); + outstanding_alarms = new std::list(); + (void)tensorflow::Env::Default()->StartThread( + tensorflow::ThreadOptions(), "SlowOperationAlarm", [] { AlarmLoop(); }); + }); + + absl::MutexLock lock(&mu); + outstanding_alarms->push_back(alarm); + ready->Signal(); +} + +void UnscheduleAlarm(const SlowOperationAlarm* alarm) { + absl::MutexLock lock(&mu); + CHECK(outstanding_alarms != nullptr); + auto it = absl::c_find(*outstanding_alarms, alarm); + if (it != outstanding_alarms->end()) { + outstanding_alarms->erase(it); + } +} + +} // namespace + +SlowOperationAlarm::SlowOperationAlarm(absl::Duration timeout, string msg, + std::atomic* counter /*=nullptr*/) + : deadline_(absl::Now() + timeout), + msg_(std::move(msg)), + counter_(counter) { + ScheduleAlarm(this); +} + +SlowOperationAlarm::~SlowOperationAlarm() { UnscheduleAlarm(this); } + +std::unique_ptr SlowCompilationAlarm() { + // Pass a counter to these alarms so they only log once every power-of-two + // occurrences. + static auto* counter = new std::atomic(0); + + const char* separator = "\n********************************"; +#if NDEBUG + return absl::make_unique( + absl::Duration(absl::Minutes(2)), + absl::StrCat( + separator, + "\nVery slow compile? If you want to file a bug, run with envvar " + "XLA_FLAGS=--xla_dump_to=/tmp/foo and attach the results.", + separator), + counter); +#else + return absl::make_unique( + absl::Duration(absl::Seconds(10)), + absl::StrCat( + separator, + "\nSlow compile? XLA was built without compiler optimizations, " + "which can be slow. Try rebuilding with -c opt.", + separator), + counter); +#endif +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/service/slow_operation_alarm.h b/tensorflow/compiler/xla/service/slow_operation_alarm.h new file mode 100644 index 00000000000..014fc7709f8 --- /dev/null +++ b/tensorflow/compiler/xla/service/slow_operation_alarm.h @@ -0,0 +1,70 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
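Aside (not part of the patch) on the throttling check used in AlarmLoop above: n & (n - 1) clears the lowest set bit, so the expression is zero exactly when the pre-increment counter is 0 or a power of two, which is what limits a counter-equipped alarm to logarithmically many log lines.

#include <cstdint>

// Mirrors the condition `count == 0 || (count & (count - 1)) == 0`.
constexpr bool FiresAt(int64_t count) {
  return count == 0 || (count & (count - 1)) == 0;
}
static_assert(FiresAt(0) && FiresAt(1) && FiresAt(2) && FiresAt(4), "");
static_assert(!FiresAt(3) && !FiresAt(6) && !FiresAt(12), "");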
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_SLOW_OPERATION_ALARM_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_SLOW_OPERATION_ALARM_H_ + +#include +#include +#include +#include + +#include "absl/time/time.h" +#include "tensorflow/compiler/xla/types.h" + +namespace xla { + +// This RAII object asynchronously prints a warning if it's alive for more than +// a certain amount of time. +class SlowOperationAlarm { + public: + // If `counter` is not null, this alarm will throttle itself to logging + // once-every-power-of-two occurrences. The counter must outlive this object. + SlowOperationAlarm(absl::Duration timeout, std::string msg, + std::atomic* counter = nullptr); + ~SlowOperationAlarm(); + + // Not copyable or movable, because the constructor stores a pointer to `this` + // into a global variable. + SlowOperationAlarm(const SlowOperationAlarm&) = delete; + SlowOperationAlarm(const SlowOperationAlarm&&) = delete; + SlowOperationAlarm& operator=(const SlowOperationAlarm&) = delete; + SlowOperationAlarm& operator=(const SlowOperationAlarm&&) = delete; + + absl::Time deadline() const { return deadline_; } + absl::string_view msg() const { return msg_; } + std::atomic* counter() { return counter_; } + + private: + absl::Time deadline_; + std::string msg_; + // counter_ may be null. If it's not, this alarm prints something only once + // every power of two occurrences. + std::atomic* counter_; +}; + +// Returns an object which prints a warning about slow compilation after a +// certain amount of time. +// +// In debug builds, recommends building with -c opt. +// +// In opt builds, recommends filing a bug. +// +// This is throttled to once-every-power-of-two occurrences, globally. +ABSL_MUST_USE_RESULT std::unique_ptr SlowCompilationAlarm(); + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_SLOW_OPERATION_ALARM_H_ diff --git a/tensorflow/compiler/xla/service/tree_reduction_rewriter.cc b/tensorflow/compiler/xla/service/tree_reduction_rewriter.cc new file mode 100644 index 00000000000..69af16ef428 --- /dev/null +++ b/tensorflow/compiler/xla/service/tree_reduction_rewriter.cc @@ -0,0 +1,110 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
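A hedged usage sketch of the RAII alarm declared above; CompileStep is a hypothetical placeholder for the guarded work, and the snippet assumes it lives inside namespace xla.

#include <memory>

void CompileStep();  // placeholder for the potentially slow work

void CompileWithAlarm() {
  // The warning is logged only if this scope outlives the alarm's timeout.
  std::unique_ptr<SlowOperationAlarm> alarm = SlowCompilationAlarm();
  CompileStep();
}  // alarm destroyed here; UnscheduleAlarm removes it if it has not fired yet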
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/tree_reduction_rewriter.h" + +#include +#include +#include + +#include "absl/algorithm/container.h" +#include "absl/strings/str_join.h" +#include "tensorflow/compiler/xla/client/padding.h" +#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/shape_inference.h" +#include "tensorflow/compiler/xla/shape.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/util.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/stream_executor/lib/statusor.h" + +namespace xla { + +class ReductionRewriterVisitor : public DfsHloRewriteVisitor { + public: + explicit ReductionRewriterVisitor(int64 reduce_window_size) + : reduce_window_size_(reduce_window_size) {} + + Status HandleReduce(HloInstruction *hlo) override { + HloInstruction *reduced_op = hlo->mutable_operand(0); + HloInstruction *initial_value = hlo->mutable_operand(1); + const Shape &input_shape = reduced_op->shape(); + const Shape &reduce_shape = hlo->shape(); + if (!reduce_shape.IsArray()) { + return Status::OK(); + } + auto reduced_dimensions = hlo->dimensions(); + std::vector window_dimensions; + std::vector window_strides; + for (int64 dim = 0; dim < input_shape.rank(); dim++) { + if (!absl::c_linear_search(hlo->dimensions(), dim)) { + window_dimensions.push_back(1); + window_strides.push_back(1); + continue; + } + // One of the reduced dimensions is smaller than the window size, + // do not perform the rewrite. 
+ if (input_shape.dimensions(dim) < reduce_window_size_) { + return Status::OK(); + } + + window_dimensions.push_back(reduce_window_size_); + window_strides.push_back(reduce_window_size_); + } + + std::vector> padding = + MakePadding(AsInt64Slice(input_shape.dimensions()), window_dimensions, + window_strides, Padding::kSame); + + TF_ASSIGN_OR_RETURN( + Window window, ShapeInference::InferWindowFromDimensions( + window_dimensions, window_strides, padding, {}, {})); + + TF_ASSIGN_OR_RETURN(Shape intermediate_shape, + ShapeInference::InferReduceWindowShape( + input_shape, initial_value->shape(), window)); + + HloInstruction *reduce_window = + hlo->parent()->AddInstruction(HloInstruction::CreateReduceWindow( + intermediate_shape, reduced_op, initial_value, window, + hlo->to_apply())); + + std::unique_ptr new_output = + HloInstruction::CreateReduce(reduce_shape, reduce_window, initial_value, + hlo->dimensions(), hlo->to_apply()); + + return ReplaceWithNewInstruction(hlo, std::move(new_output)); + } + + private: + int64 reduce_window_size_; +}; + +StatusOr TreeReductionRewriter::Run(HloModule *module) { + ReductionRewriterVisitor visitor(reduce_window_size_); + bool changed = false; + for (const auto &computation : module->MakeNonfusionComputations()) { + TF_RETURN_IF_ERROR(computation->Accept(&visitor)); + changed |= visitor.changed(); + } + + return changed; +} + +} // end namespace xla diff --git a/tensorflow/compiler/xla/service/tree_reduction_rewriter.h b/tensorflow/compiler/xla/service/tree_reduction_rewriter.h new file mode 100644 index 00000000000..a9852d88a6e --- /dev/null +++ b/tensorflow/compiler/xla/service/tree_reduction_rewriter.h @@ -0,0 +1,58 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_TREE_REDUCTION_REWRITER_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_TREE_REDUCTION_REWRITER_H_ + +#include + +#include "absl/strings/string_view.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_pass_interface.h" +#include "tensorflow/compiler/xla/statusor.h" + +namespace xla { + +// Increase precision for the reduction operation by applying the reduce-window +// first. +// +// E.g. suppose we want to reduce f32[1024] to a scalar. This pass first applies +// a reduce-window (with kSame padding) of size `reduce_window_size`, and then +// reduces the resulting array f32[32]. The rewrite is not applied if any of the +// reduced dimensions is smaller than the `reduce_window_size`. +// +// Applying this pass until a fixed point performs a variant of pairwise +// summation (https://en.wikipedia.org/wiki/Pairwise_summation), which is +// guaranteed to have an assymptotically smaller error bound provided that +// intermediate roundoff errors are random and have random sign. 
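A small, self-contained numerical illustration (not part of the patch) of the benefit described above: adding 2^25 ones one at a time in f32 stalls at 2^24, because 2^24 + 1 is not representable, while a blocked, two-level sum keeps every partial result exactly representable and recovers the true total.

#include <cstdio>

int main() {
  const long n = 1L << 25;  // sum 2^25 ones in single precision
  float sequential = 0.f;
  for (long i = 0; i < n; ++i) sequential += 1.f;  // stalls at 2^24

  // Two-level sum, analogous to a reduce-window of size 2^16 followed by a
  // second reduce over the partial results.
  float two_level = 0.f;
  const long block = 1L << 16;
  for (long b = 0; b < n; b += block) {
    float partial = 0.f;
    for (long i = b; i < b + block; ++i) partial += 1.f;  // each block is exact
    two_level += partial;  // multiples of 2^16 stay exactly representable
  }
  std::printf("sequential=%.1f two_level=%.1f exact=%ld\n",
              static_cast<double>(sequential), static_cast<double>(two_level),
              n);
  // Prints: sequential=16777216.0 two_level=33554432.0 exact=33554432
}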
+// +// If this pass lowers the performance too much, the window size can always be +// increased to a larger value. +class TreeReductionRewriter : public HloModulePass { + public: + explicit TreeReductionRewriter(int64 reduce_window_size = 32) + : reduce_window_size_(reduce_window_size) {} + ~TreeReductionRewriter() override = default; + absl::string_view name() const override { return "tree_reduction_rewriter"; } + + StatusOr Run(HloModule* module) override; + + private: + int64 reduce_window_size_; +}; + +} // end namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_TREE_REDUCTION_REWRITER_H_ diff --git a/tensorflow/compiler/xla/service/triangular_solve_expander.cc b/tensorflow/compiler/xla/service/triangular_solve_expander.cc index 57efee700be..0a8e2c3849f 100644 --- a/tensorflow/compiler/xla/service/triangular_solve_expander.cc +++ b/tensorflow/compiler/xla/service/triangular_solve_expander.cc @@ -266,8 +266,12 @@ XlaOp SolveWithInvertedDiagonalBlocks(XlaOp a, XlaOp b, XlaOp inv_diag_blocks, int64 m_dim = (left_side) ? -1 : -2; int64 m = ShapeUtil::GetDimension(b_shape, m_dim); + std::vector update_ops; + int bdims = b_shape.rank(); + int64 block_dim = (left_side) ? bdims - 2 : bdims - 1; + // Initialize the solution - auto x = ZerosLike(b); + XlaOp x; // This loop is unrolled for performance reasons, but it could be expressed // rolled as well since the matrices are of the same size each iteration @@ -278,7 +282,8 @@ XlaOp SolveWithInvertedDiagonalBlocks(XlaOp a, XlaOp b, XlaOp inv_diag_blocks, // can be solved for X[i] as X[i] = inv(L[i, i]) @ B[i] - L[i, :i] @ X[:i] // Decide whether we go from first block to last or vice versa - auto j = (left_side ^ lower ^ transpose_a) ? num_blocks - 1 - i : i; + bool backward = left_side ^ lower ^ transpose_a; + auto j = backward ? num_blocks - 1 - i : i; // Get the size of the inverse blocks (the last one might be smaller) int64 block = (n % block_size != 0 && j + 1 == num_blocks) @@ -304,9 +309,17 @@ XlaOp SolveWithInvertedDiagonalBlocks(XlaOp a, XlaOp b, XlaOp inv_diag_blocks, if (i == 0) { remainder = b_row; } else { - // This matrix multiply involves a lot of multiplying with zero (namely, - // X[i * block_size:] = 0), but this is faster than slicing... 
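A brief worked illustration, with hypothetical sizes (block_size = 8, num_blocks = 4, n = 32), of the narrowed product introduced below: at forward iteration i = j = 2 only X[0:16] has been computed, so restricting the GEMM to those columns gives the same result as the old full-width product whose extra columns hit rows of X that are still zero.

//   old bounds: end = {k, n}                           -> {k, 32}
//   new bounds: end = {k, std::min(i * block_size, n)} -> {k, 16}
// The backward case is symmetric, clipping the start column instead of the end.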
- end = {k, n}; + // This matrix multiply get rid of a lot of multiplying with zero + // (namely, X[i * block_size:] = 0), L[i, :i] @ X[:i] + if (backward) { + start = {j * block_size, + std::max(0LL, (num_blocks - i) * block_size)}; + end = {k, n}; + } else { + start = {j * block_size, 0}; + end = {k, std::min(i * block_size, n)}; + } + if (!left_side) { std::swap(end[0], end[1]); } @@ -335,7 +348,16 @@ XlaOp SolveWithInvertedDiagonalBlocks(XlaOp a, XlaOp b, XlaOp inv_diag_blocks, BatchDot(remainder, false, inv_block, transpose_a, precision); std::swap(update_starts[0], update_starts[1]); } - x = DynamicUpdateSliceInMinorDims(x, x_update, /*starts=*/update_starts); + + if (i == 0) { + x = x_update; + } else { + if (backward) { + x = ConcatInDim(builder, {x_update, x}, block_dim); + } else { + x = ConcatInDim(builder, {x, x_update}, block_dim); + } + } } return x; diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc index d0515fb5825..be7ad99aac4 100644 --- a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc +++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc @@ -564,8 +564,8 @@ TEST_F(TuplePointsToAnalysisTest, TupleWithBitcast) { HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); auto constant2 = builder.AddInstruction( HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0))); - auto bitcast = builder.AddInstruction(HloInstruction::CreateUnary( - constant2->shape(), HloOpcode::kBitcast, constant2)); + auto bitcast = builder.AddInstruction( + HloInstruction::CreateBitcast(constant2->shape(), constant2)); auto tuple = builder.AddInstruction(HloInstruction::CreateTuple({constant1, bitcast})); diff --git a/tensorflow/compiler/xla/shape_util.h b/tensorflow/compiler/xla/shape_util.h index ebb56746518..e2d74627c60 100644 --- a/tensorflow/compiler/xla/shape_util.h +++ b/tensorflow/compiler/xla/shape_util.h @@ -43,6 +43,8 @@ limitations under the License. namespace xla { +class ShapeIndexView; + // An index for specifying a particular nested subshape within a shape. Used in // ShapeUtil::GetSubshape and other interfaces. Shapes are recursive data // structures (trees) and ShapeIndex defines a path through the tree where each @@ -69,6 +71,8 @@ class ShapeIndex { template ShapeIndex(InputIt start, InputIt end) : indices_(start, end) {} + explicit ShapeIndex(ShapeIndexView v); + bool empty() const { return indices_.empty(); } size_t size() const { return indices_.size(); } void push_back(int64 value) { indices_.push_back(value); } @@ -137,6 +141,10 @@ class ShapeIndexView { CHECK(!empty()); return indices_.front(); } + int64 back() const { + CHECK(!empty()); + return indices_.back(); + } ShapeIndexView ConsumeFront() const { ShapeIndexView result = *this; result.indices_.remove_prefix(1); @@ -161,6 +169,9 @@ class ShapeIndexView { absl::Span indices_; }; +inline ShapeIndex::ShapeIndex(ShapeIndexView v) + : ShapeIndex(v.begin(), v.end()) {} + std::ostream& operator<<(std::ostream& out, const ShapeIndex& shape_index); std::ostream& operator<<(std::ostream& out, const ShapeIndexView& shape_index); diff --git a/tensorflow/compiler/xla/test.h b/tensorflow/compiler/xla/test.h index a657554dc2f..c20c1341541 100644 --- a/tensorflow/compiler/xla/test.h +++ b/tensorflow/compiler/xla/test.h @@ -41,6 +41,7 @@ limitations under the License. 
#else #include #include +#include #endif #include "tensorflow/core/platform/test.h" diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD index f67050863d3..ae0d70610be 100644 --- a/tensorflow/compiler/xla/tests/BUILD +++ b/tensorflow/compiler/xla/tests/BUILD @@ -3,7 +3,7 @@ load("//tensorflow/compiler/xla/tests:build_defs.bzl", "generate_backend_suites", "generate_backend_test_macros", "xla_test", "xla_test_library") load( - "//tensorflow/core:platform/default/build_config_root.bzl", + "//tensorflow/core/platform:default/build_config_root.bzl", "tf_cuda_tests_tags", ) load("//tensorflow:tensorflow.bzl", "tf_cc_binary", "tf_cc_test") @@ -296,9 +296,12 @@ xla_test( xla_test( name = "conv_depthwise_test", timeout = "long", - srcs = ["conv_depthwise_test.cc"], + srcs = [ + "conv_depthwise_test.cc", + ], shard_count = 50, deps = [ + ":conv_depthwise_common", ":test_macros_header", "//tensorflow/compiler/xla:execution_options_util", "//tensorflow/compiler/xla:status_macros", @@ -709,9 +712,151 @@ cc_library( ], ) +cc_library( + name = "conv_depthwise_common", + testonly = True, + srcs = ["conv_depthwise_common.cc"], + hdrs = ["conv_depthwise_common.h"], + deps = [ + ":test_macros_header", + "//tensorflow/compiler/xla:execution_options_util", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla/client:xla_computation", + "//tensorflow/compiler/xla/service:bfloat16_normalization", + "//tensorflow/compiler/xla/service:despecializer", + "//tensorflow/compiler/xla/service:hlo_parser", + "//tensorflow/compiler/xla/tests:client_library_test_base", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + "@com_google_absl//absl/types:optional", + ], +) + xla_test( - name = "exhaustive_unary_test", + name = "exhaustive_unary_test_f32_or_smaller", srcs = ["exhaustive_unary_test.cc"], + copts = ["-DUNARY_TEST_TARGET_F32_OR_SMALLER"], + real_hardware_only = True, # Very slow on the interpreter. + shard_count = 48, + tags = [ + "optonly", + # This is a big test that we skip for capacity reasons in OSS testing. + "no_oss", + ], + deps = [ + ":exhaustive_op_test_utils", + ], +) + +xla_test( + name = "exhaustive_unary_test_f64", + srcs = ["exhaustive_unary_test.cc"], + backends = [ + "gpu", + "cpu", + ], + copts = ["-DUNARY_TEST_TARGET_F64"], + real_hardware_only = True, # Very slow on the interpreter. + shard_count = 48, + tags = [ + "optonly", + # This is a big test that we skip for capacity reasons in OSS testing. + "no_oss", + ], + deps = [ + ":exhaustive_op_test_utils", + ], +) + +xla_test( + name = "exhaustive_unary_test_complex", + srcs = ["exhaustive_unary_test.cc"], + backends = [ + "gpu", + "cpu", + ], + copts = ["-DUNARY_TEST_TARGET_COMPLEX"], + real_hardware_only = True, # Very slow on the interpreter. + shard_count = 48, + tags = [ + "optonly", + # This is a big test that we skip for capacity reasons in OSS testing. + "no_oss", + ], + deps = [ + ":exhaustive_op_test_utils", + ], +) + +xla_test( + name = "exhaustive_binary_test_f16", + srcs = ["exhaustive_binary_test.cc"], + backends = [ + "gpu", + "cpu", + ], + copts = ["-DBINARY_TEST_TARGET_F16"], + real_hardware_only = True, # Very slow on the interpreter. + shard_count = 48, + tags = [ + "optonly", + # This is a big test that we skip for capacity reasons in OSS testing. 
+ "no_oss", + ], + deps = [ + ":exhaustive_op_test_utils", + ], +) + +xla_test( + name = "exhaustive_binary_test_bf16", + srcs = ["exhaustive_binary_test.cc"], + backends = [ + "gpu", + "cpu", + ], + copts = ["-DBINARY_TEST_TARGET_BF16"], + real_hardware_only = True, # Very slow on the interpreter. + shard_count = 48, + tags = [ + "optonly", + # This is a big test that we skip for capacity reasons in OSS testing. + "no_oss", + ], + deps = [ + ":exhaustive_op_test_utils", + ], +) + +xla_test( + name = "exhaustive_binary_test_f32", + srcs = ["exhaustive_binary_test.cc"], + backends = [ + "gpu", + "cpu", + ], + copts = ["-DBINARY_TEST_TARGET_F32"], + real_hardware_only = True, # Very slow on the interpreter. + shard_count = 48, + tags = [ + "optonly", + # This is a big test that we skip for capacity reasons in OSS testing. + "no_oss", + ], + deps = [ + ":exhaustive_op_test_utils", + ], +) + +xla_test( + name = "exhaustive_binary_test_f64", + srcs = ["exhaustive_binary_test.cc"], + backends = [ + "gpu", + "cpu", + ], + copts = ["-DBINARY_TEST_TARGET_F64"], real_hardware_only = True, # Very slow on the interpreter. shard_count = 48, tags = [ @@ -1505,6 +1650,7 @@ xla_test( name = "fmax_fmin_test", srcs = ["fmax_fmin_test.cc"], deps = [ + ":test_macros_header", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client:xla_builder", "//tensorflow/compiler/xla/client:xla_computation", @@ -1744,6 +1890,7 @@ xla_test( "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/service:hlo_runner", + "//tensorflow/compiler/xla/service/gpu:nccl_all_reduce_thunk", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", @@ -1954,7 +2101,6 @@ tf_cc_test( "//tensorflow/compiler/xla/service:llvm_compiler", "//tensorflow/compiler/xla/service:platform_util", "//tensorflow/compiler/xla/service/cpu:cpu_compiler", - "//tensorflow/compiler/xla/service/gpu:nvptx_compiler", "//tensorflow/compiler/xla/service/gpu:nvptx_compiler_impl", "//tensorflow/core:test", "//tensorflow/core:test_main", @@ -1987,8 +2133,8 @@ xla_test( ) xla_test( - name = "fusion_test", - srcs = ["fusion_test.cc"], + name = "cpu_gpu_fusion_test", + srcs = ["cpu_gpu_fusion_test.cc"], deps = [ ":test_macros_header", "//tensorflow/compiler/xla:array2d", diff --git a/tensorflow/compiler/xla/tests/build_defs.bzl b/tensorflow/compiler/xla/tests/build_defs.bzl index 48719c6c47c..7153ace8789 100644 --- a/tensorflow/compiler/xla/tests/build_defs.bzl +++ b/tensorflow/compiler/xla/tests/build_defs.bzl @@ -4,7 +4,7 @@ load("@local_config_cuda//cuda:build_defs.bzl", "cuda_is_configured") load("//tensorflow/compiler/xla/tests:plugin.bzl", "plugins") load("//tensorflow:tensorflow.bzl", "tf_cc_test") load( - "//tensorflow/core:platform/default/build_config_root.bzl", + "//tensorflow/core/platform:default/build_config_root.bzl", "tf_cuda_tests_tags", ) diff --git a/tensorflow/compiler/xla/tests/collective_ops_test.cc b/tensorflow/compiler/xla/tests/collective_ops_test.cc index b8439ee0fdd..efa7448f191 100644 --- a/tensorflow/compiler/xla/tests/collective_ops_test.cc +++ b/tensorflow/compiler/xla/tests/collective_ops_test.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include "absl/strings/str_replace.h" #include "absl/types/span.h" #include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.h" diff --git a/tensorflow/compiler/xla/tests/conv_depthwise_common.cc b/tensorflow/compiler/xla/tests/conv_depthwise_common.cc new file mode 100644 index 00000000000..e11ec33e730 --- /dev/null +++ b/tensorflow/compiler/xla/tests/conv_depthwise_common.cc @@ -0,0 +1,135 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/tests/conv_depthwise_common.h" + +#include "absl/types/optional.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" +#include "tensorflow/compiler/xla/execution_options_util.h" +#include "tensorflow/compiler/xla/service/bfloat16_normalization.h" +#include "tensorflow/compiler/xla/service/despecializer.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/test.h" +#include "tensorflow/compiler/xla/tests/client_library_test_base.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/compiler/xla/tests/test_macros.h" + +namespace xla { +string GetFloatDataType(bool use_bfloat16) { + return use_bfloat16 ? "bf16" : "f32"; +} + +string DepthwiseConvolution2DTestDataToString( + const ::testing::TestParamInfo< + ::testing::tuple>& data) { + const auto& spec = ::testing::get<0>(data.param); + const string data_type = GetFloatDataType(::testing::get<1>(data.param)); + string str = absl::StrCat( + "activation_dims_", absl::StrJoin(spec.activation_dims, "x"), + "_activation_layout_", absl::StrJoin(spec.activation_layout, "_"), + "_kernel_dims_", absl::StrJoin(spec.kernel_dims, "x"), "_kernel_layout_", + absl::StrJoin(spec.kernel_layout, "_"), "_output_dims_", + absl::StrJoin(spec.output_dims, "x"), "_output_layout_", + absl::StrJoin(spec.output_layout, "_"), data_type); + // -1 indicates non-existence. + if (spec.stride != -1) { + absl::StrAppend(&str, "_lhs_dilation_", spec.lhs_dilate, "x1"); + } + + // Test names are not allowed to contain the '-' character. + absl::c_replace(str, '-', 'n'); + return str; +} + +string BuildHloTextDepthwiseConvolution2D( + const DepthwiseConvolution2DSpec& spec, bool use_bfloat16, + bool is_scheduled) { + const string data_type = GetFloatDataType(use_bfloat16); + const string sched_tag = is_scheduled ? 
", is_scheduled=true " : ""; + if (spec.activation_dims[1] == 1 && spec.kernel_dims[1] == 2) { + return absl::StrFormat( + R"( + HloModule TensorFlowDepthwiseConv %s + ENTRY main { + activation = %s[%s]{%s} parameter(0) + kernel = %s[%s]{%s} parameter(1) + ROOT conv = %s[%s]{%s} convolution(%s[%s]{%s} activation, %s[%s]{%s} kernel), + window={size=%dx%d pad=1_1x%d_%d rhs_dilate=1x%d}, dim_labels=b01f_01io->b01f, + feature_group_count=%d + } + )", + sched_tag, data_type, absl::StrJoin(spec.activation_dims, ","), + absl::StrJoin(spec.activation_layout, ","), data_type, + absl::StrJoin(spec.kernel_dims, ","), + absl::StrJoin(spec.kernel_layout, ","), data_type, + absl::StrJoin(spec.output_dims, ","), + absl::StrJoin(spec.output_layout, ","), data_type, + absl::StrJoin(spec.activation_dims, ","), + absl::StrJoin(spec.activation_layout, ","), data_type, + absl::StrJoin(spec.kernel_dims, ","), + absl::StrJoin(spec.kernel_layout, ","), spec.window, spec.window, + spec.window, spec.window, spec.window, spec.output_feature); + + } else if (spec.stride == -1) { + return absl::StrFormat( + R"( + HloModule TensorFlowDepthwiseConv %s + ENTRY main { + activation = %s[%s]{%s} parameter(0) + kernel = %s[%s]{%s} parameter(1) + ROOT conv = %s[%s]{%s} convolution(%s[%s]{%s} activation, %s[%s]{%s} kernel), + window={size=%dx%d}, dim_labels=b01f_01io->b01f, + feature_group_count=%d + } + )", + sched_tag, data_type, absl::StrJoin(spec.activation_dims, ","), + absl::StrJoin(spec.activation_layout, ","), data_type, + absl::StrJoin(spec.kernel_dims, ","), + absl::StrJoin(spec.kernel_layout, ","), data_type, + absl::StrJoin(spec.output_dims, ","), + absl::StrJoin(spec.output_layout, ","), data_type, + absl::StrJoin(spec.activation_dims, ","), + absl::StrJoin(spec.activation_layout, ","), data_type, + absl::StrJoin(spec.kernel_dims, ","), + absl::StrJoin(spec.kernel_layout, ","), spec.window, spec.window, + spec.output_feature); + } else { + return absl::StrFormat( + R"( + HloModule TensorFlowDepthwiseConv %s + + ENTRY main { + activation = %s[%s]{%s} parameter(0) + kernel = %s[%s]{%s} parameter(1) + ROOT conv = %s[%s]{%s} convolution(%s[%s]{%s} activation, %s[%s]{%s} kernel), + window={size=%dx%d stride=%dx1 pad=%d_%dx0_0 lhs_dilate=%dx1}, + dim_labels=b01f_01io->b01f, feature_group_count=%d + } + )", + sched_tag, data_type, absl::StrJoin(spec.activation_dims, ","), + absl::StrJoin(spec.activation_layout, ","), data_type, + absl::StrJoin(spec.kernel_dims, ","), + absl::StrJoin(spec.kernel_layout, ","), data_type, + absl::StrJoin(spec.output_dims, ","), + absl::StrJoin(spec.output_layout, ","), data_type, + absl::StrJoin(spec.activation_dims, ","), + absl::StrJoin(spec.activation_layout, ","), data_type, + absl::StrJoin(spec.kernel_dims, ","), + absl::StrJoin(spec.kernel_layout, ","), spec.window, spec.window, + spec.stride, 0, 0, spec.lhs_dilate, spec.output_feature); + } +} +} // namespace xla diff --git a/tensorflow/compiler/xla/tests/conv_depthwise_common.h b/tensorflow/compiler/xla/tests/conv_depthwise_common.h new file mode 100644 index 00000000000..0c00f8d0abe --- /dev/null +++ b/tensorflow/compiler/xla/tests/conv_depthwise_common.h @@ -0,0 +1,53 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_TESTS_CONV_DEPTHWISE_COMMON_H_ +#define TENSORFLOW_COMPILER_XLA_TESTS_CONV_DEPTHWISE_COMMON_H_ + +#include "absl/types/optional.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" +#include "tensorflow/compiler/xla/execution_options_util.h" +#include "tensorflow/compiler/xla/service/bfloat16_normalization.h" +#include "tensorflow/compiler/xla/service/despecializer.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/test.h" +#include "tensorflow/compiler/xla/tests/client_library_test_base.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/compiler/xla/tests/test_macros.h" + +namespace xla { +string GetFloatDataType(bool use_bfloat16); + +struct DepthwiseConvolution2DSpec { + int64 output_feature, window, stride, pad, lhs_dilate; + std::vector activation_dims; + std::vector activation_layout; + std::vector kernel_dims; + std::vector kernel_layout; + std::vector output_dims; + std::vector output_layout; +}; + +string DepthwiseConvolution2DTestDataToString( + const ::testing::TestParamInfo< + ::testing::tuple>& data); + +string BuildHloTextDepthwiseConvolution2D( + const DepthwiseConvolution2DSpec& spec, bool use_bfloat16, + bool is_scheduled = false); + +} // namespace xla +#endif // TENSORFLOW_COMPILER_XLA_TESTS_CONV_DEPTHWISE_COMMON_H_ diff --git a/tensorflow/compiler/xla/tests/conv_depthwise_test.cc b/tensorflow/compiler/xla/tests/conv_depthwise_test.cc index fe958242329..98f6b5bc6d7 100644 --- a/tensorflow/compiler/xla/tests/conv_depthwise_test.cc +++ b/tensorflow/compiler/xla/tests/conv_depthwise_test.cc @@ -22,26 +22,13 @@ limitations under the License. #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" +#include "tensorflow/compiler/xla/tests/conv_depthwise_common.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/tests/test_macros.h" namespace xla { namespace { -string GetFloatDataType(bool use_bfloat16) { - return use_bfloat16 ? "bf16" : "f32"; -} - -struct DepthwiseConvolution2DSpec { - int64 output_feature, window, stride, pad, lhs_dilate; - std::vector activation_dims; - std::vector activation_layout; - std::vector kernel_dims; - std::vector kernel_layout; - std::vector output_dims; - std::vector output_layout; -}; - class DepthwiseConvolution2DTest : public HloTestBase, public ::testing::WithParamInterface< @@ -70,6 +57,7 @@ static std::vector GetConv2DTestCases() { config.kernel_dims = {kernel_size, kernel_size, 1, feature}; config.kernel_layout = {3, 2, 1, 0}; + config.output_layout = {3, 0, 2, 1}; if (activation_size == 1 && kernel_size == 2) { // Test for outer dim. @@ -87,127 +75,12 @@ static std::vector GetConv2DTestCases() { config.output_dims = {batch, activation_size - kernel_size + 1, activation_size - kernel_size + 1, feature}; } - - // Try this layout for all kernel shapes. 
- config.output_layout = {3, 0, 2, 1}; config_set.push_back(config); - - // Try other layouts only for certain kernel shapes. - if (kernel_size % 2 == 0) { - config.activation_layout = {0, 3, 2, 1}; - config_set.push_back(config); - - config.output_layout = {0, 3, 2, 1}; - config_set.push_back(config); - - config.activation_layout = {3, 0, 2, 1}; - config_set.push_back(config); - } } return config_set; } -string DepthwiseConvolution2DTestDataToString( - const ::testing::TestParamInfo< - ::testing::tuple>& data) { - const auto& spec = ::testing::get<0>(data.param); - const string data_type = GetFloatDataType(::testing::get<1>(data.param)); - string str = absl::StrCat( - "activation_dims_", absl::StrJoin(spec.activation_dims, "x"), - "_activation_layout_", absl::StrJoin(spec.activation_layout, "_"), - "_kernel_dims_", absl::StrJoin(spec.kernel_dims, "x"), "_kernel_layout_", - absl::StrJoin(spec.kernel_layout, "_"), "_output_dims_", - absl::StrJoin(spec.output_dims, "x"), "_output_layout_", - absl::StrJoin(spec.output_layout, "_"), data_type); - // -1 indicates non-existence. - if (spec.stride != -1) { - absl::StrAppend(&str, "_lhs_dilation_", spec.lhs_dilate, "x1"); - } - - // Test names are not allowed to contain the '-' character. - absl::c_replace(str, '-', 'n'); - return str; -} - -string BuildHloTextDepthwiseConvolution2D( - const DepthwiseConvolution2DSpec& spec, bool use_bfloat16) { - const string data_type = GetFloatDataType(use_bfloat16); - if (spec.activation_dims[1] == 1 && spec.kernel_dims[1] == 2) { - return absl::StrFormat( - R"( - HloModule TensorFlowDepthwiseConv - - ENTRY main { - activation = %s[%s]{%s} parameter(0) - kernel = %s[%s]{%s} parameter(1) - ROOT conv = %s[%s]{%s} convolution(%s[%s]{%s} activation, %s[%s]{%s} kernel), - window={size=%dx%d pad=1_1x%d_%d rhs_dilate=1x%d}, dim_labels=b01f_01io->b01f, - feature_group_count=%d - } - )", - data_type, absl::StrJoin(spec.activation_dims, ","), - absl::StrJoin(spec.activation_layout, ","), data_type, - absl::StrJoin(spec.kernel_dims, ","), - absl::StrJoin(spec.kernel_layout, ","), data_type, - absl::StrJoin(spec.output_dims, ","), - absl::StrJoin(spec.output_layout, ","), data_type, - absl::StrJoin(spec.activation_dims, ","), - absl::StrJoin(spec.activation_layout, ","), data_type, - absl::StrJoin(spec.kernel_dims, ","), - absl::StrJoin(spec.kernel_layout, ","), spec.window, spec.window, - spec.window, spec.window, spec.window, spec.output_feature); - - } else if (spec.stride == -1) { - return absl::StrFormat( - R"( - HloModule TensorFlowDepthwiseConv - - ENTRY main { - activation = %s[%s]{%s} parameter(0) - kernel = %s[%s]{%s} parameter(1) - ROOT conv = %s[%s]{%s} convolution(%s[%s]{%s} activation, %s[%s]{%s} kernel), - window={size=%dx%d}, dim_labels=b01f_01io->b01f, - feature_group_count=%d - } - )", - data_type, absl::StrJoin(spec.activation_dims, ","), - absl::StrJoin(spec.activation_layout, ","), data_type, - absl::StrJoin(spec.kernel_dims, ","), - absl::StrJoin(spec.kernel_layout, ","), data_type, - absl::StrJoin(spec.output_dims, ","), - absl::StrJoin(spec.output_layout, ","), data_type, - absl::StrJoin(spec.activation_dims, ","), - absl::StrJoin(spec.activation_layout, ","), data_type, - absl::StrJoin(spec.kernel_dims, ","), - absl::StrJoin(spec.kernel_layout, ","), spec.window, spec.window, - spec.output_feature); - } else { - return absl::StrFormat( - R"( - HloModule TensorFlowDepthwiseConv - - ENTRY main { - activation = %s[%s]{%s} parameter(0) - kernel = %s[%s]{%s} parameter(1) - ROOT conv = %s[%s]{%s} 
convolution(%s[%s]{%s} activation, %s[%s]{%s} kernel), - window={size=%dx%d stride=%dx1 pad=%d_%dx0_0 lhs_dilate=%dx1}, - dim_labels=b01f_01io->b01f, feature_group_count=%d - } - )", - data_type, absl::StrJoin(spec.activation_dims, ","), - absl::StrJoin(spec.activation_layout, ","), data_type, - absl::StrJoin(spec.kernel_dims, ","), - absl::StrJoin(spec.kernel_layout, ","), data_type, - absl::StrJoin(spec.output_dims, ","), - absl::StrJoin(spec.output_layout, ","), data_type, - absl::StrJoin(spec.activation_dims, ","), - absl::StrJoin(spec.activation_layout, ","), data_type, - absl::StrJoin(spec.kernel_dims, ","), - absl::StrJoin(spec.kernel_layout, ","), spec.window, spec.window, - spec.stride, 0, 0, spec.lhs_dilate, spec.output_feature); - } -} XLA_TEST_P(DepthwiseConvolution2DTest, DoIt) { const DepthwiseConvolution2DSpec& spec = ::testing::get<0>(GetParam()); diff --git a/tensorflow/compiler/xla/tests/convolution_test.cc b/tensorflow/compiler/xla/tests/convolution_test.cc index 0ab765aefa0..e656951a968 100644 --- a/tensorflow/compiler/xla/tests/convolution_test.cc +++ b/tensorflow/compiler/xla/tests/convolution_test.cc @@ -1842,15 +1842,11 @@ INSTANTIATE_TEST_CASE_P( Convolve1DTestParam{130, 1, 1, 1, 3}, Convolve1DTestParam{64, 1, 1, 1, 1}, Convolve1DTestParam{128, 1, 1, 1, 1}, -// TODO(b/72566306): The following five tests failed on CPU with unreasonable -// relative errors. Last ran on 2018-02-22. -#if XLA_TEST_BACKEND_GPU Convolve1DTestParam{139, 1, 1, 128, 1}, Convolve1DTestParam{640, 3, 3, 128, 1}, Convolve1DTestParam{900, 1, 1, 10, 1}, Convolve1DTestParam{1, 10, 10, 1, 10}, Convolve1DTestParam{1, 10, 130, 1, 1}, -#endif Convolve1DTestParam{1, 10, 130, 1, 2}, Convolve1DTestParam{1, 64, 64, 1, 10}, Convolve1DTestParam{1, 65, 65, 1, 1}, @@ -1946,7 +1942,8 @@ XLA_TEST_F(ConvolutionTest, ConvolveF32BackwardInputGroupedConvolution) { class ConvolutionHloTest : public HloTestBase {}; -XLA_TEST_F(ConvolutionHloTest, ConvolveF64Forward) { +// double datatype is not yet supported in ROCm +XLA_TEST_F(ConvolutionHloTest, DISABLED_ON_GPU_ROCM(ConvolveF64Forward)) { constexpr char kHlo[] = R"( HloModule TestModule @@ -1970,7 +1967,9 @@ ENTRY Test { EXPECT_TRUE(RunAndCompare(kHlo, ErrorSpec{0.001})); } -XLA_TEST_F(ConvolutionHloTest, ConvolveF64BackwardFilter) { +// double datatype is not yet supported in ROCm +XLA_TEST_F(ConvolutionHloTest, + DISABLED_ON_GPU_ROCM(ConvolveF64BackwardFilter)) { constexpr char kHlo[] = R"( HloModule TestModule @@ -1982,7 +1981,8 @@ ENTRY Test { EXPECT_TRUE(RunAndCompare(kHlo, ErrorSpec{0.001})); } -XLA_TEST_F(ConvolutionHloTest, ConvolveF64BackwardInput) { +// double datatype is not yet supported in ROCm +XLA_TEST_F(ConvolutionHloTest, DISABLED_ON_GPU_ROCM(ConvolveF64BackwardInput)) { constexpr char kHlo[] = R"( HloModule TestModule @@ -1995,5 +1995,18 @@ ENTRY Test { EXPECT_TRUE(RunAndCompare(kHlo, ErrorSpec{0.001})); } +XLA_TEST_F(ConvolutionHloTest, ConvolveBackwardInput) { + constexpr char kHlo[] = R"( +HloModule TestModule + +ENTRY Test { + %output = f32[3,3,64,64] parameter(0) + %kernel = f32[672,7,7,64] parameter(1) + %reverse = f32[672,7,7,64]{3,2,1,0} reverse(f32[672,7,7,64]{3,2,1,0} %kernel), dimensions={1,2} + ROOT %convolution = f32[672,9,9,64]{3,2,1,0} convolution(f32[3,3,64,64]{3,2,1,0} %output, f32[672,7,7,64]{3,2,1,0} %reverse), window={size=7x7 pad=6_6x6_6}, dim_labels=01bf_o01i->f01b +})"; + EXPECT_TRUE(RunAndCompare(kHlo, ErrorSpec{0.01, 0.01})); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/tests/fusion_test.cc 
b/tensorflow/compiler/xla/tests/cpu_gpu_fusion_test.cc similarity index 94% rename from tensorflow/compiler/xla/tests/fusion_test.cc rename to tensorflow/compiler/xla/tests/cpu_gpu_fusion_test.cc index 2d0805cdb0e..7719e89f9e8 100644 --- a/tensorflow/compiler/xla/tests/fusion_test.cc +++ b/tensorflow/compiler/xla/tests/cpu_gpu_fusion_test.cc @@ -60,7 +60,7 @@ const float test_float_vals[3][test_width][test_height] = { // Test whether fusion operations are emitted with no errors and compute // accurate outputs. -class FusionTest : public HloTestBase { +class CpuGpuFusionTest : public HloTestBase { protected: template void TestElementwise2D( @@ -148,8 +148,8 @@ class FusionTest : public HloTestBase { } }; -float FusionTest::ComputeElementwiseAnswerFloat(HloOpcode opcode, - absl::Span xs) { +float CpuGpuFusionTest::ComputeElementwiseAnswerFloat( + HloOpcode opcode, absl::Span xs) { switch (opcode) { case HloOpcode::kAdd: return xs[0] + xs[1]; @@ -172,8 +172,8 @@ float FusionTest::ComputeElementwiseAnswerFloat(HloOpcode opcode, } } -bool FusionTest::ComputeElementwiseAnswerCompare(ComparisonDirection direction, - absl::Span xs) { +bool CpuGpuFusionTest::ComputeElementwiseAnswerCompare( + ComparisonDirection direction, absl::Span xs) { switch (direction) { case ComparisonDirection::kEq: return xs[0] == xs[1]; @@ -190,7 +190,7 @@ bool FusionTest::ComputeElementwiseAnswerCompare(ComparisonDirection direction, } } -XLA_TEST_F(FusionTest, Test) { +XLA_TEST_F(CpuGpuFusionTest, Test) { // test expression: // slice(select({{T, F, T}, {F, T, F}}, // concat(transpose({{1.0}, {2.0}, {3.0}} + @@ -243,7 +243,7 @@ XLA_TEST_F(FusionTest, Test) { } // Test whether we emit appropriate code for parameters of fusion instructions. -XLA_TEST_F(FusionTest, Parameter) { +XLA_TEST_F(CpuGpuFusionTest, Parameter) { // Build a computation and fuse part of it so the fusion instruction has an // operand parameter. auto builder = HloComputation::Builder(TestName()); @@ -268,7 +268,7 @@ XLA_TEST_F(FusionTest, Parameter) { ExecuteAndTransfer(std::move(hlo_module), {}), ErrorSpec(1e-4))); } -XLA_TEST_F(FusionTest, RandomizedParallelPartition) { +XLA_TEST_F(CpuGpuFusionTest, RandomizedParallelPartition) { // Tests parallel partitioning of a fusion instruction. // Create shape with random outer dimension size to generate random parallel // partition counts for each test run. 
@@ -304,7 +304,7 @@ XLA_TEST_F(FusionTest, RandomizedParallelPartition) { } } -XLA_TEST_F(FusionTest, BroadcastIntoBinaryOp) { +XLA_TEST_F(CpuGpuFusionTest, BroadcastIntoBinaryOp) { auto builder = HloComputation::Builder(TestName()); auto hlo_module = CreateNewVerifiedModule(); auto const_vector = builder.AddInstruction(HloInstruction::CreateConstant( @@ -328,7 +328,7 @@ XLA_TEST_F(FusionTest, BroadcastIntoBinaryOp) { ExecuteAndTransfer(std::move(hlo_module), {}), ErrorSpec(1e-4))); } -XLA_TEST_F(FusionTest, ReshapeToScalar) { +XLA_TEST_F(CpuGpuFusionTest, ReshapeToScalar) { auto builder = HloComputation::Builder(TestName()); auto hlo_module = CreateNewVerifiedModule(); auto single_element_array = builder.AddInstruction( @@ -343,7 +343,7 @@ XLA_TEST_F(FusionTest, ReshapeToScalar) { ExecuteAndTransfer(std::move(hlo_module), {}))); } -XLA_TEST_F(FusionTest, Reshape_3by2_1by2by3) { +XLA_TEST_F(CpuGpuFusionTest, Reshape_3by2_1by2by3) { auto builder = HloComputation::Builder(TestName()); auto hlo_module = CreateNewVerifiedModule(); auto const0 = builder.AddInstruction(HloInstruction::CreateConstant( @@ -358,7 +358,7 @@ XLA_TEST_F(FusionTest, Reshape_3by2_1by2by3) { ExecuteAndTransfer(std::move(hlo_module), {}))); } -XLA_TEST_F(FusionTest, Reshape_1by2by3_3by2) { +XLA_TEST_F(CpuGpuFusionTest, Reshape_1by2by3_3by2) { auto builder = HloComputation::Builder(TestName()); auto hlo_module = CreateNewVerifiedModule(); auto const0 = builder.AddInstruction(HloInstruction::CreateConstant( @@ -373,7 +373,7 @@ XLA_TEST_F(FusionTest, Reshape_1by2by3_3by2) { ExecuteAndTransfer(std::move(hlo_module), {}))); } -XLA_TEST_F(FusionTest, Reshape_1by1by1_) { +XLA_TEST_F(CpuGpuFusionTest, Reshape_1by1by1_) { auto builder = HloComputation::Builder(TestName()); auto hlo_module = CreateNewVerifiedModule(); auto const0 = builder.AddInstruction( @@ -388,7 +388,7 @@ XLA_TEST_F(FusionTest, Reshape_1by1by1_) { ExecuteAndTransfer(std::move(hlo_module), {}))); } -XLA_TEST_F(FusionTest, Reshape__1by1by1) { +XLA_TEST_F(CpuGpuFusionTest, Reshape__1by1by1) { auto builder = HloComputation::Builder(TestName()); auto hlo_module = CreateNewVerifiedModule(); auto const0 = builder.AddInstruction( @@ -403,7 +403,7 @@ XLA_TEST_F(FusionTest, Reshape__1by1by1) { ExecuteAndTransfer(std::move(hlo_module), {}))); } -XLA_TEST_F(FusionTest, Reshape__) { +XLA_TEST_F(CpuGpuFusionTest, Reshape__) { auto builder = HloComputation::Builder(TestName()); auto hlo_module = CreateNewVerifiedModule(); auto const0 = builder.AddInstruction( @@ -418,7 +418,7 @@ XLA_TEST_F(FusionTest, Reshape__) { ExecuteAndTransfer(std::move(hlo_module), {}))); } -XLA_TEST_F(FusionTest, Reshape_3by3_3by3) { +XLA_TEST_F(CpuGpuFusionTest, Reshape_3by3_3by3) { auto builder = HloComputation::Builder(TestName()); auto hlo_module = CreateNewVerifiedModule(); auto const0 = builder.AddInstruction(HloInstruction::CreateConstant( @@ -433,7 +433,7 @@ XLA_TEST_F(FusionTest, Reshape_3by3_3by3) { ExecuteAndTransfer(std::move(hlo_module), {}))); } -XLA_TEST_F(FusionTest, Transpose_2by3) { +XLA_TEST_F(CpuGpuFusionTest, Transpose_2by3) { auto builder = HloComputation::Builder(TestName()); auto hlo_module = CreateNewVerifiedModule(); auto const0 = builder.AddInstruction(HloInstruction::CreateConstant( @@ -448,7 +448,7 @@ XLA_TEST_F(FusionTest, Transpose_2by3) { ExecuteAndTransfer(std::move(hlo_module), {}))); } -XLA_TEST_F(FusionTest, Transpose_3by3) { +XLA_TEST_F(CpuGpuFusionTest, Transpose_3by3) { auto builder = HloComputation::Builder(TestName()); auto hlo_module = 
CreateNewVerifiedModule(); auto const0 = builder.AddInstruction(HloInstruction::CreateConstant( @@ -463,7 +463,7 @@ XLA_TEST_F(FusionTest, Transpose_3by3) { ExecuteAndTransfer(std::move(hlo_module), {}))); } -XLA_TEST_F(FusionTest, Reverse) { +XLA_TEST_F(CpuGpuFusionTest, Reverse) { auto builder = HloComputation::Builder(TestName()); auto hlo_module = CreateNewVerifiedModule(); auto const0 = builder.AddInstruction( @@ -479,7 +479,7 @@ XLA_TEST_F(FusionTest, Reverse) { ExecuteAndTransfer(std::move(hlo_module), {}))); } -XLA_TEST_F(FusionTest, ReverseNegate) { +XLA_TEST_F(CpuGpuFusionTest, ReverseNegate) { auto builder = HloComputation::Builder(TestName()); auto hlo_module = CreateNewVerifiedModule(); auto const0 = builder.AddInstruction( @@ -497,7 +497,7 @@ XLA_TEST_F(FusionTest, ReverseNegate) { ExecuteAndTransfer(std::move(hlo_module), {}))); } -XLA_TEST_F(FusionTest, BroadcastNegate) { +XLA_TEST_F(CpuGpuFusionTest, BroadcastNegate) { auto builder = HloComputation::Builder(TestName()); auto hlo_module = CreateNewVerifiedModule(); auto const0 = builder.AddInstruction( @@ -515,7 +515,7 @@ XLA_TEST_F(FusionTest, BroadcastNegate) { ExecuteAndTransfer(std::move(hlo_module), {}))); } -XLA_TEST_F(FusionTest, SliceNegate) { +XLA_TEST_F(CpuGpuFusionTest, SliceNegate) { auto builder = HloComputation::Builder(TestName()); auto hlo_module = CreateNewVerifiedModule(); auto const0 = builder.AddInstruction(HloInstruction::CreateConstant( @@ -533,7 +533,7 @@ XLA_TEST_F(FusionTest, SliceNegate) { ExecuteAndTransfer(std::move(hlo_module), {}))); } -XLA_TEST_F(FusionTest, DynamicSliceNegate) { +XLA_TEST_F(CpuGpuFusionTest, DynamicSliceNegate) { auto builder = HloComputation::Builder(TestName()); auto hlo_module = CreateNewVerifiedModule(); auto const0 = builder.AddInstruction(HloInstruction::CreateConstant( @@ -555,7 +555,7 @@ XLA_TEST_F(FusionTest, DynamicSliceNegate) { ExecuteAndTransfer(std::move(hlo_module), {}))); } -XLA_TEST_F(FusionTest, ReshapeNegate) { +XLA_TEST_F(CpuGpuFusionTest, ReshapeNegate) { auto builder = HloComputation::Builder(TestName()); auto hlo_module = CreateNewVerifiedModule(); auto const0 = builder.AddInstruction(HloInstruction::CreateConstant( @@ -573,7 +573,7 @@ XLA_TEST_F(FusionTest, ReshapeNegate) { ExecuteAndTransfer(std::move(hlo_module), {}))); } -XLA_TEST_F(FusionTest, TransposeNegate) { +XLA_TEST_F(CpuGpuFusionTest, TransposeNegate) { auto builder = HloComputation::Builder(TestName()); auto hlo_module = CreateNewVerifiedModule(); auto const0 = builder.AddInstruction(HloInstruction::CreateConstant( @@ -602,7 +602,7 @@ std::unique_ptr MakeReduceTestComputation() { return builder.Build(); } -XLA_TEST_F(FusionTest, DISABLED_ON_CPU(Reduce)) { +XLA_TEST_F(CpuGpuFusionTest, DISABLED_ON_CPU(Reduce)) { auto hlo_module = CreateNewVerifiedModule(); auto builder = HloComputation::Builder(TestName()); auto const0 = builder.AddInstruction( @@ -621,7 +621,7 @@ XLA_TEST_F(FusionTest, DISABLED_ON_CPU(Reduce)) { ExecuteAndTransfer(std::move(hlo_module), {}))); } -XLA_TEST_F(FusionTest, ReduceImplicitBroadcast) { +XLA_TEST_F(CpuGpuFusionTest, ReduceImplicitBroadcast) { auto hlo_module = CreateNewVerifiedModule(); auto builder = HloComputation::Builder(TestName()); @@ -643,7 +643,7 @@ XLA_TEST_F(FusionTest, ReduceImplicitBroadcast) { ExecuteAndTransfer(std::move(hlo_module), {}))); } -XLA_TEST_F(FusionTest, DISABLED_ON_CPU(ReduceWindow)) { +XLA_TEST_F(CpuGpuFusionTest, DISABLED_ON_CPU(ReduceWindow)) { auto builder = HloComputation::Builder(TestName()); auto hlo_module = 
CreateNewVerifiedModule(); auto const0 = builder.AddInstruction(HloInstruction::CreateConstant( @@ -696,7 +696,7 @@ XLA_TEST_F(FusionTest, DISABLED_ON_CPU(ReduceWindow)) { // When a constant (or other op) which has multiple users is imported // into a fusion, it should remain shared, rather than being duplicated // within the fusion. -XLA_TEST_F(FusionTest, SharedConstant) { +XLA_TEST_F(CpuGpuFusionTest, SharedConstant) { auto hlo_module = CreateNewVerifiedModule(); auto builder = HloComputation::Builder(TestName()); @@ -729,57 +729,59 @@ XLA_TEST_F(FusionTest, SharedConstant) { ExecuteAndTransfer(std::move(hlo_module), {}))); } -XLA_TEST_F(FusionTest, Add2D) { TestElementwise2D(HloOpcode::kAdd); } +XLA_TEST_F(CpuGpuFusionTest, Add2D) { + TestElementwise2D(HloOpcode::kAdd); +} -XLA_TEST_F(FusionTest, Subtract2D) { +XLA_TEST_F(CpuGpuFusionTest, Subtract2D) { TestElementwise2D(HloOpcode::kSubtract); } -XLA_TEST_F(FusionTest, Multiply2D) { +XLA_TEST_F(CpuGpuFusionTest, Multiply2D) { TestElementwise2D(HloOpcode::kMultiply); } -XLA_TEST_F(FusionTest, Divide2D) { +XLA_TEST_F(CpuGpuFusionTest, Divide2D) { TestElementwise2D(HloOpcode::kDivide); } -XLA_TEST_F(FusionTest, Power2D) { +XLA_TEST_F(CpuGpuFusionTest, Power2D) { TestElementwise2D(HloOpcode::kPower); } -XLA_TEST_F(FusionTest, Minimum2D) { +XLA_TEST_F(CpuGpuFusionTest, Minimum2D) { TestElementwise2D(HloOpcode::kMinimum); } -XLA_TEST_F(FusionTest, Maximum2D) { +XLA_TEST_F(CpuGpuFusionTest, Maximum2D) { TestElementwise2D(HloOpcode::kMaximum); } -XLA_TEST_F(FusionTest, Equal2D) { +XLA_TEST_F(CpuGpuFusionTest, Equal2D) { TestElementwise2D(HloOpcode::kCompare, ComparisonDirection::kEq); } -XLA_TEST_F(FusionTest, Inequal2D) { +XLA_TEST_F(CpuGpuFusionTest, Inequal2D) { TestElementwise2D(HloOpcode::kCompare, ComparisonDirection::kNe); } -XLA_TEST_F(FusionTest, Greater2D) { +XLA_TEST_F(CpuGpuFusionTest, Greater2D) { TestElementwise2D(HloOpcode::kCompare, ComparisonDirection::kGt); } -XLA_TEST_F(FusionTest, Lesser2D) { +XLA_TEST_F(CpuGpuFusionTest, Lesser2D) { TestElementwise2D(HloOpcode::kCompare, ComparisonDirection::kLt); } -XLA_TEST_F(FusionTest, GreaterOrEqual2D) { +XLA_TEST_F(CpuGpuFusionTest, GreaterOrEqual2D) { TestElementwise2D(HloOpcode::kCompare, ComparisonDirection::kGe); } -XLA_TEST_F(FusionTest, LesserOrEqual2D) { +XLA_TEST_F(CpuGpuFusionTest, LesserOrEqual2D) { TestElementwise2D(HloOpcode::kCompare, ComparisonDirection::kLe); } -XLA_TEST_F(FusionTest, Clamp2D) { +XLA_TEST_F(CpuGpuFusionTest, Clamp2D) { TestElementwise2D(HloOpcode::kClamp); } diff --git a/tensorflow/compiler/xla/tests/dot_operation_test.cc b/tensorflow/compiler/xla/tests/dot_operation_test.cc index 25e82842b05..ff2fd7e2297 100644 --- a/tensorflow/compiler/xla/tests/dot_operation_test.cc +++ b/tensorflow/compiler/xla/tests/dot_operation_test.cc @@ -1409,6 +1409,54 @@ ENTRY MatrixVectorComplex { EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{4e-3, 4e-3})); } +// Regression test for b/138155357, where we were incorrectly creating a dot-add +// fusion where the dot had a batch dimension. This isn't supported on the CPU +// backend. 
+XLA_TEST_F(DotOperationTextTest, FusedBatchDotRegressionTest) { + absl::string_view module_string = R"( +HloModule jaxpr_computation__5.33 + +jaxpr_computation__6.8 { + tuple.9 = () tuple() + parameter.14 = () parameter(4) + parameter.13 = (f32[2]{0}) parameter(3) + get-tuple-element.15 = f32[2]{0} get-tuple-element(parameter.13), index=0 + reshape.16 = f32[1,2]{1,0} reshape(get-tuple-element.15) + parameter.10 = f32[2,2]{1,0} parameter(0) + reshape.17 = f32[2,1]{1,0} reshape(get-tuple-element.15) + dot.18 = f32[2,1]{1,0} dot(parameter.10, reshape.17), lhs_contracting_dims={1}, rhs_contracting_dims={0} + reshape.19 = f32[2]{0} reshape(dot.18) + reshape.20 = f32[2,1]{1,0} reshape(reshape.19) + dot.21 = f32[1,1]{1,0} dot(reshape.16, reshape.20), lhs_contracting_dims={1}, rhs_contracting_dims={0} + reshape.22 = f32[] reshape(dot.21) + parameter.11 = f32[2,1,2]{2,1,0} parameter(1) + broadcast.23 = f32[2,2,1]{2,1,0} broadcast(reshape.20), dimensions={1,2} + dot.24 = f32[2,1,1]{2,1,0} dot(parameter.11, broadcast.23), lhs_batch_dims={0}, lhs_contracting_dims={2}, rhs_batch_dims={0}, rhs_contracting_dims={1} + broadcast.25 = f32[2,1,2]{2,1,0} broadcast(reshape.16), dimensions={1,2} + parameter.12 = f32[2,2,1]{2,1,0} parameter(2) + dot.26 = f32[2,1,1]{2,1,0} dot(broadcast.25, parameter.12), lhs_batch_dims={0}, lhs_contracting_dims={2}, rhs_batch_dims={0}, rhs_contracting_dims={1} + add.27 = f32[2,1,1]{2,1,0} add(dot.24, dot.26) + reshape.28 = f32[2]{0} reshape(add.27) + ROOT tuple.29 = (f32[], f32[2]{0}) tuple(reshape.22, reshape.28) +} + +ENTRY jaxpr_computation__5.33 { + constant.2 = f32[] constant(1) + broadcast.3 = f32[2,2]{1,0} broadcast(constant.2), dimensions={} + constant.5 = f32[2,1,2]{2,1,0} constant({ { { 1, 0 } }, { { 0, 1 } } }) + constant.4 = f32[2,2,1]{2,1,0} constant({ { {1}, {1} }, { {1}, {1} } }) + parameter.6 = f32[2]{0} parameter(0) + tuple.7 = (f32[2]{0}) tuple(parameter.6) + tuple.1 = () tuple() + call.30 = (f32[], f32[2]{0}) call(broadcast.3, constant.5, constant.4, tuple.7, tuple.1), to_apply=jaxpr_computation__6.8 + get-tuple-element.31 = f32[] get-tuple-element(call.30), index=0 + ROOT get-tuple-element.32 = f32[2]{0} get-tuple-element(call.30), index=1 +})"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(module_string)); + EXPECT_TRUE(RunAndCompare(std::move(module), /*error=*/absl::nullopt)); +} + XLA_TEST_F(DotOperationTest, ReorderContractingDimsConstLHS_RL) { Array3D input_arr(2, 3, 2); Array2D const_arr(2, 6); diff --git a/tensorflow/compiler/xla/tests/exhaustive_binary_test.cc b/tensorflow/compiler/xla/tests/exhaustive_binary_test.cc new file mode 100644 index 00000000000..c0f8a0dc626 --- /dev/null +++ b/tensorflow/compiler/xla/tests/exhaustive_binary_test.cc @@ -0,0 +1,392 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/tests/exhaustive_op_test_utils.h" + +#ifdef __FAST_MATH__ +#error("Can't be compiled with fast math on"); +#endif + +namespace xla { +namespace { + +template +using ExhaustiveBinaryTest = ExhaustiveOpTestBase; + +// Exhaustive test for binary operations for 16 bit floating point types, +// including float16 and bfloat. +// +// Test parameter is a pair of (begin, end) for range under test. +template < + PrimitiveType T, + typename std::enable_if< + std::is_same::type, + half>::value || + std::is_same::type, + bfloat16>::value>::type* = nullptr> +class Exhaustive16BitBinaryTest + : public ExhaustiveBinaryTest, + public ::testing::WithParamInterface> { + public: + int64 GetInputSize() override { + int64 begin, end; + std::tie(begin, end) = GetParam(); + return end - begin; + } + + // Given a range of uint64 representation, uses bits 0..15 and bits 16..31 for + // the values of src0 and src1 for a 16 bit binary operation being tested, + // and generates the cartesian product of the two sets as the two inputs for + // the test. + void FillInput(std::array* input_literals) override { + int64 input_size = GetInputSize(); + CHECK_EQ(input_size, (*input_literals)[0].element_count()); + CHECK_EQ(input_size, (*input_literals)[1].element_count()); + + int64 begin, end; + std::tie(begin, end) = GetParam(); + VLOG(2) << "Checking range [" << begin << ", " << end << "]"; + + absl::Span input_arr_0 = (*input_literals)[0].data(); + absl::Span input_arr_1 = (*input_literals)[1].data(); + for (int64 i = 0; i < input_size; i++) { + uint32 input_val = i + begin; + // Convert the lower 16 bits to the NativeT and replaced known incorrect + // input values with 0. + input_arr_0[i] = ConvertAndReplaceKnownIncorrectValueWith(input_val, 0); + input_arr_1[i] = + ConvertAndReplaceKnownIncorrectValueWith(input_val >> 16, 0); + } + } + + protected: + using typename ExhaustiveBinaryTest::NativeT; + using ExhaustiveBinaryTest::ConvertAndReplaceKnownIncorrectValueWith; +}; + +using ExhaustiveF16BinaryTest = Exhaustive16BitBinaryTest; +using ExhaustiveBF16BinaryTest = Exhaustive16BitBinaryTest; + +// Returns a wrapper of the given build method, which build an HLO operation +// with an empty broadcast dimension. +inline std::function AddEmptyBroadcastDimension( + std::function)> build_method) { + return [&](XlaOp src0, XlaOp src1) -> XlaOp { + return build_method(src0, src1, {}); + }; +} + +#define XLA_TEST_16BIT(test_name, ...) \ + XLA_TEST_P(ExhaustiveF16BinaryTest, test_name) \ + __VA_ARGS__ \ + XLA_TEST_P(ExhaustiveBF16BinaryTest, test_name) \ + __VA_ARGS__ + +XLA_TEST_16BIT(Add, { + auto host_add = [](float x, float y) { return x + y; }; + Run(AddEmptyBroadcastDimension(Add), host_add); +}) + +XLA_TEST_16BIT(Sub, { + auto host_sub = [](float x, float y) { return x - y; }; + Run(AddEmptyBroadcastDimension(Sub), host_sub); +}) + +// TODO(bixia): Mul fails with bfloat16 on CPU. +XLA_TEST_16BIT(DISABLED_ON_CPU(Mul), { + auto host_mul = [](float x, float y) { return x * y; }; + Run(AddEmptyBroadcastDimension(Mul), host_mul); +}) + +// TODO(bixia): Div fails with bfloat16 on CPU. +XLA_TEST_16BIT(DISABLED_ON_CPU(Div), { + auto host_div = [](float x, float y) { return x / y; }; + Run(AddEmptyBroadcastDimension(Div), host_div); +}) + +template ::value || + std::is_same::value>::type* = nullptr> +T ReferenceMax(T x, T y) { + // We need to propagate NAN here becasue std::max may not propagate NAN. 
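  // (Concretely: every ordered comparison against a NaN is false, so an
  // implementation of std::max as `(x < y) ? y : x` returns x when y is NaN,
  // silently dropping the NaN. The explicit fpclassify checks below keep the
  // NaN instead.)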
+ if (std::fpclassify(x) == FP_NAN) { + return x; + } + if (std::fpclassify(y) == FP_NAN) { + return y; + } + + return std::max(x, y); +} + +template ::value || + std::is_same::value>::type* = nullptr> +T ReferenceMin(T x, T y) { + // We need to propagate NAN here becasue std::max may not propagate NAN. + if (std::fpclassify(x) == FP_NAN) { + return x; + } + if (std::fpclassify(y) == FP_NAN) { + return y; + } + + return std::min(x, y); +} + +XLA_TEST_16BIT(Max, + { Run(AddEmptyBroadcastDimension(Max), ReferenceMax); }) + +XLA_TEST_16BIT(Min, + { Run(AddEmptyBroadcastDimension(Min), ReferenceMin); }) + +// TODO(bixia): Pow fails with bfloat16 on CPU. +XLA_TEST_16BIT(DISABLED_ON_CPU(Pow), + { Run(AddEmptyBroadcastDimension(Pow), std::powf); }) + +// TODO(bixia): Atan2 fails with bfloat16 on CPU. +XLA_TEST_16BIT(DISABLED_ON_CPU(Atan2), + { Run(AddEmptyBroadcastDimension(Atan2), std::atan2f); }) + +#if defined(BINARY_TEST_TARGET_F16) +#if !defined(XLA_BACKEND_DOES_NOT_SUPPORT_FLOAT16) +INSTANTIATE_TEST_SUITE_P(F16, ExhaustiveF16BinaryTest, + ::testing::ValuesIn(CreateExhaustiveF32Ranges())); +#endif +#endif + +#if defined(BINARY_TEST_TARGET_BF16) +#if defined(XLA_BACKEND_SUPPORTS_BFLOAT16) +INSTANTIATE_TEST_SUITE_P(BF16, ExhaustiveBF16BinaryTest, + ::testing::ValuesIn(CreateExhaustiveF32Ranges())); +#endif +#endif + +// Exhaustive test for binary operations for float and double. +// +// Test parameter is a tuple of (FpValues, FpValues) describing the possible +// values for each operand. The inputs for the test are the Cartesian product +// of the possible values for the two operands. +template +class Exhaustive32BitOrMoreBinaryTest + : public ExhaustiveBinaryTest, + public ::testing::WithParamInterface> { + protected: + using typename ExhaustiveBinaryTest::NativeT; + using ExhaustiveBinaryTest::ConvertAndReplaceKnownIncorrectValueWith; + + private: + int64 GetInputSize() override { + FpValues values_0; + FpValues values_1; + std::tie(values_0, values_1) = GetParam(); + return values_0.GetTotalNumValues() * values_1.GetTotalNumValues(); + } + + void FillInput(std::array* input_literals) override { + int64 input_size = GetInputSize(); + FpValues values_0; + FpValues values_1; + std::tie(values_0, values_1) = GetParam(); + + VLOG(2) << " testing " << values_0.ToString() << " " << values_1.ToString() + << "total values " << input_size; + CHECK(input_size == (*input_literals)[0].element_count() && + input_size == (*input_literals)[1].element_count()); + + absl::Span input_arr_0 = (*input_literals)[0].data(); + absl::Span input_arr_1 = (*input_literals)[1].data(); + + uint64 i = 0; + for (auto src0 : values_0) { + for (auto src1 : values_1) { + input_arr_0[i] = ConvertAndReplaceKnownIncorrectValueWith(src0, 1); + input_arr_1[i] = ConvertAndReplaceKnownIncorrectValueWith(src1, 1); + ++i; + } + } + CHECK_EQ(i, input_size); + } +}; + +using ExhaustiveF32BinaryTest = Exhaustive32BitOrMoreBinaryTest; +using ExhaustiveF64BinaryTest = Exhaustive32BitOrMoreBinaryTest; + +XLA_TEST_P(ExhaustiveF32BinaryTest, Add) { + auto host_add = [](float x, float y) { return x + y; }; + Run(AddEmptyBroadcastDimension(Add), host_add); +} + +XLA_TEST_P(ExhaustiveF32BinaryTest, Sub) { + auto host_sub = [](float x, float y) { return x - y; }; + Run(AddEmptyBroadcastDimension(Sub), host_sub); +} + +// TODO(bixia): Need to investigate the failure on CPU and file bugs. 
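// A minimal standalone sketch (plain C++; FpValues and the literal plumbing
// above are not used here) of the Cartesian-product fill that
// Exhaustive32BitOrMoreBinaryTest::FillInput performs: every value of the
// first operand set is paired with every value of the second, so the two
// flattened operand arrays each hold |set0| * |set1| elements.
#include <vector>

inline void CartesianFill(const std::vector<float>& set0,
                          const std::vector<float>& set1,
                          std::vector<float>* operand0,
                          std::vector<float>* operand1) {
  operand0->reserve(set0.size() * set1.size());
  operand1->reserve(set0.size() * set1.size());
  for (float lhs : set0) {
    for (float rhs : set1) {
      operand0->push_back(lhs);  // i-th pair's left input
      operand1->push_back(rhs);  // i-th pair's right input
    }
  }
}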
+XLA_TEST_P(ExhaustiveF32BinaryTest, DISABLED_ON_CPU(Mul)) { + auto host_mul = [](float x, float y) { return x * y; }; + Run(AddEmptyBroadcastDimension(Mul), host_mul); +} + +// TODO(bixia): Need to investigate the failure on CPU and file bugs. +XLA_TEST_P(ExhaustiveF32BinaryTest, DISABLED_ON_CPU(Div)) { + auto host_div = [](float x, float y) { return x / y; }; + Run(AddEmptyBroadcastDimension(Div), host_div); +} + +XLA_TEST_P(ExhaustiveF32BinaryTest, Max) { + Run(AddEmptyBroadcastDimension(Max), ReferenceMax); +} + +XLA_TEST_P(ExhaustiveF32BinaryTest, Min) { + Run(AddEmptyBroadcastDimension(Min), ReferenceMin); +} + +// It is more convenient to implement Abs(complex) as a binary op than a unary +// op, as the operations we currently support all have the same data type for +// the source operands and the results. +// TODO(bixia): May want to move this test to unary test if we will be able to +// implement Abs(complex) as unary conveniently. +// +// TODO(bixia): Need to investigate the failure on CPU and file bugs. +XLA_TEST_P(ExhaustiveF32BinaryTest, DISABLED_ON_CPU(AbsComplex)) { + auto host_abs_complex = [](float x, float y) { + return std::abs(std::complex(x, y)); + }; + auto device_abs_complex = [](XlaOp x, XlaOp y) { return Abs(Complex(x, y)); }; + + Run(device_abs_complex, host_abs_complex); +} + +#if defined(BINARY_TEST_TARGET_F32) + +INSTANTIATE_TEST_SUITE_P( + SpecialValues, ExhaustiveF32BinaryTest, + ::testing::Combine( + ::testing::ValuesIn(CreateFpValuesForBoundaryTest()), + ::testing::ValuesIn(CreateFpValuesForBoundaryTest()))); + +INSTANTIATE_TEST_SUITE_P( + SpecialAndNormalValues, ExhaustiveF32BinaryTest, + ::testing::Combine( + ::testing::ValuesIn(CreateFpValuesForBoundaryTest()), + ::testing::Values(GetNormals(2000)))); + +INSTANTIATE_TEST_SUITE_P( + NormalAndSpecialValues, ExhaustiveF32BinaryTest, + ::testing::Combine( + ::testing::Values(GetNormals(2000)), + ::testing::ValuesIn(CreateFpValuesForBoundaryTest()))); + +INSTANTIATE_TEST_SUITE_P( + NormalAndNormalValues, ExhaustiveF32BinaryTest, + ::testing::Combine(::testing::Values(GetNormals(2000)), + ::testing::Values(GetNormals(2000)))); + +// Tests a total of 40000 ^ 2 inputs, with 2000 ^ 2 inputs in each sub-test. +// Comparing with the unary tests, the binary tests use a smaller set of inputs +// for each sub-test to avoid timeout because the implementation of ExpectNear +// more than 2x slower for binary test. +INSTANTIATE_TEST_SUITE_P( + LargeAndSmallMagnituedNormalValues, ExhaustiveF32BinaryTest, + ::testing::Combine( + ::testing::ValuesIn(GetFpValuesForMagnitudeExtremeNormals(40000, + 2000)), + ::testing::ValuesIn( + GetFpValuesForMagnitudeExtremeNormals(40000, 2000)))); + +#endif + +XLA_TEST_P(ExhaustiveF64BinaryTest, Add) { + auto host_add = [](double x, double y) { return x + y; }; + Run(AddEmptyBroadcastDimension(Add), host_add); +} + +XLA_TEST_P(ExhaustiveF64BinaryTest, Sub) { + auto host_sub = [](double x, double y) { return x - y; }; + Run(AddEmptyBroadcastDimension(Sub), host_sub); +} + +// TODO(bixia): Need to investigate the failure on CPU and file bugs. +XLA_TEST_P(ExhaustiveF64BinaryTest, DISABLED_ON_CPU(Mul)) { + auto host_mul = [](double x, double y) { return x * y; }; + Run(AddEmptyBroadcastDimension(Mul), host_mul); +} + +// TODO(bixia): Need to investigate the failure on CPU and file bugs. 
+XLA_TEST_P(ExhaustiveF64BinaryTest, DISABLED_ON_CPU(Div)) { + auto host_div = [](double x, double y) { return x / y; }; + Run(AddEmptyBroadcastDimension(Div), host_div); +} + +XLA_TEST_P(ExhaustiveF64BinaryTest, Max) { + Run(AddEmptyBroadcastDimension(Max), ReferenceMax); +} + +XLA_TEST_P(ExhaustiveF64BinaryTest, Min) { + Run(AddEmptyBroadcastDimension(Min), ReferenceMin); +} + +// TODO(bixia): Need to investigate the failure on CPU and file bugs. +XLA_TEST_P(ExhaustiveF64BinaryTest, DISABLED_ON_CPU(AbsComplex)) { + auto host_abs_complex = [](double x, double y) { + return std::abs(std::complex(x, y)); + }; + auto device_abs_complex = [](XlaOp x, XlaOp y) { return Abs(Complex(x, y)); }; + + Run(device_abs_complex, host_abs_complex); +} + +#if defined(BINARY_TEST_TARGET_F64) + +#if !defined(XLA_BACKEND_DOES_NOT_SUPPORT_FLOAT64) +INSTANTIATE_TEST_SUITE_P( + SpecialValues, ExhaustiveF64BinaryTest, + ::testing::Combine( + ::testing::ValuesIn(CreateFpValuesForBoundaryTest()), + ::testing::ValuesIn(CreateFpValuesForBoundaryTest()))); + +INSTANTIATE_TEST_SUITE_P( + SpecialAndNormalValues, ExhaustiveF64BinaryTest, + ::testing::Combine( + ::testing::ValuesIn(CreateFpValuesForBoundaryTest()), + ::testing::Values(GetNormals(1000)))); + +INSTANTIATE_TEST_SUITE_P( + NormalAndSpecialValues, ExhaustiveF64BinaryTest, + ::testing::Combine( + ::testing::Values(GetNormals(1000)), + ::testing::ValuesIn(CreateFpValuesForBoundaryTest()))); + +INSTANTIATE_TEST_SUITE_P( + NormalAndNormalValues, ExhaustiveF64BinaryTest, + ::testing::Combine(::testing::Values(GetNormals(1000)), + ::testing::Values(GetNormals(1000)))); + +// Tests a total of 40000 ^ 2 inputs, with 1000 ^ 2 inputs in each sub-test. +// Similar to ExhaustiveF64BinaryTest, we use a smaller set of inputs for each +// for each sub-test comparing with the unary test to avoid timeout. +INSTANTIATE_TEST_SUITE_P( + LargeAndSmallMagnituedNormalValues, ExhaustiveF64BinaryTest, + ::testing::Combine( + ::testing::ValuesIn( + GetFpValuesForMagnitudeExtremeNormals(40000, 2000)), + ::testing::ValuesIn( + GetFpValuesForMagnitudeExtremeNormals(40000, 2000)))); +#endif + +#endif +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/tests/exhaustive_op_test_utils.cc b/tensorflow/compiler/xla/tests/exhaustive_op_test_utils.cc index 465da47faeb..1d3248fe04c 100644 --- a/tensorflow/compiler/xla/tests/exhaustive_op_test_utils.cc +++ b/tensorflow/compiler/xla/tests/exhaustive_op_test_utils.cc @@ -17,8 +17,8 @@ limitations under the License. namespace xla { -// For f32, f16, and bf16, we need 9, 5, and 4 decimal places of precision to be -// guaranteed that we're printing the full number. +// For f64, f32, f16, and bf16, we need 17, 9, 5, and 4 decimal places of +// precision to be guaranteed that we're printing the full number. // // (The general formula is, given a floating-point number with S significand // bits, the number of decimal digits needed to print it to full precision is @@ -26,71 +26,237 @@ namespace xla { // ceil(1 + S * log_10(2)) ~= ceil(1 + S * 0.30103). // // See https://people.eecs.berkeley.edu/~wkahan/Math128/BinDecBin.pdf.) 
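// A quick standalone check of the formula above (a sketch; the significand
// widths used here include the implicit leading bit: 53 for f64, 24 for f32,
// 11 for f16, 8 for bf16):
//
//   f64:  ceil(1 + 53 * 0.30103) = ceil(16.95) = 17
//   f32:  ceil(1 + 24 * 0.30103) = ceil( 8.22) =  9
//   f16:  ceil(1 + 11 * 0.30103) = ceil( 4.31) =  5
//   bf16: ceil(1 +  8 * 0.30103) = ceil( 3.41) =  4
#include <cmath>

inline int DecimalDigitsForSignificandBits(int significand_bits) {
  return static_cast<int>(std::ceil(1 + significand_bits * std::log10(2.0)));
}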
-/*static*/ -string ExhaustiveOpTestBase::StringifyNum(float x) { - return absl::StrFormat("%0.9g (0x%08x)", x, BitCast(x)); -} +namespace { +template +struct ComponentStringifyFormat {}; + +template <> +struct ComponentStringifyFormat { + static constexpr absl::string_view value = "%0.17g (0x%16x)"; +}; + +template <> +struct ComponentStringifyFormat { + static constexpr absl::string_view value = "%0.8g (0x%08x)"; +}; + +template <> +struct ComponentStringifyFormat { + static constexpr absl::string_view value = "%0.5g (0x%04x)"; +}; + +template <> +struct ComponentStringifyFormat { + static constexpr absl::string_view value = "%0.4g (0x%04x)"; +}; +} // namespace /*static*/ -string ExhaustiveOpTestBase::StringifyNum(half x) { - return absl::StrFormat("%0.5g (0x%04x)", static_cast(x), - BitCast(x)); +template +string ExhaustiveOpTestBase::StringifyNum( + typename ExhaustiveOpTestBase::ComponentNativeT x) { + typedef typename ExhaustiveOpTestBase::ComponentNativeT ComponentType; + typedef typename ExhaustiveOpTestBase::ComponentIntegralNativeT + IntegralType; + return absl::StrFormat(ComponentStringifyFormat::value, + static_cast(x), BitCast(x)); } -/*static*/ -string ExhaustiveOpTestBase::StringifyNum(bfloat16 x) { - return absl::StrFormat("%0.4g (0x%04x)", static_cast(x), - BitCast(x)); -} - -/*static*/ -std::vector> -ExhaustiveOpTestBase::CreateExhaustiveF32Ranges() { - // We break up the 2^32-element space into small'ish chunks to keep peak - // memory usage low. - std::vector> result; - const int64 step = 1 << 25; - for (int64 i = 0; i < (1l << 32); i += step) { - result.push_back({i, i + step}); +template +void ExhaustiveOpTestBase::ExpectNear(const InputLiterals& input_literals, + const Literal& result_literal, + EvaluateOp evaluate_op, + ErrorSpecGen error_spec_gen) { + // Cache for when all components are subnormal testing values. + std::vector pure_subnormal_cache; + pure_subnormal_cache.reserve(GetMaxCacheSize()); + for (int i = 0; i < GetMaxCacheSize(); ++i) { + pure_subnormal_cache.push_back( + CallOperation(evaluate_op, FromCacheLocation(i))); } - return result; + + NativeInputsList inputs_arr; + for (int i = 0; i < N; ++i) { + const Literal& literal = input_literals[i]; + inputs_arr[i] = literal.data(); + } + + absl::Span result_arr = result_literal.data(); + + int64 mismatches = 0; + + for (int64 i = 0; i < result_arr.size(); ++i) { + NativeInputs inputs; + NativeRefInputs inputs_ref_ty; + + for (int j = 0; j < N; ++j) { + inputs[j] = inputs_arr[j][i]; + inputs_ref_ty[j] = static_cast(inputs[j]); + } + + NativeT actual = result_arr[i]; + NativeT expected = + static_cast(CallOperation(evaluate_op, inputs_ref_ty)); + ErrorSpec error_spec = CallErrorSpec(error_spec_gen, inputs); + + if (IsClose(static_cast(expected), + static_cast(actual), error_spec)) { + continue; + } + + std::vector subnormal_test_inputs = + GetTestValuesWithSubnormalSubstitutions(inputs_ref_ty); + + // Easy case: If `input` is not subnormal and !IsClose(expected, actual, + // error_spec), print an error. + if (subnormal_test_inputs.size() == 1) { + PrintMismatch(&mismatches, [&] { + return absl::StrFormat("Mismatch on %s. Expected %s, but got %s.", + StringifyNum(inputs), StringifyNum(expected), + StringifyNum(actual)); + }); + continue; + } + + // Otherwise, we need to test the additional subnormal test values. 
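    // (For example, if a component of the input is denormal, the loop below
    // also evaluates the reference function with sign-preserving zero and
    // with the sign-preserving min normal float substituted for that
    // component, and the actual output is accepted if it is close to any of
    // those results.)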
+ std::vector subnormal_test_results; + subnormal_test_results.reserve(subnormal_test_inputs.size()); + bool passed_subnormal_test = false; + + for (NativeRefInputs test_value : subnormal_test_inputs) { + NativeRefT result; + int cache_loc = GetCacheLocation(test_value); + if (cache_loc == kInvalidCacheIndex) { + result = CallOperation(evaluate_op, test_value); + } else { + result = pure_subnormal_cache[cache_loc]; + } + + if (IsClose(result, static_cast(actual), error_spec)) { + passed_subnormal_test = true; + break; + } + subnormal_test_results.push_back(std::move(result)); + } + + if (passed_subnormal_test) { + continue; + } + + std::string mismatch = absl::StrFormat( + "Mismatch on subnormal value %s. Expected one of:\n" + " %10s (evaluated at full-precision value)\n", + StringifyNum(inputs), StringifyNum(expected)); + + CHECK_EQ(subnormal_test_inputs.size(), subnormal_test_results.size()); + for (int i = 0; i < subnormal_test_inputs.size(); ++i) { + absl::StrAppend( + &mismatch, + absl::StrFormat(" %10s (evaluated at %s)\n", + StringifyNum(subnormal_test_results[i]), + GetSubnormalDescription(subnormal_test_inputs[i], + inputs_ref_ty))); + } + absl::StrAppend(&mismatch, + absl::StrFormat("but got %s", StringifyNum(actual))); + + PrintMismatch(&mismatches, [mismatch] { return mismatch; }); + } + EXPECT_EQ(mismatches, 0); } namespace { -ExhaustiveOpTestBase::ErrorSpec DefaultF64SpecGenerator(float) { - return ExhaustiveOpTestBase::ErrorSpec{0.0001, 0.0001}; +template +inline typename ExhaustiveOpTestBase::ErrorSpec DefaultSpecGenerator( + typename ExhaustiveOpTestBase::NativeT) { + LOG(FATAL) << "Unhandled Type"; } -ExhaustiveOpTestBase::ErrorSpec DefaultF32SpecGenerator(float) { - return ExhaustiveOpTestBase::ErrorSpec{0.0001, 0.0001}; +template +inline typename ExhaustiveOpTestBase::ErrorSpec DefaultSpecGenerator( + typename ExhaustiveOpTestBase::NativeT, + typename ExhaustiveOpTestBase::NativeT) { + LOG(FATAL) << "Unhandled Type"; } -ExhaustiveOpTestBase::ErrorSpec DefaultF16SpecGenerator(float) { - return ExhaustiveOpTestBase::ErrorSpec{0.001, 0.001}; +template <> +inline ExhaustiveOpTestBase::ErrorSpec DefaultSpecGenerator( + complex128) { + return ExhaustiveOpTestBase::ErrorSpec{0.0001, 0.0001}; } -ExhaustiveOpTestBase::ErrorSpec DefaultBF16SpecGenerator(float) { - return ExhaustiveOpTestBase::ErrorSpec{0.002, 0.02}; +template <> +inline ExhaustiveOpTestBase::ErrorSpec DefaultSpecGenerator( + complex64) { + return ExhaustiveOpTestBase::ErrorSpec{0.0001, 0.0001}; +} + +template <> +inline ExhaustiveOpTestBase::ErrorSpec DefaultSpecGenerator( + double) { + return ExhaustiveOpTestBase::ErrorSpec{0.0001, 0.0001}; +} + +template <> +inline ExhaustiveOpTestBase::ErrorSpec DefaultSpecGenerator( + float) { + return ExhaustiveOpTestBase::ErrorSpec{0.0001, 0.0001}; +} + +template <> +inline ExhaustiveOpTestBase::ErrorSpec DefaultSpecGenerator( + Eigen::half) { + return ExhaustiveOpTestBase::ErrorSpec{0.001, 0.001}; +} + +template <> +inline ExhaustiveOpTestBase::ErrorSpec DefaultSpecGenerator( + bfloat16) { + return ExhaustiveOpTestBase::ErrorSpec{0.002, 0.02}; +} + +template <> +inline ExhaustiveOpTestBase::ErrorSpec DefaultSpecGenerator( + double, double) { + return ExhaustiveOpTestBase::ErrorSpec{0.001, 0.001}; +} + +template <> +inline ExhaustiveOpTestBase::ErrorSpec DefaultSpecGenerator( + float, float) { + return ExhaustiveOpTestBase::ErrorSpec{0.001, 0.001}; +} + +template <> +inline ExhaustiveOpTestBase::ErrorSpec DefaultSpecGenerator( + Eigen::half, Eigen::half) { + return 
ExhaustiveOpTestBase::ErrorSpec{0.001, 0.001}; +} + +template <> +inline ExhaustiveOpTestBase::ErrorSpec DefaultSpecGenerator( + bfloat16, bfloat16) { + return ExhaustiveOpTestBase::ErrorSpec{0.002, 0.02}; } } // namespace /*static*/ -std::function -ExhaustiveOpTestBase::GetDefaultSpecGenerator(PrimitiveType ty) { - switch (ty) { - case C128: - case F64: - return DefaultF64SpecGenerator; - case C64: - case F32: - return DefaultF32SpecGenerator; - case F16: - return DefaultF16SpecGenerator; - case BF16: - return DefaultBF16SpecGenerator; - default: - LOG(FATAL) << "Unhandled Type"; - } +template +typename ExhaustiveOpTestBase::ErrorSpecGen +ExhaustiveOpTestBase::GetDefaultSpecGenerator() { + return DefaultSpecGenerator; } +template class ExhaustiveOpTestBase; +template class ExhaustiveOpTestBase; +template class ExhaustiveOpTestBase; +template class ExhaustiveOpTestBase; +template class ExhaustiveOpTestBase; +template class ExhaustiveOpTestBase; + +template class ExhaustiveOpTestBase; +template class ExhaustiveOpTestBase; +template class ExhaustiveOpTestBase; +template class ExhaustiveOpTestBase; + } // namespace xla diff --git a/tensorflow/compiler/xla/tests/exhaustive_op_test_utils.h b/tensorflow/compiler/xla/tests/exhaustive_op_test_utils.h index 3df4de295e3..3d77b44b53a 100644 --- a/tensorflow/compiler/xla/tests/exhaustive_op_test_utils.h +++ b/tensorflow/compiler/xla/tests/exhaustive_op_test_utils.h @@ -28,8 +28,10 @@ limitations under the License. #include "tensorflow/compiler/xla/tests/test_macros.h" namespace xla { -using Eigen::half; +// T: The primitive type being tested. +// N: The number of operands that the function being tested takes. +template class ExhaustiveOpTestBase : public ClientLibraryTestBase { public: struct ErrorSpec { @@ -41,11 +43,186 @@ class ExhaustiveOpTestBase : public ClientLibraryTestBase { // spec; this only covers the case when both `expected` and `actual` are // equal to 0. bool strict_signed_zeros = false; + + ErrorSpec(float a, float r) : abs_err(a), rel_err(r) {} }; - // `ty` is the primitive type being tested. - explicit ExhaustiveOpTestBase(PrimitiveType ty) - : ty_(ty), platform_(client_->platform()->Name()) {} + // Definitions depending on the primitive type T. + + static constexpr bool kIsComplex = (T == C128 || T == C64); + + // The primitive type used to compute the reference output. + struct RefT { + static constexpr PrimitiveType value = (T == F16 || T == BF16) ? F32 : T; + }; + + // The primitive type of the component of T. If T is not complex, then + // ComponentT = T. + struct ComponentT { + static constexpr PrimitiveType value = + !kIsComplex ? T + : T == C128 ? F64 : T == C64 ? F32 : PRIMITIVE_TYPE_INVALID; + }; + + // Same as ComponentT, but for the RefT primitive type. + struct ComponentRefT { + static constexpr PrimitiveType value = + !kIsComplex ? RefT::value + : RefT::value == C128 + ? F64 + : RefT::value == C64 ? F32 : PRIMITIVE_TYPE_INVALID; + }; + + // The primitive type of an unsigned integer that can be bitcasted to and from + // ComponentT. + struct ComponentIntegralT { + static constexpr PrimitiveType value = + (T == C128 || T == F64) + ? U64 + : (T == C64 || T == F32) + ? U32 + : (T == F16 || T == BF16) ? U16 : PRIMITIVE_TYPE_INVALID; + }; + + // Native types that correspond to the primtive types above. 
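  // (For instance, with T = C64 the traits above give kIsComplex = true,
  // RefT = C64, ComponentT = F32 and ComponentIntegralT = U32, so below
  // NativeT = complex64, ComponentNativeT = float and
  // ComponentIntegralNativeT = uint32. With T = F16 they give RefT = F32,
  // so NativeT = Eigen::half but NativeRefT = float, and
  // ComponentIntegralNativeT = uint16.)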
+ using NativeT = typename primitive_util::PrimitiveTypeToNative::type; + using NativeRefT = + typename primitive_util::PrimitiveTypeToNative::type; + using ComponentNativeT = + typename primitive_util::PrimitiveTypeToNative::type; + using ComponentNativeRefT = typename primitive_util::PrimitiveTypeToNative< + ComponentRefT::value>::type; + using ComponentIntegralNativeT = + typename primitive_util::PrimitiveTypeToNative< + ComponentIntegralT::value>::type; + + using InputLiterals = std::array; + + private: + // N spans corresponding to the list of literal data values. + using NativeInputsList = std::array, N>; + + // N data items representing a single input to an XLA function. + using NativeInputs = std::array; + + // N data items representing a single input to an interpreter backend + // function. + using NativeRefInputs = std::array; + + // N data items representing a single input to an XLA function. + using XlaInputs = std::array; + + // Representations of the reference function passed in by the user. + template + struct EvaluateOpWrapper {}; + template <> + struct EvaluateOpWrapper<1> { + using type = NativeRefT (*)(NativeRefT); + }; + template <> + struct EvaluateOpWrapper<2> { + using type = NativeRefT (*)(NativeRefT, NativeRefT); + }; + + // Representations of the reference function passed in by the user. + template + struct EnqueueOpWrapper {}; + template <> + struct EnqueueOpWrapper<1> { + using type = std::function; + static XlaOp BuildFromInputs(XlaInputs inputs, type ty) { + return ty(inputs[0]); + } + }; + template <> + struct EnqueueOpWrapper<2> { + using type = std::function; + static XlaOp BuildFromInputs(XlaInputs inputs, type ty) { + return ty(inputs[0], inputs[1]); + } + }; + + // Representations of the ErrorSpecGen function passed in by the user. + template + struct ErrorSpecGenWrapper {}; + template <> + struct ErrorSpecGenWrapper<1> { + using type = ErrorSpec (*)(NativeT); + }; + template <> + struct ErrorSpecGenWrapper<2> { + using type = ErrorSpec (*)(NativeT, NativeT); + }; + + public: + using ErrorSpecGen = typename ErrorSpecGenWrapper::type; + using EvaluateOp = typename EvaluateOpWrapper::type; + using EnqueueOp = typename EnqueueOpWrapper::type; + + explicit ExhaustiveOpTestBase() + : ty_(T), platform_(client_->platform()->Name()) { + SetFastMathDisabled(true); + + // Run all HLO passes. In particular, constant folding is disabled by + // default for tests, but we need to run it in order to tickle some bugs. + mutable_debug_options()->clear_xla_disable_hlo_passes(); + } + + void Run(EnqueueOp enqueue_op, EvaluateOp evaluate_op) { + Run(enqueue_op, evaluate_op, GetDefaultSpecGenerator()); + } + + // A helper for implementing the Run method for exhaustive op tests. It + // constructs the HLO module, compiles and runs the module and checks the + // result. + // + // We use a function pointer for evaluate_op for performance because it is + // called each time an output element is compared inside a loop in routine + // ExpectNear. 
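  // (Usage sketch, assuming a concrete ExhaustiveOpTestBase<F32, 2> fixture:
  //
  //   float HostAdd(float x, float y) { return x + y; }   // plain function
  //   ...
  //   Run(AddEmptyBroadcastDimension(Add), HostAdd);
  //
  // A captureless lambda such as [](float x, float y) { return x + y; } also
  // converts to the EvaluateOp function pointer, which is what the binary
  // tests above rely on.)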
+ void Run(EnqueueOp enqueue_op, EvaluateOp evaluate_op, + ErrorSpecGen error_spec_gen) { + InputLiterals input_literals = CreateInputLiterals(); + FillInput(&input_literals); + + XlaBuilder builder(TestName()); + XlaInputs xla_inputs; + for (int i = 0; i < N; ++i) { + xla_inputs[i] = + Parameter(&builder, i, input_literals[i].shape(), "input"); + } + EnqueueOpWrapper::BuildFromInputs(xla_inputs, enqueue_op); + + TF_ASSERT_OK_AND_ASSIGN(XlaComputation comp, builder.Build()); + TF_ASSERT_OK_AND_ASSIGN(Literal result_literal, + RunComputationHelper(comp, input_literals)); + ExpectNear(input_literals, result_literal, evaluate_op, error_spec_gen); + } + + StatusOr RunComputationHelper(const XlaComputation& comp, + const Literal& literal) { + return RunComputation(comp, {&literal}); + } + + StatusOr RunComputationHelper( + const XlaComputation& comp, const std::array& literals) { + std::array lit_ptrs; + for (int i = 0; i < N; ++i) { + lit_ptrs[i] = &literals[i]; + } + return RunComputation(comp, lit_ptrs); + } + + // We essentially reimplement LiteralTestUtil::Near here because + // a) this streamlined implementation is much faster, and + // b) we can print out better error messages (namely, we can print out + // which floating-point value input failed, while LiteralTestUtil::Near + // can only print out the input index that failed). + // c) we need special handling of certain inputs. For example, we say that + // a denormal input has multiple correct outputs (namely, f(x) and f(0)) + // and just needs to be close to one of them. + void ExpectNear(const InputLiterals& input_literals, + const Literal& result_literal, EvaluateOp evaluate_op, + ErrorSpecGen error_spec_gen); // Builds and runs the computation using the LocalClient API, rather than the // plain Client API, which is used by ClientLibraryTestBase. This is because @@ -94,30 +271,395 @@ class ExhaustiveOpTestBase : public ClientLibraryTestBase { return std::move(result_literal); } + const string& Platform() { return platform_; } + // Returns the number of elements in each input literal. virtual int64 GetInputSize() = 0; - Literal CreateInputLiteral() { - return LiteralUtil::CreateFromDimensions(ty_, {GetInputSize()}); + // Fills the literals with values to test for. + virtual void FillInput(InputLiterals* literals) = 0; + + // Replace infinites with max value to help compute errors. + static ComponentNativeRefT ReplaceInfWithMax(ComponentNativeRefT value) { + if (std::isinf(value)) { + return std::copysign(std::numeric_limits::max(), + value); + } + return value; } - // `T` is the type of the value being compared, which is float if ty_ is of 32 - // bits or less, and double otherwise. - template - bool IsClose(T expected, T actual, ErrorSpec spec) { - static_assert( - std::is_same::value || std::is_same::value, - "Only supports float and double."); - T abs_err = std::abs(expected - actual); - T rel_err = abs_err / std::abs(expected); - if (spec.strict_signed_zeros && actual == T{0} && expected == T{0}) { - // Check sign of zero. - return std::signbit(actual) == std::signbit(expected); + // Returns true if both components are 0, but their sign bits differ. + static bool CheckSignedZeroError(ComponentNativeRefT expected, + ComponentNativeRefT actual) { + return expected == 0 && actual == 0 && + std::signbit(expected) != std::signbit(actual); + } + + // Sets the components to 0 if both are NaNs. 
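  // (e.g. expected = NaN, actual = NaN becomes expected = 0, actual = 0, so a
  // matching pair of NaNs compares as equal in IsClose below.)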
+ static void RemoveCorrespondingNaNs(ComponentNativeRefT* expected, + ComponentNativeRefT* actual) { + if (std::isnan(*expected) && std::isnan(*actual)) { + *expected = 0; + *actual = 0; } - return abs_err <= spec.abs_err || rel_err <= spec.rel_err || - (std::isnan(expected) && std::isnan(actual)) || - (std::isinf(expected) && std::isinf(actual) && - (expected > 0) == (actual > 0)); + } + + // The Implementation of the functions above, except for complex inputs. + + static std::complex ReplaceInfWithMax( + std::complex value) { + value.real(ReplaceInfWithMax(value.real())); + value.imag(ReplaceInfWithMax(value.imag())); + return value; + } + + static bool CheckSignedZeroError(std::complex expected, + std::complex actual) { + return CheckSignedZeroError(expected.real(), actual.real()) || + CheckSignedZeroError(expected.imag(), actual.imag()); + } + + static void RemoveCorrespondingNaNs( + std::complex* expected, + std::complex* actual) { + ComponentNativeRefT expected_real = expected->real(); + ComponentNativeRefT expected_imag = expected->imag(); + ComponentNativeRefT actual_real = actual->real(); + ComponentNativeRefT actual_imag = actual->imag(); + RemoveCorrespondingNaNs(&expected_real, &actual_real); + RemoveCorrespondingNaNs(&expected_imag, &actual_imag); + expected->real(expected_real); + expected->imag(expected_imag); + actual->real(actual_real); + actual->imag(actual_imag); + } + + // Returns a list of inputs that should be tested for closeness given some + // original input values. + // + // For denormal component inputs, we accept answers that are close to any of: + // + // - evaluate_op(input) + // - evaluate_op(+/-0), where the sign of 0 equal to the sign of + // `input`, + // - evaluate_op(+/-min_normal_float), where the sign of + // min_normal_float matches `input`. + // - if relaxed_denormal_signs_, evaluate_op(-/+0), where the sign of + // 0 is the opposite of `input`. + // + // (In particular, the XLA:CPU implementation of log flushes positive + // denormals to min-normal-float. This seems kind of reasonable if our + // goal is to avoid infinities because they cause nans?) + std::vector GetTestValuesWithSubnormalSubstitutions( + ComponentNativeRefT value) { + std::vector test_values; + if (std::fpclassify(value) == FP_SUBNORMAL) { + test_values.reserve(relaxed_denormal_signs_ ? 3 : 2); + test_values.push_back(std::copysign(0, value)); + test_values.push_back(std::copysign( + std::numeric_limits::min(), value)); + if (relaxed_denormal_signs_) { + test_values.push_back(std::copysign(0, -value)); + } + } else { + test_values.push_back(value); + } + return test_values; + } + + // Similar to complex numbers, we only need to test the components that are + // subnormal. We can find the subnormal testing values for each component, + // then take the Cartesian product of each set of component values. + std::vector> + GetTestValuesWithSubnormalSubstitutions( + std::complex value) { + using complex = std::complex; + + auto real_values = GetTestValuesWithSubnormalSubstitutions(value.real()); + auto imag_values = GetTestValuesWithSubnormalSubstitutions(value.imag()); + + std::vector test_values; + test_values.reserve(real_values.size() * imag_values.size()); + for (auto real : real_values) { + for (auto imag : imag_values) { + test_values.push_back(complex(real, imag)); + } + } + + return test_values; + } + + // The test values for an XLA function with N operands are the Cartesian + // product of the test values for each of the N operands. 
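  // (For example, with N = 2, a positive denormal first operand and a normal
  // second operand: the first operand expands to {+0, +min_normal, -0} when
  // relaxed_denormal_signs_ is set (or {+0, +min_normal} otherwise), the
  // second stays as itself, so 3 (or 2) candidate input pairs are tried.)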
+ std::vector> + GetTestValuesWithSubnormalSubstitutions( + const std::array& value) { + std::vector> test_values; + + std::array, N> component_test_values; + int total = 1; + for (int i = 0; i < N; ++i) { + component_test_values[i] = + GetTestValuesWithSubnormalSubstitutions(value[i]); + if (!component_test_values.empty()) { + total *= component_test_values[i].size(); + } + } + + // If total == 1, then value has no subnormal components, so we can just + // return a vector with value in it. + if (total == 1) { + test_values.push_back(value); + return test_values; + } + + test_values.reserve(total); + + // Perform a Cartesian product of the vectors in component_test_values. + // We can calculate this by uniquely mapping each integer from 0 to + // (total - 1) to a list of component indices. The function that maps an + // integer z to the index of component j is: + // component_index(j) = (i / NumValues(0, j-1)) % NumValues(j, j) + // and NumIndices(x, y) is the number of values in the Cartesian product of + // component_test_values[x], component_test_values[x+1], ... + // component_test_values[y]. + for (int i = 0; i < total; ++i) { + int accumulated_num_values = 1; + std::array test_value; + for (int j = 0; j < N; ++j) { + int num_indices = component_test_values[j].size(); + int component_index = (i / accumulated_num_values) % num_indices; + test_value[j] = component_test_values[j][component_index]; + accumulated_num_values *= num_indices; + } + test_values.push_back(std::move(test_value)); + } + return test_values; + } + + // The number of values that can be substituted for subnormal inputs. + static constexpr int kNumSubnormalSubstitutionValues = 4; + + // Encodings used to determine where subnormal test values are cached. + static constexpr int kPositiveMin = 0; + static constexpr int kNegativeMin = 1; + static constexpr int kPositiveZero = 2; + static constexpr int kNegativeZero = 3; + static constexpr int kNonSubnormal = -1; + static constexpr int kInvalidCacheIndex = -1; + + // Since we take the cross product of all possible test values, and each + // component has kNumSubnormalSubstitutionValues possible test values, then + // the total number of different cache locations are + // kNumSubnormalSubstitutionValues raised to the num_components. + // num_components = N for the reals, and 2*N for the complex. + static constexpr int GetMaxCacheSize() { + return pow(kNumSubnormalSubstitutionValues, N * (kIsComplex ? 2 : 1)); + } + + // When we are testing a value such that all of its components are subnormal, + // we also need to test inputs made up of the Cartesian product of values + // replaced for each subnormal component. These additional test inputs are + // common enough where it will be efficient to just cache the results of these + // Cartesian products. In order to cache these values, we need a one to one + // mapping between these Cartesian products and cache locations. + // + // Our mapping works by assigning each component an integer in + // [0, kNumSubnormalSubstitutionValues) based on its test value. By lining + // these integers up with the n'th component corresponding to the n'th digit, + // then for each Cartesian product element we essentially create a unique base + // kNumSubnormalSubstitutionValues number. This number represents our cache + // index. + // + // In the event that there a component is not a subnormal, the value should + // not be cached, so we return a kNonSubnormal value. 
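  // (Worked example, for a real-valued op with N = 2: inputs (+0, -min_normal)
  // map to component codes kPositiveZero = 2 and kNegativeMin = 1, and the
  // overloads below fold them as 2 * 4 + 1 = 9, i.e. the base-4 number "21".
  // For a single complex component, real = -0 and imag = +min_normal give
  // 3 * 4 + 0 = 12.)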
+ + static int GetCacheLocation(ComponentNativeRefT value) { + bool positive = !std::signbit(value); + if (std::abs(value) == std::numeric_limits::min()) { + if (positive) { + return kPositiveMin; + } else { + return kNegativeMin; + } + } else if (value != 0) { + CHECK(std::fpclassify(value) != FP_SUBNORMAL); + return kNonSubnormal; + } else if (positive) { + return kPositiveZero; + } else { + return kNegativeZero; + } + } + + static int GetCacheLocation(std::complex value) { + int real_loc = GetCacheLocation(value.real()); + int imag_loc = GetCacheLocation(value.imag()); + if (real_loc == kNonSubnormal || imag_loc == kNonSubnormal) { + return kNonSubnormal; + } else { + return real_loc * kNumSubnormalSubstitutionValues + imag_loc; + } + } + + static int GetCacheLocation(const NativeRefInputs& input) { + int location = 0; + int cache_size_per_element = + (kIsComplex + ? kNumSubnormalSubstitutionValues * kNumSubnormalSubstitutionValues + : kNumSubnormalSubstitutionValues); + for (int i = 0; i < N; ++i) { + int comp_loc = GetCacheLocation(input[i]); + if (i == kNonSubnormal) { + return kNonSubnormal; + } + location *= cache_size_per_element; + location += comp_loc; + } + return location; + } + + // The inverse function of GetCacheLocation. + + template + static RetT FromCacheLocationComponent(int cache_loc) { + LOG(FATAL) << "Not implemented."; + } + + template <> + static ComponentNativeRefT + FromCacheLocationComponent(int cache_loc) { + switch (cache_loc) { + case kPositiveMin: + return std::numeric_limits::min(); + case kNegativeMin: + return -std::numeric_limits::min(); + case kPositiveZero: + return static_cast(0.0); + case kNegativeZero: + return static_cast(-0.0); + default: + LOG(FATAL) << "Invalid cache_loc value of " << cache_loc; + } + } + + template <> + static std::complex + FromCacheLocationComponent>( + int cache_loc) { + CHECK_LT(cache_loc, + kNumSubnormalSubstitutionValues * kNumSubnormalSubstitutionValues); + CHECK_GE(cache_loc, 0); + + std::complex value; + value.real(FromCacheLocationComponent( + cache_loc / kNumSubnormalSubstitutionValues)); + value.imag(FromCacheLocationComponent( + cache_loc % kNumSubnormalSubstitutionValues)); + return std::move(value); + } + + static NativeRefInputs FromCacheLocation(int cache_loc) { + NativeRefInputs input; + int cache_size_per_element = + (kIsComplex + ? kNumSubnormalSubstitutionValues * kNumSubnormalSubstitutionValues + : kNumSubnormalSubstitutionValues); + for (int i = N - 1; i >= 0; --i) { + input[i] = FromCacheLocationComponent( + cache_loc % cache_size_per_element); + cache_loc /= cache_size_per_element; + } + + return input; + } + + // Returns a string that describes the test value for the actual value. + std::string GetSubnormalDescription(ComponentNativeRefT test_val, + ComponentNativeRefT actual_val) { + const string sp_min_normal = "sign-preserving min-normal-float"; + const string sp_zero = "sign-preserving zero"; + const string nsp_zero = "non-sign-preserving zero"; + + switch (GetCacheLocation(test_val)) { + case kNegativeMin: + case kPositiveMin: + return sp_min_normal; + case kNegativeZero: + case kPositiveZero: + return (std::signbit(test_val) == std::signbit(actual_val)) ? 
sp_zero + : nsp_zero; + default: + return ""; + } + } + + std::string GetSubnormalDescription( + std::complex test_val, + std::complex actual_val) { + std::string real = + GetSubnormalDescription(test_val.real(), actual_val.real()); + std::string imag = + GetSubnormalDescription(test_val.imag(), actual_val.imag()); + + if (real.empty()) { + if (imag.empty()) { + return ""; + } + real = "real"; + } else if (imag.empty()) { + imag = "imag"; + } + + return absl::StrCat("(", real, ", ", imag, ")"); + } + + std::string GetSubnormalDescription(std::array test_vals, + std::array actual_vals) { + if (N == 1) { + return GetSubnormalDescription(test_vals[0], actual_vals[0]); + } + + std::array str_vals; + for (int i = 0; i < N; ++i) { + str_vals[i] = GetSubnormalDescription(test_vals[i], actual_vals[i]); + if (str_vals[i].empty()) { + str_vals[i] = "original"; + } + } + + return absl::StrCat("(", absl::StrJoin(str_vals, ", "), ")"); + } + + InputLiterals CreateInputLiterals() { + InputLiterals literals; + for (int i = 0; i < N; ++i) { + literals[i] = LiteralUtil::CreateFromDimensions(T, {GetInputSize()}); + } + return std::move(literals); + } + + // Determines if two output values are sufficiently close to each other based + // on an error spec. + bool IsClose(NativeRefT expected, NativeRefT actual, ErrorSpec spec) { + // When two corresponding values are a NaN, they can be considered to have + // the same value, so the values are just set to 0. + RemoveCorrespondingNaNs(&expected, &actual); + + if (spec.strict_signed_zeros) { + if (CheckSignedZeroError(expected, actual)) { + return false; + } + } + + // Replace Inf with Max when calculating absolute or relative errors. This + // allows the test to pass when another value are close to Inf and the + // specified absolute or relative errors are not zero. + double abs_err = + std::abs(ReplaceInfWithMax(expected) - ReplaceInfWithMax(actual)); + double rel_err = abs_err / std::abs(ReplaceInfWithMax(expected)); + + return abs_err <= spec.abs_err || rel_err <= spec.rel_err; } template @@ -140,24 +682,6 @@ class ExhaustiveOpTestBase : public ClientLibraryTestBase { } } - template - struct IntegralTypeWithByteWidth {}; - - template <> - struct IntegralTypeWithByteWidth<2> { - using type = uint16; - }; - - template <> - struct IntegralTypeWithByteWidth<4> { - using type = uint32; - }; - - template <> - struct IntegralTypeWithByteWidth<8> { - using type = uint64; - }; - // Converts part or all bits in an uint64 to the value of the floating point // data type being tested. // @@ -166,47 +690,57 @@ class ExhaustiveOpTestBase : public ClientLibraryTestBase { // bit patterns for T. This bit pattern is zero extended and stored as uint64. // This function is used to convert such a bit pattern stored as uint64 to // the input value for T. - // - // T is the type of the floating value represented by the `bits`. 
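The IsClose helper above treats matching NaNs as equal, optionally rejects mismatched signed zeros, and clamps infinities to max-float before computing absolute and relative error, accepting the result if either error is within the spec. A minimal self-contained sketch of that comparison policy (the Tolerance struct and the tolerances in main are assumptions, not the XLA ErrorSpec):

// Sketch of a closeness check in the spirit of IsClose above: matching NaNs
// compare equal, +/-0 can be distinguished, and Inf is clamped to max-float
// before computing absolute/relative error. Names and tolerances are
// illustrative only.
#include <cmath>
#include <cstdio>
#include <limits>

struct Tolerance {
  double abs_err;
  double rel_err;
  bool strict_signed_zeros;
};

double ClampInfToMax(double x) {
  if (std::isinf(x)) {
    return std::copysign(std::numeric_limits<double>::max(), x);
  }
  return x;
}

bool IsCloseSketch(double expected, double actual, Tolerance tol) {
  // Two NaNs are considered equal.
  if (std::isnan(expected) && std::isnan(actual)) return true;
  // Optionally require the sign of zero to match exactly.
  if (tol.strict_signed_zeros && expected == 0 && actual == 0 &&
      std::signbit(expected) != std::signbit(actual)) {
    return false;
  }
  double e = ClampInfToMax(expected);
  double a = ClampInfToMax(actual);
  double abs_err = std::fabs(e - a);
  double rel_err = abs_err / std::fabs(e);
  return abs_err <= tol.abs_err || rel_err <= tol.rel_err;
}

int main() {
  Tolerance tol{1e-6, 1e-6, /*strict_signed_zeros=*/true};
  std::printf("%d\n", IsCloseSketch(1.0, 1.0 + 1e-9, tol));  // 1: within tolerance
  std::printf("%d\n", IsCloseSketch(0.0, -0.0, tol));        // 0: signed-zero mismatch
  std::printf("%d\n", IsCloseSketch(INFINITY, 1e308, tol));  // 0: still far apart after clamping
}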
- template - T ConvertValue(uint64 bits) { - using I = typename IntegralTypeWithByteWidth::type; + static ComponentNativeT ConvertValue(uint64 bits) { + using I = ComponentIntegralNativeT; I used_bits = static_cast(bits); - return BitCast(used_bits); + return BitCast(used_bits); } - template - T ConvertAndReplaceKnownIncorrectValueWith(uint64 bits, - int replacement_value = 0) { + ComponentNativeT ConvertAndReplaceKnownIncorrectValueWith( + uint64 bits, int replacement_value = 0) { if (known_incorrect_fn_ && known_incorrect_fn_(bits)) { - return static_cast(replacement_value); + return static_cast(replacement_value); } - return ConvertValue(bits); + return ConvertValue(bits); } - static string StringifyNum(float x); + static string StringifyNum(ComponentNativeT x); - static string StringifyNum(half x); - - static string StringifyNum(bfloat16 x); - - template - static string StringifyNum(std::complex x) { - return absl::StrCat(StringifyNum(x.real()), " ", StringifyNum(x.imag())); + static string StringifyNum(std::complex x) { + return absl::StrCat("(", StringifyNum(x.real()), ", ", + StringifyNum(x.imag()), ")"); } - template - static void AppendStringifyNum(std::string* s, T x) { + // We also stringify the NativeRefT, so we need to generate an additional + // version of this function when NativeRefT != NativeT. + template < + typename T1 = NativeRefT, + class = typename std::enable_if::value>::type> + static string StringifyNum(NativeRefT x) { + return ExhaustiveOpTestBase::StringifyNum(x); + } + + static string StringifyNum(const NativeInputs& inputs) { + if (N == 1) { + return StringifyNum(inputs[0]); + } + + std::array str_vals; + for (int i = 0; i < N; ++i) { + str_vals[i] = StringifyNum(inputs[i]); + } + + return absl::StrCat("(", absl::StrJoin(str_vals, ", "), ")"); + } + + static void AppendStringifyNum(std::string* s, NativeT x) { absl::StrAppend(s, StringifyNum(x)); } - static std::function GetDefaultSpecGenerator( - PrimitiveType ty); - - static std::vector> CreateExhaustiveF32Ranges(); + static ErrorSpecGen GetDefaultSpecGenerator(); protected: - // The primitive type under test. + // The primitive type being tested. const PrimitiveType ty_; // The platform under test. @@ -225,7 +759,448 @@ class ExhaustiveOpTestBase : public ClientLibraryTestBase { // // XLA:GPU preserves denormal signs, but other backends don't. bool relaxed_denormal_signs_ = platform_ != "CUDA"; + + private: + using EvaluateOpInternal = NativeRefT (*)(NativeRefInputs); + using ErrorSpecGenInternal = ErrorSpec (*)(NativeInputs); + + template + ErrorSpec CallErrorSpec(FuncPtr* func, const std::array& in) { + return func(in[0]); + } + + template + ErrorSpec CallErrorSpec(FuncPtr* func, const std::array& in) { + return func(in[0], in[1]); + } + + template + Type CallOperation(FuncPtr* func, const std::array& in) { + return func(in[0]); + } + + template + Type CallOperation(FuncPtr* func, const std::array& in) { + return func(in[0], in[1]); + } }; +// Represents a set of 64 bit chunks by representing the starting bit chunk, +// the last bit chunk, and the spacing between two adjacent bit chunks, without +// actually storing all the bit chunks being generated. The bit chunk iterator +// is provided to retrieve all the bit chunks. +// +// This data structure is used to generate the bit representation to test +// operations that requires more than 64 bit input data. In this case, +// truly exhaustive testing is not possible and we want to test a value every +// n values, where n == spacing_. 
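ConvertValue above truncates a 64-bit counter to the integral type with the same width as the tested floating-point type and then bit-casts it. A sketch of the same idea for float, using memcpy as a portable stand-in for BitCast (the sample bit patterns are just illustrations):

// Sketch: interpret the low 32 bits of a 64-bit counter as an IEEE-754 float,
// which is how an exhaustive test walks every f32 bit pattern.
#include <cstdint>
#include <cstdio>
#include <cstring>

float FloatFromBits(std::uint64_t bits) {
  std::uint32_t low = static_cast<std::uint32_t>(bits);  // truncate to 32 bits
  float f;
  std::memcpy(&f, &low, sizeof(f));  // portable bit cast
  return f;
}

int main() {
  std::printf("%g\n", FloatFromBits(0x3f800000));  // 1.0f
  std::printf("%g\n", FloatFromBits(0x00000001));  // smallest positive denormal
  std::printf("%g\n", FloatFromBits(0x7f800000));  // +inf
}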
+// +// Currently, the iterator of BitChunks adds the `spacing_` to a bit chunk to +// compute the next bit chunk. We can change this to use values generated +// by a random number generator that can achieve the average spacing +// statistically, if we will find this is necessary. +class BitChunks { + public: + class iterator + : public std::iterator { + public: + iterator() {} + + explicit iterator(const BitChunks* bit_chunks) + : bit_chunks_(bit_chunks), next_bit_chunk_(bit_chunks->start_) {} + + iterator& operator++() { + Next(); + return *this; + } + + iterator operator++(int) { + iterator retval = *this; + Next(); + return retval; + } + + bool operator==(iterator other) const { + return bit_chunks_ == other.bit_chunks_ && + next_bit_chunk_ == other.next_bit_chunk_; + } + + bool operator!=(iterator other) const { return !(*this == other); } + + iterator MoveToEnd() { + MoveNextBitChunkToOnePassEnd(); + return *this; + } + + reference operator*() const { + CHECK(*this != this->bit_chunks_->end()); + return next_bit_chunk_; + } + + const BitChunks* GetBitChunks() const { return bit_chunks_; } + + void Reset() { next_bit_chunk_ = bit_chunks_->start_; } + + void Next() { + CHECK(*this != this->bit_chunks_->end()); + if (next_bit_chunk_ == bit_chunks_->end_) { + MoveNextBitChunkToOnePassEnd(); + } else { + next_bit_chunk_ += bit_chunks_->spacing_; + if (next_bit_chunk_ > bit_chunks_->end_) { + next_bit_chunk_ = bit_chunks_->end_; + } + } + } + + std::string ToString() const { + return absl::StrFormat("0x%08x", next_bit_chunk_); + } + + private: + // Move next_bit_chunk_ to 1 pass the bit_chunks_->end, to mark that the + // iterator has reached the end. When spacing_ is not one, or if we will + // change to use a random value instead of spacing_ in function Next(), + // normalizing the representation of the iterator ending this way can + // can simplify the checking for iterator ending. + void MoveNextBitChunkToOnePassEnd() { + next_bit_chunk_ = bit_chunks_->end_ + 1; + } + + const BitChunks* bit_chunks_; + uint64 next_bit_chunk_; + }; + + iterator begin() const { return iterator(this); } + iterator end() const { + iterator end(this); + return end.MoveToEnd(); + } + + explicit BitChunks(uint64 start = 0, uint64 end = 0, uint64 spacing = 1) + : start_(start), end_(end), spacing_(spacing) { + CHECK_GE(end_, start_); + CHECK_NE(spacing, 0) << ToString(); + } + + int64 GetTotalBitChunks() const { + if (start_ == end_) { + return 1; + } + + return 1 + (end_ - start_ + spacing_ - 1) / spacing_; + } + + std::string ToString() const { + return absl::StrFormat("(0x%08x, 0x%08x, 0x%08x)", start_, end_, spacing_); + } + + uint64 start_; + uint64 end_; + uint64 spacing_; +}; + +inline string StringifyNum(BitChunks c) { return c.ToString(); } + +inline string StringifyNum(BitChunks::iterator c) { return c.ToString(); } + +template +void AppendStringifyNum(std::string* s, T x) { + absl::StrAppend(s, StringifyNum(x)); +} + +// Represents a set of floating point values through the possible values for +// the three components: mantissa, exponent, and sign. Also implements an +// iterator for retrieving all the represented floating point values. 
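BitChunks represents the progression start, start + spacing, ... with the final step clamped to end, and GetTotalBitChunks counts it with a ceiling division. A small standalone sketch with the same semantics (StridedRange is a hypothetical name):

// Sketch of a strided, clamped range in the spirit of BitChunks: values are
// start, start + spacing, ... and the last emitted value is always `end`.
#include <cstdint>
#include <cstdio>
#include <vector>

struct StridedRange {
  std::uint64_t start, end, spacing;

  std::uint64_t Count() const {
    if (start == end) return 1;
    return 1 + (end - start + spacing - 1) / spacing;  // ceiling division
  }

  std::vector<std::uint64_t> Materialize() const {
    std::vector<std::uint64_t> out;
    std::uint64_t v = start;
    while (true) {
      out.push_back(v);
      if (v == end) break;
      v += spacing;
      if (v > end) v = end;  // clamp the final step, as the BitChunks iterator does
    }
    return out;
  }
};

int main() {
  StridedRange r{0, 10, 4};  // yields 0, 4, 8, 10
  std::printf("count formula: %llu\n", (unsigned long long)r.Count());
  for (std::uint64_t v : r.Materialize()) {
    std::printf("%llu ", (unsigned long long)v);
  }
  std::printf("\n");
}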
+class FpValues { + public: + static constexpr uint kTotalBitChunks = 3; + + class iterator + : public std::iterator { + public: + explicit iterator(const FpValues* fp_values) : fp_values_(fp_values) { + for (int i = 0; i < FpValues::kTotalBitChunks; ++i) { + iters_[i] = BitChunks::iterator(&fp_values->GetBitChunks(i)); + } + } + + iterator& operator++() { + Next(); + return *this; + } + + iterator operator++(int) { + iterator retval = *this; + Next(); + return retval; + } + + bool operator==(iterator other) const { + for (int i = 0; i < FpValues::kTotalBitChunks; ++i) { + if (iters_[i] != other.GetBitChunksIter(i)) { + return false; + } + } + return true; + } + + bool operator!=(iterator other) const { return !(*this == other); } + + iterator MoveToEnd() { + for (int i = 0; i < FpValues::kTotalBitChunks; ++i) { + iters_[i].MoveToEnd(); + } + return *this; + } + + uint64 operator*() const { + uint64 value = 0; + for (int i = 0; i < FpValues::kTotalBitChunks; ++i) { + value = value | (*iters_[i]) << fp_values_->offsets_[i]; + } + return value; + } + + const BitChunks::iterator& GetBitChunksIter(int i) { return iters_[i]; } + + std::string ToString() const { + return absl::StrJoin(iters_, ",", + AppendStringifyNum); + } + + private: + // Moves the iterator for the ith BitChunks to the next value, and + // returns true if the new state is not the end of the iterator. + bool Next(int i = 0) { + iters_[i].Next(); + if (iters_[i] == iters_[i].GetBitChunks()->end()) { + if (i == FpValues::kTotalBitChunks - 1) { + return false; + } + if (Next(i + 1)) { + iters_[i].Reset(); + return true; + } + return false; + } + return true; + } + + std::array iters_; + const FpValues* fp_values_; + }; + + FpValues() : bit_chunks_(), offsets_() {} + FpValues(absl::Span chunks, absl::Span offsets) { + CHECK_EQ(chunks.size(), offsets.size() - 1); + CHECK_EQ(chunks.size(), kTotalBitChunks); + std::copy_n(chunks.begin(), kTotalBitChunks, bit_chunks_.begin()); + std::copy_n(offsets.begin(), kTotalBitChunks, offsets_.begin()); + + // The last value in `offsets` is the total number of bits. + offsets_[kTotalBitChunks] = offsets[kTotalBitChunks]; + // Validate the input values. 
+ for (int i = 0; i < kTotalBitChunks; ++i) { + int total_bits = offsets[i + 1] - offsets[i]; + if (total_bits < 64) { + uint64 bound = 1ull << total_bits; + CHECK_LT(chunks[i].start_, bound); + CHECK_LT(chunks[i].end_, bound); + } else { + CHECK_EQ(total_bits, 64); + } + } + } + + iterator begin() const { return iterator(this); } + + iterator end() const { + iterator end(this); + return end.MoveToEnd(); + } + + int64 GetTotalNumValues() const { + int64 total = 1; + absl::c_for_each(bit_chunks_, [&](const BitChunks& chunks) { + total *= chunks.GetTotalBitChunks(); + }); + return total; + } + + const BitChunks& GetBitChunks(int i) const { return bit_chunks_[i]; } + + std::string ToString() const { + return absl::StrCat( + "[", absl::StrJoin(bit_chunks_, ",", AppendStringifyNum), + "]"); + } + + std::array bit_chunks_; + std::array offsets_; +}; + +template ::value || + std::is_same::value>::type* = nullptr> +int GetMantissaTotalBits() { + return std::numeric_limits::digits - 1; +} + +template +int GetFpTotalBits() { + return sizeof(T) * 8; +} + +template +int GetExponentTotalBits() { + return GetFpTotalBits() - GetMantissaTotalBits() - 1; +} + +template +uint64 GetAllOneMantissa() { + return (1ull << GetMantissaTotalBits()) - 1ull; +} + +template +uint64 GetAllOneExponent() { + return (1ull << GetExponentTotalBits()) - 1ull; +} + +template ::value || + std::is_same::value>::type* = nullptr> +FpValues GetFpValues(BitChunks mantissa, BitChunks exponent, BitChunks sign) { + int total_bits = GetFpTotalBits(); + return FpValues({mantissa, exponent, sign}, + {0, GetMantissaTotalBits(), total_bits - 1, total_bits}); +} + +template +FpValues GetZeros() { + return GetFpValues(BitChunks(0, 0, 1), BitChunks(0, 0, 1), + BitChunks(0, 1, 1)); +} + +template +FpValues GetSubnormals(int approx_num_values) { + int mantissa = GetMantissaTotalBits(); + uint64 mantissa_spacing = (1ull << mantissa) / (approx_num_values * 2); + return GetFpValues( + BitChunks(0x1, GetAllOneMantissa(), mantissa_spacing), + BitChunks(0, 0, 1), BitChunks(0, 1, 1)); +} + +template +FpValues GetInfinites() { + uint64 all_one_exp = GetAllOneExponent(); + return GetFpValues(BitChunks(0, 0, 1), + BitChunks(all_one_exp, all_one_exp, 1), + BitChunks(0, 1, 1)); +} + +template +FpValues GetNans(int approx_num_values) { + int mantissa = GetMantissaTotalBits(); + uint64 mantissa_spacing = (1ull << mantissa) / (approx_num_values * 2); + uint64 all_one_exp = GetAllOneExponent(); + return GetFpValues( + BitChunks(0x1, GetAllOneMantissa(), mantissa_spacing), + BitChunks(all_one_exp, all_one_exp, 1), BitChunks(0, 1, 1)); +} + +template +FpValues GetNormals(int approx_num_values) { + float component_total = std::sqrt(static_cast(approx_num_values)); + return GetFpValues( + BitChunks(0x1, GetAllOneMantissa(), + (1ull << (GetMantissaTotalBits() + 1)) / component_total), + BitChunks(0x1, GetAllOneExponent() - 1, + (1ull << (GetExponentTotalBits() + 1)) / component_total), + BitChunks(0, 1, 1)); +} + +// Returns a vector of FpValues, which together represent about +// `approx_num_values` floating point values of type `T`, with each FpValues +// represents about `num_values_per_group` floating point values. 
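GetFpValues above assembles a bit pattern from mantissa, exponent, and sign chunks placed at offsets {0, mantissa_bits, total_bits - 1}. For binary32 that layout is 23 mantissa bits, 8 exponent bits, and 1 sign bit; the sketch below packs those fields by hand and hits the same special categories that GetZeros, GetSubnormals, GetInfinites, and GetNans target (PackF32 is a hypothetical helper):

// Sketch: assemble an IEEE-754 binary32 value from (sign, exponent, mantissa)
// fields, mirroring how FpValues shifts each BitChunks value to its offset.
#include <cstdint>
#include <cstdio>
#include <cstring>

constexpr int kMantissaBits = 23;  // std::numeric_limits<float>::digits - 1
constexpr int kExponentBits = 8;

float PackF32(std::uint32_t sign, std::uint32_t exponent, std::uint32_t mantissa) {
  std::uint32_t bits = (sign << (kMantissaBits + kExponentBits)) |
                       (exponent << kMantissaBits) | mantissa;
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}

int main() {
  std::printf("%g\n", PackF32(0, 0, 0));     // +0: zero exponent, zero mantissa
  std::printf("%g\n", PackF32(1, 0, 1));     // negative subnormal
  std::printf("%g\n", PackF32(0, 0xff, 0));  // +inf: all-one exponent
  std::printf("%g\n", PackF32(0, 0xff, 1));  // NaN: all-one exponent, nonzero mantissa
  std::printf("%g\n", PackF32(0, 127, 0));   // 1.0: biased exponent 127
}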
+template +std::vector GetFpValuesWithExponents(uint64 first_exponent, + uint64 exponent_spacing, + uint64 num_exponents, + uint64 approx_num_values, + uint64 num_values_per_group) { + const uint64 num_signs = 2; + uint64 approx_num_mantissa = approx_num_values / (num_exponents * num_signs); + uint64 num_mantissa_per_group = + num_values_per_group / (num_exponents * num_signs); + CHECK_GT(approx_num_mantissa, 0); + CHECK_GT(num_mantissa_per_group, 0); + + CHECK_LT(first_exponent + num_exponents - 1ull, GetAllOneExponent()); + int mantissa = GetMantissaTotalBits(); + uint64 mantissa_spacing = (1ull << mantissa) / approx_num_mantissa; + + std::vector result; + for (uint64 group_start = 0; group_start < GetAllOneMantissa(); + group_start += mantissa_spacing * num_mantissa_per_group) { + uint64 group_end = + group_start + (num_mantissa_per_group - 1) * mantissa_spacing; + if (group_end > GetAllOneMantissa()) { + group_end = GetAllOneMantissa(); + } + result.push_back(GetFpValues( + BitChunks(group_start, group_end, mantissa_spacing), + BitChunks(first_exponent, first_exponent + num_exponents - 1, 1), + BitChunks(0, 1, 1))); + } + return result; +} + +// Returns a vector of FpValues together represent about `approx_num_values` +// "very large" floating point values and `approx_num_values` "very small" +// floating point values of type `T`, which each FpValues represent about +// `num_values_per_group` floating point values. Because we use FpValues as +// a parameter for parameterized testing, the number of floating values +// represented by each FpValues affects the input size for each sub-test and +// the hence the peak memory usage of the test. +template +std::vector GetFpValuesForMagnitudeExtremeNormals( + uint64 approx_num_values = 40000, uint64 num_values_per_group = 4000) { + std::vector large = + GetFpValuesWithExponents(GetAllOneExponent() - 5, 1, 5, + approx_num_values / 2, num_values_per_group); + std::vector small = GetFpValuesWithExponents( + 1, 1, 5, approx_num_values / 2, num_values_per_group); + large.insert(large.end(), small.begin(), small.end()); + return large; +} + +template +std::vector CreateFpValuesForBoundaryTest() { + return {GetZeros(), GetSubnormals(1000), GetInfinites(), + GetNans(1000)}; +} + +inline std::vector> CreateExhaustiveF32Ranges() { + // We break up the 2^32-element space into small'ish chunks to keep peak + // memory usage low. + std::vector> result; + const int64 step = 1 << 25; + for (int64 i = 0; i < (1l << 32); i += step) { + result.push_back({i, i + step}); + } + return result; +} + } // namespace xla #endif // TENSORFLOW_COMPILER_XLA_TESTS_EXHAUSTIVE_OP_TEST_UTILS_H_ diff --git a/tensorflow/compiler/xla/tests/exhaustive_unary_test.cc b/tensorflow/compiler/xla/tests/exhaustive_unary_test.cc index 0186d7d668d..3a14bb2d4cc 100644 --- a/tensorflow/compiler/xla/tests/exhaustive_unary_test.cc +++ b/tensorflow/compiler/xla/tests/exhaustive_unary_test.cc @@ -155,154 +155,8 @@ float HostDigamma(float x) { return result - reflection; } -class ExhaustiveRealUnaryTestBase : public ExhaustiveOpTestBase { - public: - explicit ExhaustiveRealUnaryTestBase(PrimitiveType ty) - : ExhaustiveOpTestBase(ty) {} - - // A helper for implementing the Run method for unary op test. It constructs - // the HLO module, compiles and runs the module and checks the result. - // - // T: is the input and output data type. - // RefT: is the type used for the host function to get the reference result. 
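CreateExhaustiveF32Ranges above splits the 2^32 possible bit patterns into chunks of 2^25 so each sub-test keeps peak memory low; each chunk is then expanded into float inputs, one bit pattern at a time. A sketch of expanding one such [begin, end) chunk (the tiny bounds in main are stand-ins for a real 2^25-wide range):

// Sketch: expand one [begin, end) chunk of 32-bit patterns into float inputs,
// the way an exhaustive F32 sub-test fills its input literal.
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

std::vector<float> FillChunk(std::uint64_t begin, std::uint64_t end) {
  std::vector<float> inputs;
  inputs.reserve(end - begin);
  for (std::uint64_t i = begin; i < end; ++i) {
    std::uint32_t bits = static_cast<std::uint32_t>(i);
    float f;
    std::memcpy(&f, &bits, sizeof(f));
    inputs.push_back(f);
  }
  return inputs;
}

int main() {
  // A tiny stand-in chunk; the real ranges use a step of 1 << 25.
  std::vector<float> chunk = FillChunk(0x3f800000, 0x3f800004);
  for (float f : chunk) std::printf("%.8g\n", f);  // 1.0 and the next few floats
}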
- // RefT is different from T when T is of less than 32 bits, that is half and - // bfloat16. - // - // We use a function pointer for evaluate_op for performance because it is - // called each time an output element is compared inside a loop in routine - // ExpectNear. - template - void RunImpl(std::function enqueue_op, - RefT (*evaluate_op)(RefT), const Literal& input_literal, - std::function error_spec_gen) { - XlaBuilder builder(TestName()); - XlaOp input = Parameter(&builder, 0, input_literal.shape(), "input"); - enqueue_op(input); - TF_ASSERT_OK_AND_ASSIGN(XlaComputation comp, builder.Build()); - TF_ASSERT_OK_AND_ASSIGN(Literal result_literal, - RunComputation(comp, {&input_literal})); - ExpectNear(input_literal, result_literal, evaluate_op, - error_spec_gen); - } - - // We essentially reimplement LiteralTestUtil::Near here because - // a) this streamlined implementation is much faster, and - // b) we can print out better error messages (namely, we can print out - // which floating-point value input failed, while LiteralTestUtil::Near - // can only print out the input index that failed). - // c) we need special handling of certain inputs. For example, we say that - // a denormal input has multiple correct outputs (namely, f(x) and f(0)) - // and just needs to be close to one of them. - template - void ExpectNear(const Literal& input_literal, const Literal& result_literal, - RefT (*evaluate_op)(RefT), - std::function error_spec_gen) { - absl::Span input_arr = input_literal.data(); - absl::Span result_arr = result_literal.data(); - ASSERT_EQ(result_arr.size(), input_arr.size()); - int64 mismatches = 0; - // Hoisting these out of the loop is a nice speedup on shards that have many - // denormals. - const T expected_at_pos_zero = static_cast(evaluate_op(0)); - const T expected_at_neg_zero = static_cast(evaluate_op(-0.0)); - const T expected_at_pos_min_normal_float = - static_cast(evaluate_op(std::numeric_limits::min())); - const T expected_at_neg_min_normal_float = - static_cast(evaluate_op(-std::numeric_limits::min())); - - for (int64 i = 0; i < input_arr.size(); ++i) { - T input = input_arr[i]; - RefT input_ref_ty = static_cast(input); - T actual = result_arr[i]; - T expected = static_cast(evaluate_op(input_ref_ty)); - - ErrorSpec error_spec = error_spec_gen(input_ref_ty); - - // We only implement fpclassify for float and double, so we call - // IsClose for half and bfloat16. - if (IsClose(static_cast(expected), static_cast(actual), - error_spec)) { - continue; - } - - // Easy case: If `input` is not denormal and !IsClose(expected, actual, - // error_spec), print an error. - if (std::fpclassify(input_ref_ty) != FP_SUBNORMAL) { - PrintMismatch(&mismatches, [&] { - return absl::StrFormat("Mismatch on %s. Expected %s, but got %s.", - StringifyNum(input), StringifyNum(expected), - StringifyNum(actual)); - }); - continue; - } - - // Otherwise, `input` is denormal. For denormal inputs, we accept answers - // that are close to any of: - // - // - evaluate_op(input) - // - evaluate_op(+/-0), where the sign of 0 equal to the sign of - // `input`, - // - evaluate_op(+/-min_normal_float), where the sign of - // min_normal_float matches `input`. - // - if relaxed_denormal_signs_, evaluate_op(-/+0), where the sign of - // 0 is the opposite of `input`. - // - // (In particular, the XLA:CPU implementation of log flushes positive - // denormals to min-normal-float. This seems kind of reasonable if our - // goal is to avoid infinities because they cause nans?) 
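In other words, for a denormal input x the checker accepts a result close to any of f(x), f at the same-signed zero, f at the same-signed min-normal float, or, when denormal signs are relaxed, f at the opposite-signed zero. A standalone sketch of that acceptance rule (the Close tolerance and the use of exp are illustrative only):

// Sketch: for a denormal input, accept the reference result evaluated at the
// original value, at the sign-preserving zero, at the sign-preserving
// min-normal float, and (optionally) at the opposite-signed zero. The
// tolerance and the choice of exp as the function are illustrative.
#include <cassert>
#include <cmath>
#include <limits>

bool Close(float a, float b, float tol = 1e-4f) {
  if (std::isnan(a) && std::isnan(b)) return true;
  if (std::isinf(a) && std::isinf(b)) return std::signbit(a) == std::signbit(b);
  return std::fabs(a - b) <=
         tol * std::fmax(1.0f, std::fmax(std::fabs(a), std::fabs(b)));
}

bool AcceptableForDenormal(float (*f)(float), float x, float actual,
                           bool relaxed_denormal_signs) {
  assert(std::fpclassify(x) == FP_SUBNORMAL);
  const float mn = std::numeric_limits<float>::min();  // min normal float
  const float same_zero = std::signbit(x) ? -0.0f : 0.0f;
  const float same_min = std::signbit(x) ? -mn : mn;
  const float other_zero = std::signbit(x) ? 0.0f : -0.0f;
  if (Close(f(x), actual) || Close(f(same_zero), actual) ||
      Close(f(same_min), actual)) {
    return true;
  }
  return relaxed_denormal_signs && Close(f(other_zero), actual);
}

int main() {
  float denormal = std::numeric_limits<float>::denorm_min();
  auto exp_fn = +[](float x) { return std::exp(x); };
  // A backend that flushes denormals to zero computes exp(0) == 1, which the
  // rule above accepts for the smallest positive denormal.
  assert(AcceptableForDenormal(exp_fn, denormal, 1.0f,
                               /*relaxed_denormal_signs=*/false));
}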
- T sign_preserving_ftz_expected = std::signbit(input_ref_ty) - ? expected_at_neg_zero - : expected_at_pos_zero; - T flush_to_normal_expected = std::signbit(input_ref_ty) - ? expected_at_neg_min_normal_float - : expected_at_pos_min_normal_float; - T sign_nonpreserving_ftz_expected = std::signbit(input_ref_ty) - ? expected_at_pos_zero - : expected_at_neg_zero; - if (IsClose(static_cast(sign_preserving_ftz_expected), - static_cast(actual), error_spec) || - IsClose(static_cast(flush_to_normal_expected), - static_cast(actual), error_spec) || - (relaxed_denormal_signs_ && - IsClose(static_cast(sign_nonpreserving_ftz_expected), - static_cast(actual), error_spec))) { - continue; - } - - if (relaxed_denormal_signs_) { - PrintMismatch(&mismatches, [&] { - return absl::StrFormat( - "Mismatch on denormal value %s. Expected one of:\n" - " %10s (evaluated at full-precision value)\n" - " %10s (evaluated at sign-preserving min-normal-float)\n" - " %10s (evaluated after flushing to sign-preserving zero)\n" - " %10s (evaluated after flushing to non-sign-preserving " - "zero)\n" - "but got %s.", - StringifyNum(input), // - StringifyNum(expected), StringifyNum(flush_to_normal_expected), - StringifyNum(sign_preserving_ftz_expected), - StringifyNum(sign_nonpreserving_ftz_expected), - StringifyNum(actual)); - }); - } else { - PrintMismatch(&mismatches, [&] { - return absl::StrFormat( - "Mismatch on denormal value %s. Expected one of:\n" - " %10s (evaluated at full-precision value)\n" - " %10s (evaluated at sign-preserving min-normal-float)\n" - " %10s (evaluated after flushing to sign-preserving zero)\n" - "but got %s.", - StringifyNum(input), // - StringifyNum(expected), StringifyNum(flush_to_normal_expected), - StringifyNum(sign_preserving_ftz_expected), // - StringifyNum(actual)); - }); - } - } - EXPECT_EQ(mismatches, 0); - } -}; +template +using ExhaustiveUnaryTest = ExhaustiveOpTestBase; // Exhaustive test for unary operations for <= 32bit floating point types. // @@ -310,53 +164,21 @@ class ExhaustiveRealUnaryTestBase : public ExhaustiveOpTestBase { // - primitive type under test, // - (begin, end) range under test, as zero-extended int64s bitcast to the // primtive type under test. +template class Exhaustive32BitOrLessUnaryTest - : public ExhaustiveRealUnaryTestBase, - public ::testing::WithParamInterface< - std::tuple>> { + : public ExhaustiveUnaryTest, + public ::testing::WithParamInterface> { public: - typedef float (*F32EvaluateOp)(float); - - Exhaustive32BitOrLessUnaryTest() - : ExhaustiveRealUnaryTestBase(std::get<0>(GetParam())) {} - - void Run(std::function enqueue_op, F32EvaluateOp evaluate_op) { - return Run(enqueue_op, evaluate_op, GetDefaultSpecGenerator(ty_)); - } - - void Run(std::function enqueue_op, F32EvaluateOp evaluate_op, - std::function error_spec_gen) { - SetFastMathDisabled(true); - - // Run all HLO passes. In particular, constant folding is disabled by - // default for tests, but we need to run it in order to tickle some bugs. 
- mutable_debug_options()->clear_xla_disable_hlo_passes(); - Literal input_literal = CreateInputLiteral(); - switch (ty_) { - case F32: - FillInput(&input_literal); - return RunImpl(enqueue_op, evaluate_op, input_literal, - error_spec_gen); - case F16: - FillInput(&input_literal); - return RunImpl(enqueue_op, evaluate_op, input_literal, - error_spec_gen); - case BF16: - FillInput(&input_literal); - return RunImpl(enqueue_op, evaluate_op, input_literal, - error_spec_gen); - default: - LOG(FATAL) << "Unhandled type."; - } - } - // Sets error parameters appropriately for testing sin/cos/tan. void SetParamsForSinCosTan(); + protected: + using typename ExhaustiveUnaryTest::NativeT; + private: int64 GetInputSize() override { int64 begin, end; - std::tie(begin, end) = std::get<1>(GetParam()); + std::tie(begin, end) = GetParam(); VLOG(2) << "Checking range [" << begin << ", " << end << ")"; return end - begin; } @@ -367,54 +189,64 @@ class Exhaustive32BitOrLessUnaryTest // pattern. Each bit representation is first truncated to the integral type of // the same bit as the type being tested, if needed, and then bitcasted to the // type being tested. - template - void FillInput(Literal* input_literal) { - using IntegralT = typename IntegralTypeWithByteWidth::type; - int64 input_size = input_literal->element_count(); + void FillInput(std::array* input_literal) override { + using IntegralT = + typename ExhaustiveOpTestBase::ComponentIntegralNativeT; + int64 input_size = (*input_literal)[0].element_count(); int64 begin, end; - std::tie(begin, end) = std::get<1>(GetParam()); + std::tie(begin, end) = GetParam(); VLOG(2) << "Checking range [" << begin << ", " << end << ")"; CHECK_EQ(input_size, end - begin); - absl::Span input_arr = input_literal->data(); + absl::Span input_arr = (*input_literal)[0].data(); for (int64 i = 0; i < input_size; i++) { IntegralT input_val = i + begin; - input_arr[i] = ConvertAndReplaceKnownIncorrectValueWith(input_val, 0); + input_arr[i] = + this->ConvertAndReplaceKnownIncorrectValueWith(input_val, 0); } } }; -XLA_TEST_P(Exhaustive32BitOrLessUnaryTest, Log) { - auto error_spec_gen = GetDefaultSpecGenerator(ty_); - if (platform_ != "Host" && platform_ != "CUDA" && ty_ == F32) { - error_spec_gen = [](float x) { return ErrorSpec{0.001, 0.001}; }; - } +typedef Exhaustive32BitOrLessUnaryTest ExhaustiveF32UnaryTest; +typedef Exhaustive32BitOrLessUnaryTest ExhaustiveF16UnaryTest; +typedef Exhaustive32BitOrLessUnaryTest ExhaustiveBF16UnaryTest; +#define XLA_TEST_FLOAT_32_BITS_OR_LESS(test_name, ...) 
\ + XLA_TEST_P(ExhaustiveF32UnaryTest, test_name) \ + __VA_ARGS__ \ + XLA_TEST_P(ExhaustiveF16UnaryTest, test_name) \ + __VA_ARGS__ \ + XLA_TEST_P(ExhaustiveBF16UnaryTest, test_name) \ + __VA_ARGS__ + +XLA_TEST_FLOAT_32_BITS_OR_LESS(Log, { + ErrorSpecGen error_spec_gen = GetDefaultSpecGenerator(); + if (platform_ != "Host" && platform_ != "CUDA" && ty_ == F32) { + error_spec_gen = +[](NativeT x) { return ErrorSpec{0.001, 0.001}; }; + } Run(Log, std::log, error_spec_gen); -} +}) -XLA_TEST_P(Exhaustive32BitOrLessUnaryTest, Log1p) { - auto error_spec_gen = GetDefaultSpecGenerator(ty_); +XLA_TEST_FLOAT_32_BITS_OR_LESS(Log1p, { + ErrorSpecGen error_spec_gen = GetDefaultSpecGenerator(); if (platform_ != "Host" && platform_ != "CUDA" && ty_ == F32) { - error_spec_gen = [](float x) { return ErrorSpec{0.001, 0.001}; }; + error_spec_gen = +[](NativeT x) { return ErrorSpec{0.001, 0.001}; }; } - Run(Log1p, std::log1p, error_spec_gen); -} +}) -XLA_TEST_P(Exhaustive32BitOrLessUnaryTest, Exp) { +XLA_TEST_FLOAT_32_BITS_OR_LESS(Exp, { // When x < -105, the true value of exp(x) is smaller than the smallest F32, // so exp(x) should return exactly 0. We want our implementation of exp to // return exactly 0 as well, as not doing so implies either that our // implementation of exp is not following the asymptotic behavior that exp(x) // approaches 0 as x approaches -inf, or that our implementation is not // approaching 0 fast enough. - auto default_spec_gen = GetDefaultSpecGenerator(ty_); - auto error_spec_gen = [default_spec_gen](float x) { - if (x < -105) { + ErrorSpecGen error_spec_gen = +[](NativeT x) { + if (x < static_cast(-105)) { return ErrorSpec{0, 0}; } - return default_spec_gen(x); + return GetDefaultSpecGenerator()(x); }; // Our CPU implementation of exp returns one incorrect value: says @@ -432,20 +264,13 @@ XLA_TEST_P(Exhaustive32BitOrLessUnaryTest, Exp) { } else { Run(Exp, std::exp, error_spec_gen); } -} +}) -XLA_TEST_P(Exhaustive32BitOrLessUnaryTest, Expm1) { - auto default_spec_gen = GetDefaultSpecGenerator(ty_); - auto error_spec_gen = [default_spec_gen](float x) { - if (x < -105) { - return ErrorSpec{0, 0}; - } else if (std::abs(x) < 5e-6) { - // For points around x=0, we should make sure that the result is accurate - // within 1 ULP of the value. - return ErrorSpec{0, 1.1921e-7}; - } - return default_spec_gen(x); - }; +XLA_TEST_FLOAT_32_BITS_OR_LESS(Expm1, { + ErrorSpecGen error_spec_gen = GetDefaultSpecGenerator(); + if (ty_ == F32) { + error_spec_gen = +[](NativeT x) { return ErrorSpec{0, 0.00015}; }; + } // Our CPU implementation of expm1 returns one incorrect value: says // exp(88.7228394) = max-float, but the correct answer is inf. We deem this @@ -462,65 +287,73 @@ XLA_TEST_P(Exhaustive32BitOrLessUnaryTest, Expm1) { } else { Run(Expm1, std::expm1, error_spec_gen); } -} +}) // It feels a little overkill to exhaustively test sqrt and pow(x, 0.5), but // this *did* find a bug, namely that some backends were assuming sqrt(x) == // pow(x, 0.5), but this is not true for x == -inf. -XLA_TEST_P(Exhaustive32BitOrLessUnaryTest, PowOneHalf) { - Run([](XlaOp x) { return Pow(x, ScalarLike(x, 0.5)); }, - +[](float x) { return std::pow(x, 0.5f); }); -} +XLA_TEST_FLOAT_32_BITS_OR_LESS(PowOneHalf, { + EvaluateOp fn = +[](float x) { return std::pow(x, 0.5f); }; + // TODO(b/123837116): Enable the test for all values after fixing the bug. 
+ if (platform_ != "Host" && platform_ != "CUDA") { + fn = +[](float x) { + if (x == -std::numeric_limits::infinity()) { + return std::nanf(""); + } + return std::pow(x, 0.5f); + }; + } + Run([](XlaOp x) { return Pow(x, ScalarLike(x, 0.5)); }, fn); +}) -XLA_TEST_P(Exhaustive32BitOrLessUnaryTest, Rsqrt) { +XLA_TEST_FLOAT_32_BITS_OR_LESS(Rsqrt, { Run( Rsqrt, +[](float x) { return 1 / std::sqrt(x); }); -} +}) -XLA_TEST_P(Exhaustive32BitOrLessUnaryTest, Sqrt) { - auto default_spec_gen = GetDefaultSpecGenerator(ty_); - std::function error_spec_gen; +XLA_TEST_FLOAT_32_BITS_OR_LESS(Sqrt, { + ErrorSpecGen error_spec_gen = GetDefaultSpecGenerator(); if (platform_ == "Host" || platform_ == "CUDA") { - error_spec_gen = [default_spec_gen](float x) { - ErrorSpec spec = default_spec_gen(x); + error_spec_gen = +[](NativeT x) { + auto spec = GetDefaultSpecGenerator()(x); spec.strict_signed_zeros = true; return spec; }; - } else { - error_spec_gen = default_spec_gen; } Run(Sqrt, std::sqrt, error_spec_gen); -} +}) // TODO(jlebar): Test trig functions over complex inputs. - -XLA_TEST_P(Exhaustive32BitOrLessUnaryTest, Acosh) { +XLA_TEST_P(ExhaustiveF32UnaryTest, Acosh) { // Error inherited from Log, which our implementation of Acosh uses. - std::function error_spec_gen; - if (platform_ != "Host" && platform_ != "CUDA" && ty_ == F32) { - error_spec_gen = [](float x) { return ErrorSpec{0.001, 0.001}; }; - } else { - error_spec_gen = GetDefaultSpecGenerator(ty_); + ErrorSpecGen error_spec_gen = GetDefaultSpecGenerator(); + if (platform_ != "Host" && platform_ != "CUDA") { + error_spec_gen = +[](float x) { return ErrorSpec{0.001, 0.001}; }; } Run(Acosh, std::acosh, error_spec_gen); } -XLA_TEST_P(Exhaustive32BitOrLessUnaryTest, Asinh) { - // Error inherited from Log, which our implementation of Asinh uses. - std::function error_spec_gen; - if (platform_ != "Host" && platform_ != "CUDA" && ty_ == F32) { - error_spec_gen = [](float x) { return ErrorSpec{0.001, 0.001}; }; - } else { - error_spec_gen = GetDefaultSpecGenerator(ty_); +XLA_TEST_P(ExhaustiveF16UnaryTest, Acosh) { Run(Acosh, std::acosh); } +XLA_TEST_P(ExhaustiveBF16UnaryTest, Acosh) { Run(Acosh, std::acosh); } + +// Tests for Asinh +XLA_TEST_P(ExhaustiveF32UnaryTest, Asinh) { + ErrorSpecGen error_spec_gen = GetDefaultSpecGenerator(); + if (platform_ != "Host" && platform_ != "CUDA") { + error_spec_gen = +[](float x) { return ErrorSpec{0.001, 0.001}; }; } + Run(Asinh, std::asinh, error_spec_gen); } -XLA_TEST_P(Exhaustive32BitOrLessUnaryTest, Atanh) { Run(Atanh, std::atanh); } -XLA_TEST_P(Exhaustive32BitOrLessUnaryTest, Acos) { Run(Acos, std::acos); } -XLA_TEST_P(Exhaustive32BitOrLessUnaryTest, Asin) { Run(Asin, std::asin); } +XLA_TEST_P(ExhaustiveF16UnaryTest, Asinh) { Run(Asinh, std::asinh); } +XLA_TEST_P(ExhaustiveBF16UnaryTest, Asinh) { Run(Asinh, std::asinh); } -XLA_TEST_P(Exhaustive32BitOrLessUnaryTest, Cosh) { +XLA_TEST_FLOAT_32_BITS_OR_LESS(Atanh, { Run(Atanh, std::atanh); }) +XLA_TEST_FLOAT_32_BITS_OR_LESS(Acos, { Run(Acos, std::acos); }) +XLA_TEST_FLOAT_32_BITS_OR_LESS(Asin, { Run(Asin, std::asin); }) + +XLA_TEST_FLOAT_32_BITS_OR_LESS(Cosh, { // Our cosh implementation incorrectly overflows to inf for +/-89.4159851. // The correct answer of 3.40281961e+38 (0x7f7fffec) is very close to // max-float, so we deem this acceptable. 
@@ -539,8 +372,9 @@ XLA_TEST_P(Exhaustive32BitOrLessUnaryTest, Cosh) { }; } Run(Cosh, host_cosh); -} -XLA_TEST_P(Exhaustive32BitOrLessUnaryTest, Sinh) { +}) + +XLA_TEST_FLOAT_32_BITS_OR_LESS(Sinh, { // Our sinh implementation incorrectly overflows to +/-inf for +/-89.4159851. // The correct answer of 3.40281961e+38 (0x7f7fffec) is very close to // max-float, so we deem this acceptable. @@ -559,76 +393,103 @@ XLA_TEST_P(Exhaustive32BitOrLessUnaryTest, Sinh) { }; } Run(Sinh, host_sinh); -} -XLA_TEST_P(Exhaustive32BitOrLessUnaryTest, Tanh) { Run(Tanh, std::tanh); } +}) -void Exhaustive32BitOrLessUnaryTest::SetParamsForSinCosTan() { - if (platform_ == "Host" || platform_ == "CUDA") { +XLA_TEST_FLOAT_32_BITS_OR_LESS(Tanh, { + ErrorSpecGen error_spec_gen = GetDefaultSpecGenerator(); + if (platform_ == "CUDA") { + error_spec_gen = +[](NativeT x) { + return x <= static_cast(-20.0) || x >= static_cast(20.0) + ? ErrorSpec{0, 0} + : GetDefaultSpecGenerator()(x); + }; + } + Run(Tanh, std::tanh, error_spec_gen); +}) + +template +void Exhaustive32BitOrLessUnaryTest::SetParamsForSinCosTan() { + if (this->platform_ == "Host" || this->platform_ == "CUDA") { return; } // Non CPU/GPU targets may have used the Cody-Waite range reduction technique // and will not provide meaningful results for sin/cos/tan if magnitudes // exceed 2**p. - if (ty_ == F32) { - known_incorrect_fn_ = [](int64 v) { + if (T == F32) { + this->known_incorrect_fn_ = [](int64 v) { float f = BitCast(static_cast(v)); return std::abs(f) > (1 << 13); }; - } else if (ty_ == BF16) { - known_incorrect_fn_ = [](int64 v) { + } else if (T == BF16) { + this->known_incorrect_fn_ = [](int64 v) { float f = static_cast(BitCast(static_cast(v))); return std::abs(f) > (1 << 13); }; } } -XLA_TEST_P(Exhaustive32BitOrLessUnaryTest, Cos) { +XLA_TEST_P(ExhaustiveF32UnaryTest, Cos) { SetParamsForSinCosTan(); - std::function error_spec_gen; - if (ty_ == F32) { - error_spec_gen = [](float) { return ErrorSpec{0.001, 0.001}; }; - } else { - error_spec_gen = GetDefaultSpecGenerator(ty_); - } - Run(Cos, std::cos, error_spec_gen); + Run( + Cos, std::cos, +[](NativeT) { + return ErrorSpec{0.001, 0.001}; + }); } -XLA_TEST_P(Exhaustive32BitOrLessUnaryTest, Sin) { +XLA_TEST_P(ExhaustiveF16UnaryTest, Cos) { SetParamsForSinCosTan(); - std::function error_spec_gen; - if (ty_ == F32) { - error_spec_gen = [](float) { return ErrorSpec{0.001, 0.001}; }; - } else { - error_spec_gen = GetDefaultSpecGenerator(ty_); - } - Run(Sin, std::sin, error_spec_gen); + Run(Cos, std::cos); } -XLA_TEST_P(Exhaustive32BitOrLessUnaryTest, Tan) { +XLA_TEST_P(ExhaustiveBF16UnaryTest, Cos) { SetParamsForSinCosTan(); - std::function error_spec_gen; - if (ty_ == F32) { - error_spec_gen = [](float) { return ErrorSpec{0.001, 0.001}; }; - } else { - error_spec_gen = GetDefaultSpecGenerator(ty_); - } - Run(Tan, std::tan, error_spec_gen); + Run(Cos, std::cos); +} + +XLA_TEST_P(ExhaustiveF32UnaryTest, Sin) { + SetParamsForSinCosTan(); + Run( + Sin, std::sin, +[](NativeT) { + return ErrorSpec{0.001, 0.001}; + }); +} +XLA_TEST_P(ExhaustiveF16UnaryTest, Sin) { + SetParamsForSinCosTan(); + Run(Sin, std::sin); +} +XLA_TEST_P(ExhaustiveBF16UnaryTest, Sin) { + SetParamsForSinCosTan(); + Run(Sin, std::sin); +} + +XLA_TEST_P(ExhaustiveF32UnaryTest, Tan) { + SetParamsForSinCosTan(); + Run( + Tan, std::tan, +[](NativeT) { + return ErrorSpec{0.001, 0.001}; + }); +} +XLA_TEST_P(ExhaustiveF16UnaryTest, Tan) { + SetParamsForSinCosTan(); + Run(Tan, std::tan); +} +XLA_TEST_P(ExhaustiveBF16UnaryTest, Tan) { + 
SetParamsForSinCosTan(); + Run(Tan, std::tan); } // TODO(jlebar): Enable these. -// XLA_TEST_P(Exhaustive32BitOrLessUnaryTest, Atan) { Run(Atan, std::atan); } -// XLA_TEST_P(Exhaustive32BitOrLessUnaryTest, Atan2) { Run(Atan2, std::atan2); } +// XLA_TEST_FLOAT_32_BITS_OR_LESS(Atan) { Run(Atan, std::atan); } +// XLA_TEST_FLOAT_32_BITS_OR_LESS(Atan2) { Run(Atan2, std::atan2); } -XLA_TEST_P(Exhaustive32BitOrLessUnaryTest, Erf) { Run(Erf, std::erf); } -XLA_TEST_P(Exhaustive32BitOrLessUnaryTest, Erfc) { Run(Erfc, std::erfc); } -XLA_TEST_P(Exhaustive32BitOrLessUnaryTest, ErfInv) { Run(ErfInv, HostErfInv); } -XLA_TEST_P(Exhaustive32BitOrLessUnaryTest, Digamma) { - std::function error_spec_gen; +XLA_TEST_FLOAT_32_BITS_OR_LESS(Erf, { Run(Erf, std::erf); }) +XLA_TEST_FLOAT_32_BITS_OR_LESS(Erfc, { Run(Erfc, std::erfc); }) +XLA_TEST_FLOAT_32_BITS_OR_LESS(ErfInv, { Run(ErfInv, HostErfInv); }) +XLA_TEST_FLOAT_32_BITS_OR_LESS(Digamma, { + ErrorSpecGen error_spec_gen = GetDefaultSpecGenerator(); if (platform_ != "Host" && platform_ != "CUDA") { // TODO(b/123956399): This is a fairly high error, significantly higher than // we see on CPU/GPU. - error_spec_gen = [](float) { return ErrorSpec{0.01, 0.01}; }; - } else { - error_spec_gen = GetDefaultSpecGenerator(ty_); + error_spec_gen = +[](NativeT) { return ErrorSpec{0.01, 0.01}; }; } if (platform_ == "CUDA") { @@ -651,27 +512,25 @@ XLA_TEST_P(Exhaustive32BitOrLessUnaryTest, Digamma) { } else { Run(Digamma, HostDigamma, error_spec_gen); } -} -XLA_TEST_P(Exhaustive32BitOrLessUnaryTest, Lgamma) { +}) + +XLA_TEST_FLOAT_32_BITS_OR_LESS(Lgamma, { // Our implementation gets within 0.0001 rel error except for ~20 denormal // inputs on GPU. Anyway 0.001 rel error should be good enough for lgamma. - auto default_spec_gen = GetDefaultSpecGenerator(ty_); - std::function error_spec_gen; + ErrorSpecGen error_spec_gen = GetDefaultSpecGenerator(); if (platform_ == "CUDA" && (ty_ == F32 || ty_ == F16)) { - error_spec_gen = [default_spec_gen](float x) { - ErrorSpec spec = default_spec_gen(x); + error_spec_gen = +[](NativeT x) { + auto spec = GetDefaultSpecGenerator()(x); spec.rel_err = 0.001; return spec; }; - } else { - error_spec_gen = default_spec_gen; } float (*host_lgamma)(float) = std::lgamma; if (platform_ != "Host" && platform_ != "CUDA") { // TODO(b/123956399): This is a fairly high error, significantly higher than // we see on CPU/GPU. - error_spec_gen = [](float) { return ErrorSpec{0.01, 0.01}; }; + error_spec_gen = +[](NativeT) { return ErrorSpec{0.01, 0.01}; }; // Overflows to inf for input 4.08500343e+36 (0x7c44af8e). 
if (ty_ == F32) { @@ -684,28 +543,362 @@ XLA_TEST_P(Exhaustive32BitOrLessUnaryTest, Lgamma) { } } Run(Lgamma, host_lgamma, error_spec_gen); -} +}) -XLA_TEST_P(Exhaustive32BitOrLessUnaryTest, Round) { Run(Round, std::round); } +XLA_TEST_FLOAT_32_BITS_OR_LESS(Round, { Run(Round, std::round); }) -INSTANTIATE_TEST_SUITE_P( - F32, Exhaustive32BitOrLessUnaryTest, - ::testing::Combine(::testing::Values(F32), - ::testing::ValuesIn( - ExhaustiveOpTestBase::CreateExhaustiveF32Ranges()))); +#if defined(UNARY_TEST_TARGET_F32_OR_SMALLER) + +INSTANTIATE_TEST_SUITE_P(F32, ExhaustiveF32UnaryTest, + ::testing::ValuesIn(CreateExhaustiveF32Ranges())); #if !defined(XLA_BACKEND_DOES_NOT_SUPPORT_FLOAT16) -INSTANTIATE_TEST_SUITE_P( - F16, Exhaustive32BitOrLessUnaryTest, - ::testing::Combine(::testing::Values(F16), - ::testing::Values(std::make_pair(0, 1 << 16)))); +INSTANTIATE_TEST_SUITE_P(F16, ExhaustiveF16UnaryTest, + ::testing::Values(std::make_pair(0, 1 << 16))); #endif #if defined(XLA_BACKEND_SUPPORTS_BFLOAT16) +INSTANTIATE_TEST_SUITE_P(BF16, ExhaustiveBF16UnaryTest, + ::testing::Values(std::make_pair(0, 1 << 16))); +#endif + +#endif + +// Exhaustive test for unary operations for double. +// +// Test parameter is a tuple containing +// - primitive type under test, +// - FpValues representing a set of double values. + +class ExhaustiveF64UnaryTest : public ExhaustiveUnaryTest, + public ::testing::WithParamInterface { + private: + int64 GetInputSize() override { + FpValues values = GetParam(); + return values.GetTotalNumValues(); + } + + void FillInput(std::array* input_literal) override { + FpValues fp_values = GetParam(); + int64 input_size = (*input_literal)[0].element_count(); + LOG(INFO) << "Checking fp values " << fp_values.ToString() << ", " + << input_size; + absl::Span input_arr = (*input_literal)[0].data(); + + uint64 i = 0; + for (auto bits : fp_values) { + input_arr[i] = this->ConvertAndReplaceKnownIncorrectValueWith(bits, 1); + ++i; + } + CHECK_EQ(i, input_size); + } +}; + +XLA_TEST_P(ExhaustiveF64UnaryTest, Log) { Run(Log, std::log); } + +XLA_TEST_P(ExhaustiveF64UnaryTest, Log1p) { Run(Log1p, std::log1p); } + +XLA_TEST_P(ExhaustiveF64UnaryTest, Exp) { Run(Exp, std::exp); } + +XLA_TEST_P(ExhaustiveF64UnaryTest, Expm1) { Run(Expm1, std::expm1); } + +// TODO(b/138385863): Turn on the test for GPU after fixing the bug. +XLA_TEST_P(ExhaustiveF64UnaryTest, DISABLED_ON_GPU(PowOneHalf)) { + Run([](XlaOp x) { return Pow(x, ScalarLike(x, 0.5)); }, + +[](double x) { return std::pow(x, 0.5); }); +} + +XLA_TEST_P(ExhaustiveF64UnaryTest, Rsqrt) { + Run( + Rsqrt, +[](double x) { return 1 / std::sqrt(x); }); +} + +XLA_TEST_P(ExhaustiveF64UnaryTest, Sqrt) { Run(Sqrt, std::sqrt); } + +XLA_TEST_P(ExhaustiveF64UnaryTest, Acosh) { Run(Acosh, std::acosh); } + +XLA_TEST_P(ExhaustiveF64UnaryTest, Asinh) { Run(Asinh, std::asinh); } + +XLA_TEST_P(ExhaustiveF64UnaryTest, Atanh) { Run(Atanh, std::atanh); } + +XLA_TEST_P(ExhaustiveF64UnaryTest, Acos) { Run(Acos, std::acos); } + +XLA_TEST_P(ExhaustiveF64UnaryTest, Asin) { Run(Asin, std::asin); } + +XLA_TEST_P(ExhaustiveF64UnaryTest, Cosh) { Run(Cosh, std::cosh); } + +XLA_TEST_P(ExhaustiveF64UnaryTest, Sinh) { Run(Sinh, std::sinh); } + +XLA_TEST_P(ExhaustiveF64UnaryTest, Tanh) { + ErrorSpecGen error_spec_gen = GetDefaultSpecGenerator(); + if (platform_ == "CUDA") { + error_spec_gen = +[](NativeT x) { + return x <= static_cast(-20.0) || x >= static_cast(20.0) + ? 
ErrorSpec{0, 0} + : GetDefaultSpecGenerator()(x); + }; + } + Run(Tanh, std::tanh, error_spec_gen); +} + +XLA_TEST_P(ExhaustiveF64UnaryTest, Cos) { Run(Cos, std::cos); } + +XLA_TEST_P(ExhaustiveF64UnaryTest, Sin) { Run(Sin, std::sin); } + +XLA_TEST_P(ExhaustiveF64UnaryTest, Tan) { Run(Tan, std::tan); } + +XLA_TEST_P(ExhaustiveF64UnaryTest, Round) { Run(Round, std::round); } + +XLA_TEST_P(ExhaustiveF64UnaryTest, Erf) { + Run(Erf, std::erf, [](NativeT x) { return ErrorSpec{1e-20, 1e-20}; }); +} + +XLA_TEST_P(ExhaustiveF64UnaryTest, Erfc) { + Run(Erfc, std::erfc, [](NativeT x) { return ErrorSpec{1e-20, 1e-20}; }); +} + +#if defined(UNARY_TEST_TARGET_F64) +#if !defined(XLA_BACKEND_DOES_NOT_SUPPORT_FLOAT64) INSTANTIATE_TEST_SUITE_P( - BF16, Exhaustive32BitOrLessUnaryTest, - ::testing::Combine(::testing::Values(BF16), - ::testing::Values(std::make_pair(0, 1 << 16)))); + SpecialValues, ExhaustiveF64UnaryTest, + ::testing::ValuesIn(CreateFpValuesForBoundaryTest())); + +INSTANTIATE_TEST_SUITE_P(NormalValues, ExhaustiveF64UnaryTest, + ::testing::Values(GetNormals(1000))); + +// Tests a total of 4000000000 inputs, with 16000000 inputs in each sub-test, to +// keep the peak memory usage low. +INSTANTIATE_TEST_SUITE_P( + LargeAndSmallMagnituedNormalValues, ExhaustiveF64UnaryTest, + ::testing::ValuesIn(GetFpValuesForMagnitudeExtremeNormals( + 4000000000ull, 16000000))); +#endif +#endif + +// T is the Primitive Type of the complex number +// Test parameter is a tuple containing +// - primitive type under test, +// - two FpValues representing the values for the real and imaginary +// components. The complex numbers for the test input is the cartesian +// product of the values represented by the two FpValues. +template +class ExhaustiveComplexUnaryTestBase + : public ExhaustiveUnaryTest, + public ::testing::WithParamInterface> { + protected: + using typename ExhaustiveUnaryTest::NativeT; + + void SetParamsForTanh() { + // TODO(b/138126045): Current libc++ implementation of the complex tanh + // function returns (NaN, NaN) when the imaginary + // component is more than half of the max value. + // TODO(b/138750327): Current libc++ implementation of the complex tanh + // function returns (1, 0) when the real component is + // negative infinity, when it should return (-1, 0). + // We only need to set the former as incorrect values for C128 because when + // testing with C64, we first cast our input to a C128 value. + this->known_incorrect_fn_ = [&](int64 v) { + double f = this->ConvertValue(v); + return (T == C128 && + std::abs(f) > std::numeric_limits::max() / 2) || + f == -std::numeric_limits::infinity(); + }; + } + + private: + // Generates the input complex literal given the FpValues representation for + // the real and imaginary components. 
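As the comment above notes, the complex test inputs are the Cartesian product of the candidate real parts and candidate imaginary parts. A trivial standalone sketch of that pairing (the component values are arbitrary):

// Sketch: build complex test inputs as the Cartesian product of candidate
// real parts and candidate imaginary parts.
#include <complex>
#include <cstdio>
#include <vector>

std::vector<std::complex<float>> ComplexProduct(
    const std::vector<float>& reals, const std::vector<float>& imags) {
  std::vector<std::complex<float>> inputs;
  inputs.reserve(reals.size() * imags.size());
  for (float re : reals) {
    for (float im : imags) {
      inputs.emplace_back(re, im);
    }
  }
  return inputs;
}

int main() {
  std::vector<float> reals = {0.0f, 1.0f};
  std::vector<float> imags = {-1.0f, 2.0f};
  for (const auto& z : ComplexProduct(reals, imags)) {
    std::printf("(%g, %g)\n", z.real(), z.imag());
  }
}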
+ void FillInput(std::array* input_literal) override { + FpValues real_values = std::get<0>(GetParam()); + FpValues imag_values = std::get<1>(GetParam()); + + VLOG(2) << " testing input total " + << real_values.GetTotalNumValues() * imag_values.GetTotalNumValues() + << ", range " << real_values.ToString() << " " + << imag_values.ToString(); + + absl::Span input_arr = (*input_literal)[0].data(); + + uint64 i = 0; + for (auto real : real_values) { + for (auto imag : imag_values) { + input_arr[i] = + NativeT(this->ConvertAndReplaceKnownIncorrectValueWith(real, 1), + this->ConvertAndReplaceKnownIncorrectValueWith(imag, 1)); + + ++i; + } + } + } + + int64 GetInputSize() override { + FpValues real_values = std::get<0>(GetParam()); + FpValues imag_values = std::get<1>(GetParam()); + return real_values.GetTotalNumValues() * imag_values.GetTotalNumValues(); + } +}; + +typedef ExhaustiveComplexUnaryTestBase ExhaustiveC64UnaryTest; +typedef ExhaustiveComplexUnaryTestBase ExhaustiveC128UnaryTest; + +// TODO(b/138578594): Enable the test for the CPU backend after fixing the bug. +XLA_TEST_P(ExhaustiveC64UnaryTest, DISABLED_ON_CPU(Log)) { + Run(Log, [](complex64 x) { return std::log(x); }); +} + +XLA_TEST_P(ExhaustiveC64UnaryTest, Sqrt) { + Run(Sqrt, [](complex64 x) { + return static_cast( + std::sqrt(static_cast(x))); + }); +} + +XLA_TEST_P(ExhaustiveC64UnaryTest, Rsqrt) { + Run(Rsqrt, [](complex64 x) { + return static_cast( + complex128(1, 0) / std::sqrt(static_cast(x))); + }); +} + +// The current libc++ implementation of the complex tanh function provides +// less accurate results when the denomenator of a complex tanh is small, due +// to floating point precision loss. To avoid this issue for complex64 numbers, +// we cast it to and from a complex128 when computing tanh. +XLA_TEST_P(ExhaustiveC64UnaryTest, Tanh) { + SetParamsForTanh(); + ErrorSpecGen error_spec_gen = +[](complex64 x) { + // This implementation of Tanh becomes less accurate when the denominator + // is small. + if (std::cosh(2 * x.real()) + std::cos(2 * x.imag()) < 1e-4) { + return ErrorSpec{5e-2, 5e-2}; + } + + return GetDefaultSpecGenerator()(x); + }; + Run( + Tanh, + +[](complex64 x) { + return static_cast(std::tanh(static_cast(x))); + }, + error_spec_gen); +} + +#if defined(UNARY_TEST_TARGET_COMPLEX) +INSTANTIATE_TEST_SUITE_P( + F32SpecialValues, ExhaustiveC64UnaryTest, + ::testing::Combine( + ::testing::ValuesIn(CreateFpValuesForBoundaryTest()), + ::testing::ValuesIn(CreateFpValuesForBoundaryTest()))); + +INSTANTIATE_TEST_SUITE_P( + F32SpecialAndNormalValues, ExhaustiveC64UnaryTest, + ::testing::Combine( + ::testing::ValuesIn(CreateFpValuesForBoundaryTest()), + ::testing::Values(GetNormals(10000)))); + +INSTANTIATE_TEST_SUITE_P( + F32NormalAndSpecialValues, ExhaustiveC64UnaryTest, + ::testing::Combine( + ::testing::Values(GetNormals(10000)), + ::testing::ValuesIn(CreateFpValuesForBoundaryTest()))); + +INSTANTIATE_TEST_SUITE_P( + F32NormalAndNormalValues, ExhaustiveC64UnaryTest, + ::testing::Combine(::testing::Values(GetNormals(10000)), + ::testing::Values(GetNormals(10000)))); + +// Tests a total of 40000 ^ 2 inputs, with 4000 ^ 2 inputs in each sub-test, to +// keep the peak memory usage low. 
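The C64 Tanh error spec above keys off cosh(2*Re(x)) + cos(2*Im(x)), which is the denominator in the identity tanh(a+bi) = (sinh(2a) + i*sin(2b)) / (cosh(2a) + cos(2b)); when it is near zero the implementation loses precision, so the tolerance is loosened. A sketch of that check (the 1e-4 cutoff and 5e-2 tolerance come from the test above; the 1e-6 default is a stand-in):

// Sketch: widen the error tolerance for complex tanh when the denominator of
// tanh(a+bi) = (sinh(2a) + i*sin(2b)) / (cosh(2a) + cos(2b)) is tiny, which is
// where the implementation loses precision.
#include <cmath>
#include <complex>
#include <cstdio>

struct Tol {
  double abs_err;
  double rel_err;
};

Tol TanhTolerance(std::complex<float> x) {
  double denom = std::cosh(2.0 * x.real()) + std::cos(2.0 * x.imag());
  if (denom < 1e-4) {
    return {5e-2, 5e-2};  // loosened tolerance near the small denominator
  }
  return {1e-6, 1e-6};  // stand-in for the default spec
}

int main() {
  // Near a + bi = 0 + (pi/2)i the denominator cosh(0) + cos(pi) is ~0.
  std::complex<float> near_pole(0.0f, 1.5707964f);
  Tol t = TanhTolerance(near_pole);
  std::printf("abs=%g rel=%g\n", t.abs_err, t.rel_err);
}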
+INSTANTIATE_TEST_SUITE_P( + F32LargeAndSmallMagnituedNormalValues, ExhaustiveC64UnaryTest, + ::testing::Combine( + ::testing::ValuesIn(GetFpValuesForMagnitudeExtremeNormals(40000, + 4000)), + ::testing::ValuesIn( + GetFpValuesForMagnitudeExtremeNormals(40000, 4000)))); +#endif + + +XLA_TEST_P(ExhaustiveC128UnaryTest, Log) { + // TODO(b/138578313): Enable the test for all values after fixing the bug. + known_incorrect_fn_ = [&](int64 v) { + double f = this->ConvertValue(v); + return std::fpclassify(f) == FP_NAN || std::abs(f) > 1.0e+300 || + std::abs(f) < 1.0e-300; + }; + Run(Log, [](complex128 x) { return std::log(x); }); +} + +XLA_TEST_P(ExhaustiveC128UnaryTest, Sqrt) { + // Similar to the Tanh bug. + known_incorrect_fn_ = [&](int64 v) { + double f = this->ConvertValue(v); + return std::abs(f) > std::numeric_limits::max() / 2; + }; + Run(Sqrt, [](complex128 x) { return std::sqrt(x); }); +} + +XLA_TEST_P(ExhaustiveC128UnaryTest, Rsqrt) { + ErrorSpecGen error_spec_gen = GetDefaultSpecGenerator(); + if (platform_ == "CUDA") { + // Edge case on CUDA backend where the Log of a complex number made up of + // the smallest denormals is more accurate than the interpreter backend. + error_spec_gen = [](complex128 x) { + constexpr double denorm_min = std::numeric_limits::denorm_min(); + if (std::abs(x.real()) == denorm_min && + std::abs(x.imag()) == denorm_min) { + return ErrorSpec(0.5, 0.5); + } + return GetDefaultSpecGenerator()(x); + }; + } + Run( + Rsqrt, + [](complex128 x) { return complex128(1, 0) / std::sqrt(x); }, + error_spec_gen); +} + +XLA_TEST_P(ExhaustiveC128UnaryTest, Tanh) { + SetParamsForTanh(); + Run( + Tanh, +[](complex128 x) { return std::tanh(x); }); +} + +#if defined(UNARY_TEST_TARGET_COMPLEX) +#if !defined(XLA_BACKEND_DOES_NOT_SUPPORT_FLOAT64) +INSTANTIATE_TEST_SUITE_P( + SpecialValues, ExhaustiveC128UnaryTest, + ::testing::Combine( + ::testing::ValuesIn(CreateFpValuesForBoundaryTest()), + ::testing::ValuesIn(CreateFpValuesForBoundaryTest()))); + +INSTANTIATE_TEST_SUITE_P( + SpecialAndNormalValues, ExhaustiveC128UnaryTest, + ::testing::Combine( + ::testing::ValuesIn(CreateFpValuesForBoundaryTest()), + ::testing::Values(GetNormals(10000)))); + +INSTANTIATE_TEST_SUITE_P( + NormalAndSpecialValues, ExhaustiveC128UnaryTest, + ::testing::Combine( + ::testing::Values(GetNormals(10000)), + ::testing::ValuesIn(CreateFpValuesForBoundaryTest()))); + +INSTANTIATE_TEST_SUITE_P( + F32NormalAndNormalValues, ExhaustiveC128UnaryTest, + ::testing::Combine(::testing::Values(GetNormals(10000)), + ::testing::Values(GetNormals(10000)))); + +// Tests a total of 40000 ^ 2 inputs, with 2000 ^ 2 inputs in each sub-test, to +// keep the peak memory usage low. 
+INSTANTIATE_TEST_SUITE_P( + LargeAndSmallMagnituedNormalValues, ExhaustiveC128UnaryTest, + ::testing::Combine( + ::testing::ValuesIn( + GetFpValuesForMagnitudeExtremeNormals(40000, 2000)), + ::testing::ValuesIn( + GetFpValuesForMagnitudeExtremeNormals(40000, 2000)))); +#endif #endif } // namespace xla diff --git a/tensorflow/compiler/xla/tests/llvm_irgen_test_base.cc b/tensorflow/compiler/xla/tests/llvm_irgen_test_base.cc index 701dac3902b..8df4a57afcd 100644 --- a/tensorflow/compiler/xla/tests/llvm_irgen_test_base.cc +++ b/tensorflow/compiler/xla/tests/llvm_irgen_test_base.cc @@ -84,6 +84,29 @@ void LlvmIrGenTestBase::CompileAheadOfTimeAndVerifyIr( EXPECT_TRUE(filecheck_result.ValueOrDie()); } +void LlvmIrGenTestBase::MatchOptimizedHlo(absl::string_view hlo, + absl::string_view pattern, + bool print_operand_shape) { + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr optimized_module, + GetOptimizedModule(hlo)); + HloPrintOptions print_opts; + print_opts.set_print_operand_shape(print_operand_shape); + StatusOr filecheck_result = + RunFileCheck(optimized_module->ToString(print_opts), pattern); + TF_ASSERT_OK(filecheck_result.status()); + EXPECT_TRUE(filecheck_result.ValueOrDie()); +} + +StatusOr> LlvmIrGenTestBase::GetOptimizedModule( + absl::string_view hlo) { + HloModuleConfig config; + TF_ASSIGN_OR_RETURN(std::unique_ptr module, + ParseAndReturnVerifiedModule(hlo, config)); + return backend().compiler()->RunHloPasses( + std::move(module), backend().default_stream_executor(), + backend().default_stream_executor()->GetAllocator()); +} + LLVMCompiler* LlvmIrGenTestBase::GetLLVMCompiler() { return static_cast(backend().compiler()); } diff --git a/tensorflow/compiler/xla/tests/llvm_irgen_test_base.h b/tensorflow/compiler/xla/tests/llvm_irgen_test_base.h index 018f9546afc..ff69787c273 100644 --- a/tensorflow/compiler/xla/tests/llvm_irgen_test_base.h +++ b/tensorflow/compiler/xla/tests/llvm_irgen_test_base.h @@ -58,6 +58,21 @@ class LlvmIrGenTestBase : public CodegenTestBase { const string& pattern, bool match_optimized_ir); + // Compiles the given `hlo` with optimizations, and verifies that optimized + // HLO matches the given FileCheck pattern. + void MatchOptimizedHlo(absl::string_view hlo, absl::string_view pattern, + bool print_operand_shape = false); + + // LikeMatchOptimizedHlo, but checks operand shapes as well. + void MatchOptimizedHloWithShapes(absl::string_view hlo, + absl::string_view pattern) { + MatchOptimizedHlo(hlo, pattern, /*print_operand_shape=*/true); + } + + // Compiles and returns module with optimizations from a given HLO. 
+ StatusOr> GetOptimizedModule( + absl::string_view hlo); + private: LLVMCompiler* GetLLVMCompiler(); diff --git a/tensorflow/compiler/xla/tests/reduce_window_test.cc b/tensorflow/compiler/xla/tests/reduce_window_test.cc index c5e1dbe7432..ff8adb0c460 100644 --- a/tensorflow/compiler/xla/tests/reduce_window_test.cc +++ b/tensorflow/compiler/xla/tests/reduce_window_test.cc @@ -142,6 +142,15 @@ XLA_TEST_P(ReduceWindowTest, Min3In5Stride2) { {}, ErrorSpec(0.00001)); } +XLA_TEST_P(ReduceWindowTest, Min3In5Stride2Same) { + const auto input = CreateConstantFromLiteral( + LiteralUtil::CreateR1({10000, 1000, 100, 10, 1}), &builder_); + ReduceWindowMin(input, {3}, {2}, Padding::kSame); + ComputeAndCompareLiteral(&builder_, + LiteralUtil::CreateR1({1000, 10, 1}), {}, + ErrorSpec(0.00001)); +} + XLA_TEST_P(ReduceWindowTest, Min3In5Stride1WithSamePadding) { const auto input = CreateConstantFromLiteral( LiteralUtil::CreateR1({10000, 1000, 100, 10, 1}), &builder_); diff --git a/tensorflow/compiler/xla/tests/test_macros.h b/tensorflow/compiler/xla/tests/test_macros.h index 9636df2ff5f..c9c2cb7630b 100644 --- a/tensorflow/compiler/xla/tests/test_macros.h +++ b/tensorflow/compiler/xla/tests/test_macros.h @@ -36,6 +36,7 @@ limitations under the License. #define DISABLED_ON_CPU(X) X #define DISABLED_ON_GPU(X) X +#define DISABLED_ON_GPU_ROCM(X) X #define DISABLED_ON_INTERPRETER(X) X // We need this macro instead of pasting directly to support nesting @@ -54,6 +55,12 @@ limitations under the License. #ifdef XLA_TEST_BACKEND_GPU # undef DISABLED_ON_GPU # define DISABLED_ON_GPU(X) XLA_TEST_PASTE(DISABLED_, X) + +#if TENSORFLOW_USE_ROCM +# undef DISABLED_ON_GPU_ROCM +# define DISABLED_ON_GPU_ROCM(X) XLA_TEST_PASTE(DISABLED_, X) +#endif // TENSORFLOW_USE_ROCM + #endif // XLA_TEST_BACKEND_GPU #ifdef XLA_TEST_BACKEND_INTERPRETER diff --git a/tensorflow/compiler/xla/tests/test_utils.cc b/tensorflow/compiler/xla/tests/test_utils.cc index c3618eb20fa..4563d7e0df2 100644 --- a/tensorflow/compiler/xla/tests/test_utils.cc +++ b/tensorflow/compiler/xla/tests/test_utils.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include "tensorflow/compiler/xla/tests/test_utils.h" + #include #include "absl/base/casts.h" @@ -21,9 +23,9 @@ limitations under the License. #include "tensorflow/compiler/xla/primitive_util.h" #include "tensorflow/compiler/xla/service/hlo_casting_utils.h" #include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" #include "tensorflow/compiler/xla/service/hlo_verifier.h" #include "tensorflow/compiler/xla/service/transfer_manager.h" -#include "tensorflow/compiler/xla/tests/test_utils.h" namespace xla { @@ -349,13 +351,14 @@ void PopulateWithRandomIntegralDataWithBounds(Literal* literal, // range [min, max]. Currently this works only for INT types. 
StatusOr MakeFakeLiteralInternalWithBounds(const Shape& shape, std::minstd_rand0* engine, - int64 min, int64 max) { + int64 min, int64 max, + bool is_sorted) { if (shape.IsTuple()) { std::vector elements; for (const Shape& element_shape : shape.tuple_shapes()) { - TF_ASSIGN_OR_RETURN( - Literal element, - MakeFakeLiteralInternalWithBounds(element_shape, engine, min, max)); + TF_ASSIGN_OR_RETURN(Literal element, + MakeFakeLiteralInternalWithBounds( + element_shape, engine, min, max, is_sorted)); elements.push_back(std::move(element)); } return LiteralUtil::MakeTupleOwned(std::move(elements)); @@ -373,34 +376,58 @@ StatusOr MakeFakeLiteralInternalWithBounds(const Shape& shape, case S8: PopulateWithRandomIntegralDataWithBounds( &literal, engine, static_cast(min), static_cast(max)); + if (is_sorted) { + std::sort(literal.data().begin(), literal.data().end()); + } break; case U8: PopulateWithRandomIntegralDataWithBounds( &literal, engine, static_cast(min), static_cast(max)); + if (is_sorted) { + std::sort(literal.data().begin(), literal.data().end()); + } break; case S16: PopulateWithRandomIntegralDataWithBounds( &literal, engine, static_cast(min), static_cast(max)); + if (is_sorted) { + std::sort(literal.data().begin(), literal.data().end()); + } break; case U16: PopulateWithRandomIntegralDataWithBounds( &literal, engine, static_cast(min), static_cast(max)); + if (is_sorted) { + std::sort(literal.data().begin(), literal.data().end()); + } break; case S32: PopulateWithRandomIntegralDataWithBounds( &literal, engine, static_cast(min), static_cast(max)); + if (is_sorted) { + std::sort(literal.data().begin(), literal.data().end()); + } break; case U32: PopulateWithRandomIntegralDataWithBounds( &literal, engine, static_cast(min), static_cast(max)); + if (is_sorted) { + std::sort(literal.data().begin(), literal.data().end()); + } break; case S64: PopulateWithRandomIntegralDataWithBounds( &literal, engine, static_cast(min), static_cast(max)); + if (is_sorted) { + std::sort(literal.data().begin(), literal.data().end()); + } break; case U64: PopulateWithRandomIntegralDataWithBounds( &literal, engine, static_cast(min), static_cast(max)); + if (is_sorted) { + std::sort(literal.data().begin(), literal.data().end()); + } break; default: return Unimplemented( @@ -510,6 +537,7 @@ StatusOr CreateLiteralForConstrainedUses( int64 index_bound = INT64_MAX; bool no_duplicates = false; bool needs_constant = false; + bool needs_sorted_indices = false; ConstantType constant_type = ConstantType::kUnknown; for (HloInstruction* use : constrained_uses) { switch (use->opcode()) { @@ -547,6 +575,13 @@ StatusOr CreateLiteralForConstrainedUses( std::min(index_bound, operand_shape.dimensions(dim_in_operand)); } } + if (use->opcode() == HloOpcode::kScatter) { + needs_sorted_indices |= + Cast(use)->indices_are_sorted(); + } else { + needs_sorted_indices |= + Cast(use)->indices_are_sorted(); + } break; } case HloOpcode::kReduce: @@ -579,7 +614,7 @@ StatusOr CreateLiteralForConstrainedUses( } if (index_bound != INT64_MAX) { return MakeFakeLiteralInternalWithBounds(param.shape(), engine, -1, - index_bound); + index_bound, needs_sorted_indices); } else if (needs_constant) { switch (constant_type) { case ConstantType::kZero: diff --git a/tensorflow/compiler/xla/util.h b/tensorflow/compiler/xla/util.h index dacb5faa228..06ea42235b2 100644 --- a/tensorflow/compiler/xla/util.h +++ b/tensorflow/compiler/xla/util.h @@ -424,19 +424,6 @@ T CeilOfRatio(T dividend, T divisor) { return tensorflow::MathUtil::CeilOfRatio(dividend, divisor); } 
-template -std::vector ElementWiseCeilOfRatio(absl::Span dividends, - absl::Span divisors) { - std::vector ceil_of_ratios; - CHECK_EQ(dividends.size(), divisors.size()); - ceil_of_ratios.reserve(dividends.size()); - absl::c_transform(dividends, divisors, std::back_inserter(ceil_of_ratios), - [](const T dividend, const T divisor) { - return CeilOfRatio(dividend, divisor); - }); - return ceil_of_ratios; -} - // Rounds the value up to a multiple of the divisor by first calling CeilOfRatio // then multiplying by the divisor. For example: RoundUpToNearest(13, 8) => 16 template diff --git a/tensorflow/compiler/xla/xla.bzl b/tensorflow/compiler/xla/xla.bzl index d91bc72c2f8..bfd79b537e3 100644 --- a/tensorflow/compiler/xla/xla.bzl +++ b/tensorflow/compiler/xla/xla.bzl @@ -1,15 +1,15 @@ """Wrapper around cc_proto_library used inside the XLA codebase.""" load( - "//tensorflow/core:platform/default/build_config.bzl", + "//tensorflow/core/platform:default/build_config.bzl", "cc_proto_library", ) load( - "//tensorflow/core:platform/default/build_config_root.bzl", + "//tensorflow/core/platform:default/build_config_root.bzl", "if_static", ) load( - "//tensorflow/core:platform/default/cuda_build_defs.bzl", + "//tensorflow/core/platform:default/cuda_build_defs.bzl", "if_cuda_is_configured", ) @@ -48,3 +48,6 @@ ORC_JIT_MEMORY_MAPPER_TARGETS = [] # We link the GPU plugin into the XLA Python extension if CUDA is enabled. def xla_python_default_plugins(): return if_cuda_is_configured(["//tensorflow/compiler/xla/service:gpu_plugin"]) + +def xla_py_test_deps(): + return [] diff --git a/tensorflow/compiler/xla/xla.proto b/tensorflow/compiler/xla/xla.proto index 7a40e4096de..09c6c793a2f 100644 --- a/tensorflow/compiler/xla/xla.proto +++ b/tensorflow/compiler/xla/xla.proto @@ -180,6 +180,11 @@ message DebugOptions { // xla_cpu_enable_fast_math is false. bool xla_cpu_fast_math_honor_division = 126; + // When xla_cpu_enable_fast_math is true then this controls whether we forbid + // to approximate calculations for functions. Ignored when + // xla_cpu_enable_fast_math is false. + bool xla_cpu_fast_math_honor_functions = 129; + // When true we lower the Minimum and Maximum hlos in the GPU backend such // that Min(NotNaN, NaN) = Min(NaN, NotNaN) = NotNaN. In other words, if flag // this is true we don't propagate NaNs through Min and Max. @@ -282,7 +287,13 @@ message DebugOptions { bool xla_gpu_force_conv_nchw = 125; - // Next id: 127 + // Paths to files with ptx code. + repeated string xla_gpu_ptx_file = 127; + + // Blacklist for cuDNN convolutions. + string xla_gpu_algorithm_blacklist_path = 128; + + // Next id: 130 // Extra options to pass to the compilation backend (e.g. LLVM); specific // interpretation of these values is left to the backend. diff --git a/tensorflow/compiler/xla/xla_data.proto b/tensorflow/compiler/xla/xla_data.proto index 1bd6db2662e..f5218ad4d8c 100644 --- a/tensorflow/compiler/xla/xla_data.proto +++ b/tensorflow/compiler/xla/xla_data.proto @@ -294,6 +294,10 @@ message ExecutionProfile { // The size of the binary code in the executable. int64 executable_size_in_bytes = 6; + + // Whether this profile was drawn from a cache of profiles instead of from + // execution on the hardware. + bool profile_cache_hit = 7; } // Handle given to a user that represents an execution that the user launched @@ -579,6 +583,12 @@ message CholeskyOptions { bool lower = 1; } +// Generic map of attributes used to pass hints / configuration options from +// the Python frontend to the XLA backend. 
+message FrontendAttributes { + map map = 1; +} + message OpSharding { enum Type { // This sharding is replicated across all devices (implies maximal, diff --git a/tensorflow/compiler/xrt/BUILD b/tensorflow/compiler/xrt/BUILD index 67402c11fcc..ce614904523 100644 --- a/tensorflow/compiler/xrt/BUILD +++ b/tensorflow/compiler/xrt/BUILD @@ -8,7 +8,7 @@ load( ) load("//tensorflow/compiler/xla:xla.bzl", "xla_proto_library") load( - "//tensorflow/core:platform/default/build_config.bzl", + "//tensorflow/core/platform:default/build_config.bzl", "tf_proto_library_py", ) diff --git a/tensorflow/compiler/xrt/client/xrt_grpc_eager_client.cc b/tensorflow/compiler/xrt/client/xrt_grpc_eager_client.cc index 39c83c14f0a..d5f60ec33bb 100644 --- a/tensorflow/compiler/xrt/client/xrt_grpc_eager_client.cc +++ b/tensorflow/compiler/xrt/client/xrt_grpc_eager_client.cc @@ -45,7 +45,6 @@ EAGER_CLIENT_METHOD(WaitQueueDone); EAGER_CLIENT_METHOD(KeepAlive); EAGER_CLIENT_METHOD(CloseContext); EAGER_CLIENT_METHOD(RegisterFunction); -EAGER_CLIENT_METHOD(SendTensor); #undef EAGER_CLIENT_METHOD #define WORKER_CLIENT_METHOD(method) \ diff --git a/tensorflow/compiler/xrt/client/xrt_grpc_eager_client.h b/tensorflow/compiler/xrt/client/xrt_grpc_eager_client.h index 2ef4efa652c..75e32e6d8f0 100644 --- a/tensorflow/compiler/xrt/client/xrt_grpc_eager_client.h +++ b/tensorflow/compiler/xrt/client/xrt_grpc_eager_client.h @@ -73,9 +73,6 @@ class XrtGrpcEagerClient { eager::RegisterFunctionResponse* response, StatusCallback done, CallOptions* call_opts = nullptr); - void SendTensorAsync(const eager::SendTensorRequest* request, - eager::SendTensorResponse* response, StatusCallback done, - CallOptions* call_opts = nullptr); // The following two methods are actually from the WorkerService API, not // EagerService, but are necessary for using remote Eager, and we include them diff --git a/tensorflow/compiler/xrt/client/xrt_tf_client.cc b/tensorflow/compiler/xrt/client/xrt_tf_client.cc index 88d0d25f84a..20206088799 100644 --- a/tensorflow/compiler/xrt/client/xrt_tf_client.cc +++ b/tensorflow/compiler/xrt/client/xrt_tf_client.cc @@ -286,15 +286,16 @@ XrtTensorHandle XrtTfContext::SendTensor( op_id = op->id; } - eager::SendTensorRequest request; + eager::EnqueueRequest request; request.set_context_id(context_id_); - request.set_op_id(op_id); - request.mutable_tensors()->AddAllocated(tensor_proto.release()); - request.set_device_name(devices_.at(rpc_device_id).name()); - auto response = std::make_shared(); + auto* send_tensor = request.add_queue()->mutable_send_tensor(); + send_tensor->set_op_id(op_id); + send_tensor->mutable_tensors()->AddAllocated(tensor_proto.release()); + send_tensor->set_device_name(devices_.at(rpc_device_id).name()); + auto response = std::make_shared(); auto context_ptr = shared_from_this(); absl::Notification done; - eager_client_->SendTensorAsync( + eager_client_->EnqueueAsync( &request, response.get(), [context_ptr, op_id, response, &done](Status status) { absl::MutexLock lock(&context_ptr->mu_); @@ -440,6 +441,7 @@ XrtTensorHandle& XrtTensorHandle::operator=(XrtTensorHandle&& other) { void XrtTensorHandle::Serialize(eager::RemoteTensorHandle* proto) const { proto->set_op_id(tensor_id_.first); proto->set_output_num(tensor_id_.second); + proto->set_device(context_->devices_.at(device_id_).name()); } AttrValue MakeAttrValue(std::string s) { diff --git a/tensorflow/compiler/xrt/kernels/xrt_compile_ops.cc b/tensorflow/compiler/xrt/kernels/xrt_compile_ops.cc index b791519c097..89daa98ee18 100644 --- 
a/tensorflow/compiler/xrt/kernels/xrt_compile_ops.cc +++ b/tensorflow/compiler/xrt/kernels/xrt_compile_ops.cc @@ -151,7 +151,7 @@ void XRTCompileOp::Compute(OpKernelContext* ctx) { xrt::XLAComputation computation_proto; OP_REQUIRES( ctx, - computation_proto.ParseFromString(computation_input.scalar()()), + computation_proto.ParseFromString(computation_input.scalar()()), errors::InvalidArgument( "Unable to parse computation input to XLAComputation")); @@ -191,7 +191,7 @@ void XRTCompileOp::Compute(OpKernelContext* ctx) { .ComputeProgramShape() .ToProto(); Tensor program_shape_output(DT_STRING, TensorShape({1})); - program_shape_output.vec()(0) = program_shape.SerializeAsString(); + program_shape_output.vec()(0) = program_shape.SerializeAsString(); ctx->set_output(1, program_shape_output); } diff --git a/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc b/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc index 231387e314f..1c4e1f7e2c7 100644 --- a/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc +++ b/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc @@ -260,7 +260,7 @@ Status XRTExecuteOp::DoWork(OpKernelContext* context) { TF_RET_CHECK(TensorShapeUtils::IsScalar(execution_config.shape())); xrt::XRTExecutionConfig config_proto; TF_RET_CHECK( - config_proto.ParseFromString(execution_config.scalar()())); + config_proto.ParseFromString(execution_config.scalar()())); int core_index_in_replica = config_proto.core_index_in_replica(); TF_RET_CHECK(core_index_in_replica == 0); @@ -343,12 +343,12 @@ Status XRTExecuteChainedOp::DoWork(OpKernelContext* context) { const Tensor& execution_plan = context->input(0); TF_RET_CHECK(TensorShapeUtils::IsScalar(execution_plan.shape())); xrt::XRTChainedExecutePlan plan; - TF_RET_CHECK(plan.ParseFromString(execution_plan.scalar()())); + TF_RET_CHECK(plan.ParseFromString(execution_plan.scalar()())); const Tensor& execution_config = context->input(1); TF_RET_CHECK(TensorShapeUtils::IsScalar(execution_config.shape())); xrt::XRTChainedExecuteConfig config; - TF_RET_CHECK(config.ParseFromString(execution_config.scalar()())); + TF_RET_CHECK(config.ParseFromString(execution_config.scalar()())); XRTCompilationCache* cache; TF_RETURN_IF_ERROR(rm->Lookup( diff --git a/tensorflow/compiler/xrt/kernels/xrt_state_ops.h b/tensorflow/compiler/xrt/kernels/xrt_state_ops.h index 2ffde52af06..769ec188349 100644 --- a/tensorflow/compiler/xrt/kernels/xrt_state_ops.h +++ b/tensorflow/compiler/xrt/kernels/xrt_state_ops.h @@ -177,7 +177,7 @@ class XRTAllocateOp : public OpKernel { xrt::XLAAllocation allocation_proto; OP_REQUIRES( ctx, - allocation_proto.ParseFromString(allocation_info.scalar()()), + allocation_proto.ParseFromString(allocation_info.scalar()()), errors::InvalidArgument( "Unable to parse allocation input to XLAAllocation")); @@ -419,7 +419,7 @@ class XRTMakeTupleOp : public OpKernel { errors::Internal("tuple description input should be a string scalar")); xrt::XLATupleNode tuple_proto; OP_REQUIRES( - ctx, tuple_proto.ParseFromString(tuple_info.scalar()()), + ctx, tuple_proto.ParseFromString(tuple_info.scalar()()), errors::InvalidArgument("Unable to parse tuple input to XLATupleNode")); OpInputList arg_list; @@ -512,7 +512,7 @@ class XRTReadLiteralOp : public OpKernel { xla::LiteralProto literal_proto = literal.ToProto(); Tensor output(DT_STRING, TensorShape({})); - literal_proto.SerializeToString(&output.scalar()()); + SerializeToTString(literal_proto, &output.scalar()()); ctx->set_output(0, output); } }; @@ -627,7 +627,7 @@ class XRTWriteLiteralOp : public OpKernel { 
errors::Internal("literal input should be a string scalar")); xla::LiteralProto literal_proto; OP_REQUIRES(ctx, - literal_proto.ParseFromString(literal_info.scalar()()), + literal_proto.ParseFromString(literal_info.scalar()()), errors::InvalidArgument( "Unable to parse allocation input to LiteralProto")); xla::Literal literal; diff --git a/tensorflow/compiler/xrt/tests/BUILD b/tensorflow/compiler/xrt/tests/BUILD index cc6ab9a3ed4..701125f63f0 100644 --- a/tensorflow/compiler/xrt/tests/BUILD +++ b/tensorflow/compiler/xrt/tests/BUILD @@ -1,6 +1,6 @@ load("//tensorflow:tensorflow.bzl", "tf_cc_test", "tf_cuda_cc_test") load( - "//tensorflow/core:platform/default/build_config_root.bzl", + "//tensorflow/core/platform:default/build_config_root.bzl", "tf_cuda_tests_tags", ) diff --git a/tensorflow/compiler/xrt/tests/raw_api_test.cc b/tensorflow/compiler/xrt/tests/raw_api_test.cc index f0729251eeb..427a631f82d 100644 --- a/tensorflow/compiler/xrt/tests/raw_api_test.cc +++ b/tensorflow/compiler/xrt/tests/raw_api_test.cc @@ -127,7 +127,7 @@ xla::LiteralProto FloatMatrix( xla::Literal ReadOutputLiteral(const std::vector& outputs, size_t idx) { xla::LiteralProto response; - CHECK(response.ParseFromString(outputs[idx].scalar()())); + CHECK(response.ParseFromString(outputs[idx].scalar()())); return xla::Literal::CreateFromProto(response).ValueOrDie(); } @@ -316,7 +316,7 @@ TEST(RawApiTest, AllocFromTensor) { EXPECT_EQ(outputs.size(), 1); xla::LiteralProto response; - EXPECT_TRUE(response.ParseFromString(outputs[0].scalar()())); + EXPECT_TRUE(response.ParseFromString(outputs[0].scalar()())); EXPECT_TRUE(CompareLiteralToLiteralProto(literal, response)); } @@ -351,7 +351,7 @@ TEST(RawApiTest, AllocUninitialized) { EXPECT_EQ(outputs.size(), 1); xla::LiteralProto read_back_literal; EXPECT_TRUE( - read_back_literal.ParseFromString(outputs[0].scalar()())); + read_back_literal.ParseFromString(outputs[0].scalar()())); Tensor read_back_tensor; TF_ASSERT_OK(LiteralToHostTensor( xla::Literal::CreateFromProto(read_back_literal).ValueOrDie(), DT_FLOAT, @@ -381,7 +381,7 @@ TEST(RawApiTest, AllocUninitialized) { EXPECT_EQ(outputs.size(), 1); xla::LiteralProto response; - EXPECT_TRUE(response.ParseFromString(outputs[0].scalar()())); + EXPECT_TRUE(response.ParseFromString(outputs[0].scalar()())); EXPECT_TRUE(CompareLiteralProtos(response, new_literal)); } } @@ -413,7 +413,7 @@ TEST(RawApiTest, AllocFromTensorTuple) { EXPECT_EQ(outputs.size(), 1); xla::LiteralProto response; - EXPECT_TRUE(response.ParseFromString(outputs[0].scalar()())); + EXPECT_TRUE(response.ParseFromString(outputs[0].scalar()())); EXPECT_TRUE(CompareLiteralToLiteralProto(literal, response)); } @@ -439,7 +439,7 @@ TEST(RawApiTest, AllocFromTensorTupleSingle) { EXPECT_EQ(outputs.size(), 1); xla::LiteralProto response; - EXPECT_TRUE(response.ParseFromString(outputs[0].scalar()())); + EXPECT_TRUE(response.ParseFromString(outputs[0].scalar()())); EXPECT_TRUE(CompareLiteralToLiteralProto(literal, response)); } @@ -465,7 +465,7 @@ TEST(RawApiTest, AllocFromTensorRelayout) { EXPECT_EQ(outputs.size(), 1); xla::LiteralProto response; - EXPECT_TRUE(response.ParseFromString(outputs[0].scalar()())); + EXPECT_TRUE(response.ParseFromString(outputs[0].scalar()())); // We have sent literal's data (in array layout) with a attribute layout // {0,1}, so the expected literal read from device needs to be changed // accordingly. 
@@ -493,7 +493,7 @@ TEST(RawApiTest, AllocAndRewrite) { int64 allocation_handle = outputs[1].scalar()(); xla::LiteralProto response; - EXPECT_TRUE(response.ParseFromString(outputs[0].scalar()())); + EXPECT_TRUE(response.ParseFromString(outputs[0].scalar()())); EXPECT_TRUE(CompareLiteralProtos(alloc.value(), response)); xla::LiteralProto new_literal = @@ -512,7 +512,7 @@ TEST(RawApiTest, AllocAndRewrite) { EXPECT_EQ(outputs.size(), 1); xla::LiteralProto new_response; - EXPECT_TRUE(new_response.ParseFromString(outputs[0].scalar()())); + EXPECT_TRUE(new_response.ParseFromString(outputs[0].scalar()())); EXPECT_TRUE(CompareLiteralProtos(new_literal, new_response)); Tensor release_tensor(DT_INT64, TensorShape({1})); @@ -652,7 +652,7 @@ TEST(RawApiTest, ReadAndWriteState) { session.Run(ClientSession::FeedType(), {read_back}, {release}, &outputs)); xla::LiteralProto response; - EXPECT_TRUE(response.ParseFromString(outputs[0].scalar()())); + EXPECT_TRUE(response.ParseFromString(outputs[0].scalar()())); EXPECT_TRUE(CompareLiteralProtos(alloc.value(), response)); } @@ -673,7 +673,7 @@ TEST(RawApiTest, ReadAndWriteStateAutoFree) { TF_EXPECT_OK(session.Run({read_back}, &outputs)); xla::LiteralProto response; - EXPECT_TRUE(response.ParseFromString(outputs[0].scalar()())); + EXPECT_TRUE(response.ParseFromString(outputs[0].scalar()())); EXPECT_TRUE(CompareLiteralProtos(alloc.value(), response)); } @@ -707,13 +707,13 @@ TEST(RawApiTest, SubBuffer) { auto base_elements = base_literal.DecomposeTuple(); auto nested_0_elements = base_elements[0].Clone().DecomposeTuple(); xla::LiteralProto response_0; - EXPECT_TRUE(response_0.ParseFromString(outputs[0].scalar()())); + EXPECT_TRUE(response_0.ParseFromString(outputs[0].scalar()())); EXPECT_TRUE(CompareLiteralToLiteralProto(base_elements[0], response_0)); xla::LiteralProto response_1; - EXPECT_TRUE(response_1.ParseFromString(outputs[1].scalar()())); + EXPECT_TRUE(response_1.ParseFromString(outputs[1].scalar()())); EXPECT_TRUE(CompareLiteralToLiteralProto(base_elements[1], response_1)); xla::LiteralProto response_00; - EXPECT_TRUE(response_00.ParseFromString(outputs[2].scalar()())); + EXPECT_TRUE(response_00.ParseFromString(outputs[2].scalar()())); EXPECT_TRUE(CompareLiteralToLiteralProto(nested_0_elements[0], response_00)); } @@ -779,9 +779,9 @@ TEST(RawApiTest, MakeTuple) { std::vector outputs; TF_EXPECT_OK(session.Run({res_0, res_1}, &outputs)); xla::LiteralProto response_0; - EXPECT_TRUE(response_0.ParseFromString(outputs[0].scalar()())); + EXPECT_TRUE(response_0.ParseFromString(outputs[0].scalar()())); xla::LiteralProto response_1; - EXPECT_TRUE(response_1.ParseFromString(outputs[1].scalar()())); + EXPECT_TRUE(response_1.ParseFromString(outputs[1].scalar()())); auto expected_0 = MakeTuple0(); EXPECT_TRUE(CompareLiteralProtos(response_0, expected_0)); @@ -853,7 +853,7 @@ TEST(RawApiTest, ExecuteChainedOpByOp) { TF_EXPECT_OK(session.Run({read_back}, &outputs)); xla::LiteralProto response; - EXPECT_TRUE(response.ParseFromString(outputs[0].scalar()())); + EXPECT_TRUE(response.ParseFromString(outputs[0].scalar()())); auto expected = xla::LiteralUtil::CreateR1({-150.0f, -36.0f}); EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response)); @@ -973,7 +973,7 @@ TEST(RawApiTest, ExecuteChained) { EXPECT_EQ(outputs.size(), 1); xla::LiteralProto response; - EXPECT_TRUE(response.ParseFromString(outputs[0].scalar()())); + EXPECT_TRUE(response.ParseFromString(outputs[0].scalar()())); auto expected = xla::LiteralUtil::CreateR1({-150.0f, -36.0f}); 
EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response)); @@ -1022,13 +1022,13 @@ TEST(RawApiTest, CompileAndExecute) { TF_EXPECT_OK(session.Run({read_back, c_handle.program_shape}, &outputs)); xla::LiteralProto response; - EXPECT_TRUE(response.ParseFromString(outputs[0].scalar()())); + EXPECT_TRUE(response.ParseFromString(outputs[0].scalar()())); auto expected = xla::LiteralUtil::CreateR1({27.0f, 21.0f}); EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response)); xla::ProgramShapeProto program_shape; - EXPECT_TRUE(program_shape.ParseFromString(outputs[1].vec()(0))); + EXPECT_TRUE(program_shape.ParseFromString(outputs[1].vec()(0))); EXPECT_EQ(program_shape.parameters_size(), 2); } @@ -1077,13 +1077,13 @@ TEST(RawApiTest, CompileAndExecuteWithArgumentVector) { TF_EXPECT_OK(session.Run({read_back, c_handle.program_shape}, &outputs)); xla::LiteralProto response; - EXPECT_TRUE(response.ParseFromString(outputs[0].scalar()())); + EXPECT_TRUE(response.ParseFromString(outputs[0].scalar()())); auto expected = xla::LiteralUtil::CreateR1({27.0f, 21.0f}); EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response)); xla::ProgramShapeProto program_shape; - EXPECT_TRUE(program_shape.ParseFromString(outputs[1].vec()(0))); + EXPECT_TRUE(program_shape.ParseFromString(outputs[1].vec()(0))); EXPECT_EQ(program_shape.parameters_size(), 2); } @@ -1128,7 +1128,8 @@ TEST(RawApiTest, CompileWithXlaReturnShapes) { {release}, &outputs)); xla::ProgramShapeProto program_shape_proto; - EXPECT_TRUE(program_shape_proto.ParseFromString(outputs[0].vec()(0))); + EXPECT_TRUE( + program_shape_proto.ParseFromString(outputs[0].vec()(0))); xla::ProgramShape program_shape(program_shape_proto); EXPECT_EQ(program_shape.parameters_size(), 1); @@ -1196,7 +1197,7 @@ TEST(RawApiTest, DotGeneralWithLayoutTest) { TF_EXPECT_OK(session.Run({read_back}, &outputs)); xla::LiteralProto response; - EXPECT_TRUE(response.ParseFromString(outputs[0].scalar()())); + EXPECT_TRUE(response.ParseFromString(outputs[0].scalar()())); auto expected = xla::LiteralUtil::CreateR2WithLayout({{18.0f}, {44.0f}}, layout); @@ -1231,7 +1232,7 @@ TEST(RawApiTest, CompileAndExecuteZeroArg) { TF_EXPECT_OK(session.Run({read_back}, &outputs)); xla::LiteralProto response; - EXPECT_TRUE(response.ParseFromString(outputs[0].scalar()())); + EXPECT_TRUE(response.ParseFromString(outputs[0].scalar()())); auto expected = xla::LiteralUtil::CreateR0(3.0f); EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response)); @@ -1281,7 +1282,7 @@ TEST(RawApiTest, CompileAndExecuteReturnTuple) { TF_EXPECT_OK(session.Run({read_back}, &outputs)); xla::LiteralProto response; - EXPECT_TRUE(response.ParseFromString(outputs[0].scalar()())); + EXPECT_TRUE(response.ParseFromString(outputs[0].scalar()())); auto sum = xla::LiteralUtil::CreateR1({9.0f, 7.0f}); auto expected = xla::LiteralUtil::MakeTuple({&sum}); @@ -1343,7 +1344,7 @@ TEST(RawApiTest, CompileAndExecuteReturnExplodedTuple) { EXPECT_EQ(voutputs.size(), 1); xla::LiteralProto response; - EXPECT_TRUE(response.ParseFromString(voutputs[0].scalar()())); + EXPECT_TRUE(response.ParseFromString(voutputs[0].scalar()())); auto expected = xla::LiteralUtil::CreateR0(kResults[i]); EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response)); @@ -1514,13 +1515,13 @@ TEST(RawApiTest, CompileAndExecuteWithS64Argument) { TF_EXPECT_OK(session.Run({read_back, c_handle.program_shape}, &outputs)); xla::LiteralProto response; - EXPECT_TRUE(response.ParseFromString(outputs[0].scalar()())); + 
EXPECT_TRUE(response.ParseFromString(outputs[0].scalar()())); auto expected = xla::LiteralUtil::CreateR0(15123899); EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response)); xla::ProgramShapeProto program_shape; - EXPECT_TRUE(program_shape.ParseFromString(outputs[1].vec()(0))); + EXPECT_TRUE(program_shape.ParseFromString(outputs[1].vec()(0))); EXPECT_EQ(program_shape.parameters_size(), 2); EXPECT_TRUE(xla::ShapeUtil::HasPrimitiveType( xla::Shape(program_shape.result()), xla::S64)); @@ -1580,7 +1581,7 @@ TEST(RawApiTest, TestDeviceMemoryCompaction) { // we have on record. for (size_t i = 1, j = 0; i < handles.size(); i += 2, ++j) { xla::LiteralProto response; - EXPECT_TRUE(response.ParseFromString(outputs[j].scalar()())); + EXPECT_TRUE(response.ParseFromString(outputs[j].scalar()())); EXPECT_TRUE(CompareLiteralProtos(allocs[i].value(), response)); } } @@ -1668,7 +1669,7 @@ TEST(RawApiTest, TestDeviceMemorySwap) { EXPECT_EQ(outputs.size(), 1); xla::LiteralProto response; - EXPECT_TRUE(response.ParseFromString(outputs[0].scalar()())); + EXPECT_TRUE(response.ParseFromString(outputs[0].scalar()())); auto literal = xla::Literal::CreateFromProto(response).ValueOrDie(); EXPECT_EQ(literal, zero_literal); } diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD index 79c0a4136e1..034ecd85fd0 100644 --- a/tensorflow/contrib/BUILD +++ b/tensorflow/contrib/BUILD @@ -1,7 +1,6 @@ # Description: # contains parts of TensorFlow that are experimental or unstable and which are not supported. -load("//third_party/mpi:mpi.bzl", "if_mpi") load("//tensorflow:tensorflow.bzl", "if_not_windows") package( @@ -42,7 +41,6 @@ py_library( "//tensorflow/contrib/factorization:factorization_py", "//tensorflow/contrib/feature_column:feature_column_py", "//tensorflow/contrib/framework:framework_py", - "//tensorflow/contrib/gan", "//tensorflow/contrib/graph_editor:graph_editor_py", "//tensorflow/contrib/grid_rnn:grid_rnn_py", "//tensorflow/contrib/hadoop", @@ -109,7 +107,7 @@ py_library( "//tensorflow/contrib/util:util_py", "//tensorflow/python:util", "//tensorflow/python/estimator:estimator_py", - ] + if_mpi(["//tensorflow/contrib/mpi_collectives:mpi_collectives_py"]) + select({ + ] + select({ "//tensorflow:android": [], "//tensorflow:ios": [], "//tensorflow:linux_s390x": [], @@ -176,7 +174,7 @@ cc_library( "//tensorflow/contrib/tensor_forest:stats_ops_kernels", "//tensorflow/contrib/tensor_forest:tensor_forest_kernels", "//tensorflow/contrib/text:all_kernels", - ] + if_mpi(["//tensorflow/contrib/mpi_collectives:mpi_collectives_py"]) + select({ + ] + select({ "//tensorflow:android": [], "//tensorflow:ios": [], "//tensorflow:linux_s390x": [], diff --git a/tensorflow/contrib/__init__.py b/tensorflow/contrib/__init__.py index 0d510a16601..1611cf4f338 100644 --- a/tensorflow/contrib/__init__.py +++ b/tensorflow/contrib/__init__.py @@ -49,7 +49,6 @@ from tensorflow.contrib import estimator from tensorflow.contrib import factorization from tensorflow.contrib import feature_column from tensorflow.contrib import framework -from tensorflow.contrib import gan from tensorflow.contrib import graph_editor from tensorflow.contrib import grid_rnn from tensorflow.contrib import image diff --git a/tensorflow/contrib/autograph/examples/notebooks/algorithms.ipynb b/tensorflow/contrib/autograph/examples/notebooks/algorithms.ipynb index bf824e2760e..c51d2124920 100644 --- a/tensorflow/contrib/autograph/examples/notebooks/algorithms.ipynb +++ b/tensorflow/contrib/autograph/examples/notebooks/algorithms.ipynb @@ -18,18 +18,29 @@ 
"cell_type": "code", "execution_count": 0, "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, + "colab": {}, "colab_type": "code", "id": "TuWj26KWz1fZ" }, "outputs": [], "source": [ - "!pip install -U -q tf-nightly" + "!pip install -U -q tf-nightly-2.0-preview" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "Cp7iTarmz62Y" + }, + "outputs": [], + "source": [ + "import tensorflow as tf\n", + "\n", + "tf = tf.compat.v2\n", + "tf.enable_v2_behavior()" ] }, { @@ -41,25 +52,21 @@ "source": [ "### Fibonacci numbers\n", "\n", - "https://en.wikipedia.org/wiki/Fibonacci_number" + "https://en.wikipedia.org/wiki/Fibonacci_number\n" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 11, "metadata": { "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - }, - "height": 197 + "height": 187 }, "colab_type": "code", "executionInfo": { - "elapsed": 7512, + "elapsed": 709, "status": "ok", - "timestamp": 1532101577266, + "timestamp": 1563825398552, "user": { "displayName": "", "photoUrl": "", @@ -68,7 +75,7 @@ "user_tz": 240 }, "id": "H7olFlMXqrHe", - "outputId": "472dbfe0-9449-4f93-e908-1a0785188a92" + "outputId": "25243e7b-99a7-4a6d-ad00-e97c52be7d97" }, "outputs": [ { @@ -89,25 +96,19 @@ } ], "source": [ - "import tensorflow as tf\n", - "from tensorflow.contrib import autograph as ag\n", - "\n", - "\n", + "@tf.function\n", "def fib(n):\n", " f1 = 0\n", " f2 = 1\n", - " for i in range(n):\n", + " for i in tf.range(n):\n", " tmp = f2\n", " f2 = f2 + f1\n", " f1 = tmp\n", - " print(i, ': ', f2)\n", + " tf.print(i, ': ', f2)\n", " return f2\n", "\n", "\n", - "with tf.Graph().as_default():\n", - " final_fib = ag.to_graph(fib)(tf.constant(10))\n", - " with tf.Session() as sess:\n", - " sess.run(final_fib)" + "_ = fib(tf.constant(10))" ] }, { @@ -122,68 +123,15 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 0, "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - }, - "height": 541 - }, + "colab": {}, "colab_type": "code", - "executionInfo": { - "elapsed": 103, - "status": "ok", - "timestamp": 1532101577412, - "user": { - "displayName": "", - "photoUrl": "", - "userId": "" - }, - "user_tz": 240 - }, - "id": "UeWjK8rHq6Cj", - "outputId": "73ece895-12fb-489a-e52c-032945d7ed7a" + "id": "UeWjK8rHq6Cj" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "from __future__ import print_function\n", - "import tensorflow as tf\n", - "\n", - "def tf__fib(n):\n", - " try:\n", - " with tf.name_scope('fib'):\n", - " f1 = 0\n", - " f2 = 1\n", - "\n", - " def extra_test(f1_1, f2_1):\n", - " with tf.name_scope('extra_test'):\n", - " return True\n", - "\n", - " def loop_body(i, f1_1, f2_1):\n", - " with tf.name_scope('loop_body'):\n", - " tmp = f2_1\n", - " f2_1 = f2_1 + f1_1\n", - " f1_1 = tmp\n", - " with ag__.utils.control_dependency_on_returns(ag__.utils.\n", - " dynamic_print(i, ': ', f2_1)):\n", - " f2, i_1 = ag__.utils.alias_tensors(f2_1, i)\n", - " return f1_1, f2\n", - " f1, f2 = ag__.for_stmt(ag__.utils.dynamic_builtin(range, n),\n", - " extra_test, loop_body, (f1, f2))\n", - " return f2\n", - " except:\n", - " ag__.rewrite_graph_construction_error(ag_source_map__)\n", - "\n" - ] - } - ], + "outputs": [], "source": [ - "print(ag.to_code(fib))" + "print(tf.autograph.to_code(fib.python_function))" ] }, { @@ -200,20 +148,16 @@ }, { "cell_type": "code", - "execution_count": 4, + 
"execution_count": 12, "metadata": { "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - }, - "height": 125 + "height": 119 }, "colab_type": "code", "executionInfo": { - "elapsed": 233, + "elapsed": 663, "status": "ok", - "timestamp": 1532101577681, + "timestamp": 1563825401385, "user": { "displayName": "", "photoUrl": "", @@ -222,7 +166,7 @@ "user_tz": 240 }, "id": "33CAheYsrEQ7", - "outputId": "82a493ee-15b5-419d-8c9c-5f4159090a05" + "outputId": "2a88b65d-4fed-4d96-8770-0c68ffece861" }, "outputs": [ { @@ -240,8 +184,9 @@ ], "source": [ "import tensorflow as tf\n", - "from tensorflow.contrib import autograph as ag\n", "\n", + "\n", + "@tf.function(experimental_autograph_options=tf.autograph.experimental.Feature.EQUALITY_OPERATORS)\n", "def fizzbuzz(i, n):\n", " while i \u003c n:\n", " msg = ''\n", @@ -251,14 +196,11 @@ " msg += 'Buzz'\n", " if msg == '':\n", " msg = tf.as_string(i)\n", - " print(msg)\n", + " tf.print(msg)\n", " i += 1\n", " return i\n", "\n", - "with tf.Graph().as_default():\n", - " final_i = ag.to_graph(fizzbuzz)(tf.constant(10), tf.constant(16))\n", - " with tf.Session() as sess:\n", - " sess.run(final_i)" + "_ = fizzbuzz(tf.constant(10), tf.constant(16))" ] }, { @@ -273,98 +215,15 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 0, "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - }, - "height": 1081 - }, + "colab": {}, "colab_type": "code", - "executionInfo": { - "elapsed": 289, - "status": "ok", - "timestamp": 1532101578003, - "user": { - "displayName": "", - "photoUrl": "", - "userId": "" - }, - "user_tz": 240 - }, - "id": "bBhFIIaZrxvx", - "outputId": "d076a7ea-e643-4689-f90a-57f5d086dedc" + "id": "bBhFIIaZrxvx" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "from __future__ import print_function\n", - "import tensorflow as tf\n", - "\n", - "def tf__fizzbuzz(i, n):\n", - " try:\n", - " with tf.name_scope('fizzbuzz'):\n", - "\n", - " def loop_test(i_1):\n", - " with tf.name_scope('loop_test'):\n", - " return tf.less(i_1, n)\n", - "\n", - " def loop_body(i_1):\n", - " with tf.name_scope('loop_body'):\n", - " msg = ''\n", - "\n", - " def if_true():\n", - " with tf.name_scope('if_true'):\n", - " msg_1, = msg,\n", - " msg_1 += 'Fizz'\n", - " return msg_1,\n", - "\n", - " def if_false():\n", - " with tf.name_scope('if_false'):\n", - " return msg,\n", - " msg = ag__.utils.run_cond(tf.equal(i_1 % 3, 0), if_true, if_false)\n", - "\n", - " def if_true_1():\n", - " with tf.name_scope('if_true_1'):\n", - " msg_2, = msg,\n", - " msg_2 += 'Buzz'\n", - " return msg_2,\n", - "\n", - " def if_false_1():\n", - " with tf.name_scope('if_false_1'):\n", - " return msg,\n", - " msg = ag__.utils.run_cond(tf.equal(i_1 % 5, 0), if_true_1, if_false_1\n", - " )\n", - "\n", - " def if_true_2():\n", - " with tf.name_scope('if_true_2'):\n", - " msg_3, = msg,\n", - " msg_3 = tf.as_string(i_1)\n", - " return msg_3,\n", - "\n", - " def if_false_2():\n", - " with tf.name_scope('if_false_2'):\n", - " return msg,\n", - " msg = ag__.utils.run_cond(tf.equal(msg, ''), if_true_2, if_false_2)\n", - " with ag__.utils.control_dependency_on_returns(ag__.utils.\n", - " dynamic_print(msg)):\n", - " msg_4 = ag__.utils.alias_tensors(msg)\n", - " i_1 += 1\n", - " return i_1,\n", - " i = ag__.while_stmt(loop_test, loop_body, (i,), (tf, n, ag__, i))\n", - " return i\n", - " except:\n", - " ag__.rewrite_graph_construction_error(ag_source_map__)\n", - "\n" - ] - } - ], + "outputs": [], "source": [ - 
"print(ag.to_code(fizzbuzz))" + "print(tf.autograph.to_code(fizzbuzz.python_function))" ] }, { @@ -393,12 +252,7 @@ "cell_type": "code", "execution_count": 0, "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, + "colab": {}, "colab_type": "code", "id": "7moIlf8VABkl" }, @@ -414,44 +268,47 @@ "id": "QlEvfIQPAYF5" }, "source": [ - "#### Game of Life for AutoGraph" + "#### Game of Life for AutoGraph\n", + "\n", + "Note: the code may take a while to run." ] }, { "cell_type": "code", "execution_count": 0, "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, + "colab": {}, "colab_type": "code", "id": "5pCK2qQSAAK4" }, "outputs": [], "source": [ "#@test {\"skip\": true} \n", - "NUM_STEPS = 100" + "NUM_STEPS = 75" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "GPZANPdhMagD" + }, + "source": [ + "Note: This code uses a non-vectorized algorithm, which is quite slow. For 75 steps, it will take a few minutes to run. " ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 10, "metadata": { "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - }, - "height": 308 + "height": 309 }, "colab_type": "code", "executionInfo": { - "elapsed": 14892, + "elapsed": 147654, "status": "ok", - "timestamp": 1532101593030, + "timestamp": 1563825336196, "user": { "displayName": "", "photoUrl": "", @@ -460,15 +317,15 @@ "user_tz": 240 }, "id": "hC3qMqryPDHS", - "outputId": "8405c0e9-e518-41d6-f5bc-e78df6474169" + "outputId": "56a095a3-28a3-455d-e95e-2c4c9dcd97d2" }, "outputs": [ { "data": { "text/html": [ - "\u003cvideo width=\"432.0\" height=\"288.0\" controls autoplay loop\u003e\n", - " \u003csource type=\"video/mp4\" src=\"data:video/mp4;base64,AAAAHGZ0eXBNNFYgAAACAGlzb21pc28yYXZjMQAAAAhmcmVlAACZUm1kYXQAAAKuBgX//6rcRem9\n", - "5tlIt5Ys2CDZI+7veDI2NCAtIGNvcmUgMTQ4IHIyNzk1IGFhYTlhYTggLSBILjI2NC9NUEVHLTQg\n", + "\u003cvideo width=\"432\" height=\"288\" controls autoplay loop\u003e\n", + " \u003csource type=\"video/mp4\" src=\"data:video/mp4;base64,AAAAHGZ0eXBNNFYgAAACAGlzb21pc28yYXZjMQAAAAhmcmVlAABdAG1kYXQAAAKuBgX//6rcRem9\n", + "5tlIt5Ys2CDZI+7veDI2NCAtIGNvcmUgMTUyIHIyODU0IGU5YTU5MDMgLSBILjI2NC9NUEVHLTQg\n", "QVZDIGNvZGVjIC0gQ29weWxlZnQgMjAwMy0yMDE3IC0gaHR0cDovL3d3dy52aWRlb2xhbi5vcmcv\n", "eDI2NC5odG1sIC0gb3B0aW9uczogY2FiYWM9MSByZWY9MyBkZWJsb2NrPTE6MDowIGFuYWx5c2U9\n", "MHgzOjB4MTEzIG1lPWhleCBzdWJtZT03IHBzeT0xIHBzeV9yZD0xLjAwOjAuMDAgbWl4ZWRfcmVm\n", @@ -479,725 +336,449 @@ "bWlkPTIgYl9hZGFwdD0xIGJfYmlhcz0wIGRpcmVjdD0xIHdlaWdodGI9MSBvcGVuX2dvcD0wIHdl\n", "aWdodHA9MiBrZXlpbnQ9MjUwIGtleWludF9taW49MTAgc2NlbmVjdXQ9NDAgaW50cmFfcmVmcmVz\n", "aD0wIHJjX2xvb2thaGVhZD00MCByYz1jcmYgbWJ0cmVlPTEgY3JmPTIzLjAgcWNvbXA9MC42MCBx\n", - "cG1pbj0wIHFwbWF4PTY5IHFwc3RlcD00IGlwX3JhdGlvPTEuNDAgYXE9MToxLjAwAIAAAAPQZYiE\n", - "ABH//veIHzLLafk613IR560urR9Q7kZxXqS9/iAAAAMAFpyZZ6/h5MpYA5/oqv4s2qPbYpW3jfK6\n", - "zQ6q7WMrNj7Hy8jZzmBpfHCwAAO1W4riBNsrapcCk+5V1W0XkkFULR4Qe+H3uGA2HgNW0zFAAUgt\n", - "W4tdpXv2OEg0Vuy5W5l/xGRmEGKDyeXyrM0S6q/1EKbad0x2mcHseUqNmeOGLy1N3b376XZKZcPY\n", - "IXC5F2332tNMj8CwOQiXM9PiCLyCVfZ3rQSkKBTZErkpS5kXUyoJG3FdIqLjRFKEapbUjcW64HIo\n", - "BeIbtRyWV9FyZfcTakx2KW3eB4ZI//MDykSe8CRgN76uBEqZFXwO63wmUREhHOb5AdaLV3xyGl/I\n", - "RV70rU/3t9t1aq5mFD3hy1aLTAV2U7nG072dyX87F7NgCxZHT2kFxu44fxf6gqVzE3PEbGr5fx9x\n", - "7TKXtmY53VP8UaeCd2HJiZ/sd165SutTnfiWvaLuCnmmXGF0AGqbj9S19kgOhTubZIJBydTTqQOV\n", - "YRlxbgKn2nzvunv9+NDG0/2ikyyp73W15QClmjyt8dUeynoN8CwtEQ59DdrAPZe4ARZTwWAfsRXw\n", - 
"1vcZ6Gr1nCNWllQw5IyZyxQtXrfc5p4wjPvGaltciG7d3FG1SGk6HDsZy5i/PsnkjRXLUvGbzYp2\n", - "2gs7ZSGfSJbEifctcMGeSqhOOYORKy6f/9omoieCVEEkniBXwWZ/eImb3nxF7SFIaBjgG2j9w5ut\n", - "BY6zSuQ5zRCdajzJ1loNO0havI8mp5yViAeAlLKYCxeK0Lha1FskL67W1YsARZVZ5EkhqAYEeTNI\n", - "M38Og48OXmj6QBN7c1b9uDUTacYEXO88ZQ1gCIREIMnm2Fgkir8pN4gtSeQ12sfOVz5x5KX7sa95\n", - "L4LyFQPDrFZcDBr4PWLeEEv8yzk0cYHE97GmAlA6WQ0HlWsS42cnXefvTPXnx4vcq8pbEo/slAuH\n", - "IBsrJEN1+aMCc9FNxwUPVbZVaWVjwLY0qh+mNWEaiNGRmacDXrYWw0NjqMPiLiFHacY5oGELRgym\n", - "S2mSo6zhsD1wKQ3EUQtwrjKPiDYc/HCqhkVwoWKUdI8xTS60kn4f5UqB0L77Yevh/wt7AnvQKQAq\n", - "QAEEevggRl1uigbOBTtscnYRnAj0edW4QExAzdo+RwLWXTzW/l3cBWTrh3ORzZQlxJ8jQTvPLB+f\n", - "bLazJZWFQQDcWhuhQ3gYcP1ruNwIroINRIr8px0UOgAhnk6CllxMN6gA5S0YPhFVFKd3n0AAAC9f\n", - "vYgISQAAAltBmiRsQR/+tSqC8p1IAOZemTPutEfx0mzK8zG8tdIxonBsDpoLZ+NnIOp4qK6idP1s\n", - "vbGvZz/zHM86Bg3q0yx2atmtgoo/Trt3YRy3se4HTjou+tCi7oJt2d7A8vEhVDu33JNJx+WCOgP0\n", - "03nVdg9lBs15v/0w7qMc3zqqJXCOy/Whl9aRhcaeOEWcD7uK6mCV8a6MpDJ959xBRfv2i/qFOFbL\n", - "Grs58WiGJcq4MQJI+rVWuFN50oiqBgiunfUrRmdviPYpNN11V9pwcOJwssWfIE3agnor/RC7vfLY\n", - "YoXzaJjtWLEL92OOaHLZT0j555xfb4FZcoJee+RXovB9IaoDdYRusngtBXPMUvnO+g2Z5Qdo9P8q\n", - "Zb8ItBAeHT8IBZAD/Z2nEA6qbxqOBSBtQNW6ZFYLtCTIoP/bLjCDHgtZk3cf+N1CpXs15pUIYWDW\n", - "elZtlTkM4w4EJlLdjLZyQPAeaBx/qoLmKyTKAEhm0hU8EcTq00f6fwkWgz2J6GTGtL/vJXgC8u4o\n", - "nTnf+Ou7sVJGVaouXxrzx+yGVHEcp/eV4gaFA95rInngQAOZWbA3558nK61JBPZl3NjEv5B9r9pg\n", - "2+SYY3wBAUeu2fgAB2+yYGw82pkoJJKpzYWORs6i1vn3GEgUTcwlYsdJcraYC5SnGvqSZhX7KM72\n", - "uE1e9bkpvpVyG/mkACn5R4jwX3xc2utCjjZgM101rirIF/7VfDtmJsSTDes+UVhbSr3SeMSI9ixJ\n", - "+fVuFZ5bnQPoRIfPc+Erw+K99JiGN+HE98/eq4pPlMY9oCfVPSdNyOAAAAFfQZ5CeId/AUuqOi5D\n", - "jlKfxuJGZZ1+rVyomjOIykvxtsjsuCiGtElbraCSFWcn3aIYWLrF3fPovVLcOnroBkiRMsdf5yJA\n", - "F87MQuoKeTaGOrxojCCCS64RiHrqNsE+7mfRRUDuB4sAEHFQHxBorgTukPSvrdFr5QDq+BhZj/6H\n", - "KN+IutwFWKX3ZX9pO3sI8My78TgRY5AA6FEcT91WcvnMypB/OWXzK6M8fYuhVVWipAZigjVOYhcF\n", - "9i6GweQFX9AV9EUQOp2qFbkrT5jceBRFLX6j4JUQ781/UGTekv1fcpCmzlpNpp8GdSeWxRL4gasp\n", - "F5uO5KW63rlhYccBo1cFwIN8txHNnwyQNiP00XC0PWDRZfaWSxsACRWrISow71IyUfcL7JNhjTII\n", - "rwDYATS0xZ9ep8siFC3JTxg1eNaroYfeI4tbkRHok47Vk+CUOQPuagVBtFMOOcy2OUbw8AWlAAAA\n", - "ugGeYXRDfwHM79ghzBo9nMnzfQPPIuvorxBb6AC8F4fYGD/t93kNSKNSEuhUXq9FKGtxnCkxN880\n", - "BPb/uTbjLTQVyPNuYlGl/gTlyLcVA/cDoLrl5TvaR/AcSLFE7C/t3kLx0STNibmdAf4TsHWKSblH\n", - "VWB4X7oQHrrDdhwIivRgUZf7f63j2XaGB+cbp5aHCCwJoovY51YTqsZZTz70FlSnypPHQBNzif7h\n", - "uvZkXhtEzpu9rYMo3YECkgAAAXIBnmNqQ38BDchAitLfY16mYQAQlVmv7062W8KLpIS1/zhS50Ib\n", - "b3ERigmkZKZMPaCsAi+zsLcku/gHGHnVZpuCZMFs72gmyuL4JFo6VjWcr5FtBvzIgD26rBNvP73P\n", - "nJjl3JImmFHiKjNez/gG3zTuYyCACuJCEYXyuEmzCM13hdCPHKg5GZtso0Z1qk6T1k2oiqF/3RIn\n", - "kyjRWuxBlHHmJ46TXULiUY14G+RAGoXI+u/G6muNclld2bq+6Zztuy+5ynaDWNNjuN1Ag9KUIx2F\n", - "XwNdepmp52/rOvISNPbMJ0U26OvqplXi+qHTbg8MLpUSIGCY8w9FZ5woLAENgvgu9M79yGlL20e7\n", - "ypJ4RMBqHYDpEz6Z+SSjXD8LsJ7VKlwo22A5Yukp1vTp6HHA35nV+PXK09DuRWKKdQUzmXVihF51\n", - "/+bB0PEFdoNxGdbbM7WveaCJN8XI7JgQWvw2nPlHX8M5QyPGSJ2HEexumoFrABvRAAAB70GaaEmo\n", - "QWiZTAgj//61KoCPNGHq/MxnjqmxxQAEHvTwibmyMZGX3ES9Abh1tMR+/DjR+6dnqRr/VxCl6gEP\n", - "wJ/5EYCYfGaGmQYsLOeM3v2SZjdvqQBwrwKk5A/63kFm8fc3QCLe93Mldv3KWXHdFT7/mudSntDc\n", - "vJwStG4jgi5LKlWdSrVaAxOmElsF+zWNzaCIQ1dOiZqi3JKj64hOeq1XIWyGvRvh6OLKBpB4rL6W\n", - "ugf7H/IPbSQuF5jWV7zL5LhxWiTiI+kAZTUMfO2YOLzmhCUSN9GAmNzgY4D2awYB4V4QTDjI7kdQ\n", - "tL+3Pmfl1HVilu7nC9CzQSvWIosiwv4btyHTL7IPT2gusybyNfW8QO133L6KbDhhXSDWUtcIFCgn\n", - "QUm36C9hvgGjorpKYr5VnErpJX6fRJm76fFYs8/nt763alyqdcSrqaTOLaf/72Wkkmlwbq3nLOIw\n", - 
"ADFDkkAPwzaM811K11iK/3HaYRT3nEhjJQFk5v4WBXwIVLAZeKdtC8YoGN9K6isN142fOG3s6fm4\n", - "J1nMtOEZHIwep8In4slLmHh39qBzhGZO3igiVpgz7u+JMBeFkVHe72vduBjIy+1dqvxL/TPics3s\n", - "+alwfTMNQKave1qW+5Uj8jZQTjcLAtKvzoako9VMIOfQUQAAAQpBnoZFESw7/wC9ZU4P+UeGsidW\n", - "4n5tFkXmtxppYvKQ+WGj/x3AAdl6+9c9x7N2b/yJykTvVggfpMnFUWtxla4sr1ouwANom+Uf4IBJ\n", - "/zXPovndpGdy98nJbZxFU4rrWpr8aI4YmRX65+IGTn756CZWwXKY5DyMgKnDcCtk0HEuoHgdGhh7\n", - "1PG8+nue+pE9pBHqiBNWAjPd90qfMtABmMShLoXtUObqYbqXhJvVjjFhKdPS03IF24fu9Z0ax15V\n", - "DnkiLmgyOCvJmcdIX70L2ZEECd/hxrSq9JUVjC41OX0F/ayI6GtkPMUuZ2xWkMFo5rqOAo7v0Zlk\n", - "ke/79TjeY13FNiowqcbhMwfDuwAAATIBnqV0Q38BDXNpg2t4nJdhAA5ru/5Co2KbB/AnQt7fa959\n", - "0crOQgtTxL36jtVyKPmfuQMYuWbJ/7bYTEV8sEjceHvN6B0CSEZzVCjaPLzOQJZMQpQ4K4WKPlGc\n", - "lnEwYAC9Dsejj7Fbk2RyCFiJinyU2HOscjUR6fW2jRsAFpVq/PtZDVPvesPG3AqooVaKHp9Ex+Da\n", - "AH0OvccSugyDKsRBAEiYR8645aXxbFSzraQsELDsIIr6HRN8F3lUNVBvzNO3mxBhq4th/kgZSjjJ\n", - "JZrYmg3UfIUO/jn4xs2XQ9Pa7Uy5K3JhuIQwAOUKDmAMC0p6fgz2on4ceyEcfiCGDPZpPyL3391F\n", - "dXID0ctPQ1a+Hk7UcAc9gSDL8CZKz59YyO0ACPjfAKV3Y2dbTAKdWBsUU0EAAAFEAZ6nakN/AItk\n", - "aaqbMCcBE0iEIDnEBfRZN0neHQxaz5DPSzK0ZSL640q0AA5jkP0YAYAumNCN0MxJYpWFoQ9r43H0\n", - "i9SZLdv1UbgpG3aX6KESZW7AgdlevaBngH/w8xYsqWx5t90zzi7x9VyRYpIAD+XTrxvgBoFILNCs\n", - "gd+zDA9uvbAPlLMwG/qFltlwvLokMt344erv3a/C/ySOwZHFzpakInpJ7MQHkmKi1KHZB5KrfqwF\n", - "FnglZJwWbe7LtVojTdwQnAksziDNlEWCkMQQJwziY1KYtlXMNX8mZ3MtYR1KNf/CNin7/ys9ZQyx\n", - "4Zlk//H5KDc/8O2+JaxH20CAaAABxgSxo+yJal1LnRHYfOQ1TygNueW/rPAA37g/6fLS7mbYKz7k\n", - "dsiSiy1mAV7n/qq81UHJPShQSXK+E4Y5XKuXEWG4AAAB8UGarEmoQWyZTAgj//61KoAW7kO9JCjl\n", - "XSE6nAngAJVxWWFl/YDS0gZ32xjwUFed4hmI6rj18z16nS3Mz1iMmFblrtaE4zGXS046COODiIwH\n", - "QG5lRmcBExMKlnynQruQtA8n/NitzdP/ysLrucGyp5nKV+XyJURULfxk4kwNp0a5TFlJ1fusOOJm\n", - "y0hvsvEg+d4Jz3anvWT6M9n5A84CGucNifV+WlN9gI9gs3qSoCZdU/gglcFYM5u8YchzhQFyMKxn\n", - "kpfWK2LU7aaZHt6xLbqjuv74523K9/dtrrsFq/LySiv1P9Wk6/6d5RC72z4cyaUq6hMMn4IWWRo0\n", - "zJIM1/lSYsWxt5/M1Mkv00Rt8OZvmLxuFfd1BIVlANlpgZ39RYhqqzU6v1HwaW0EudelFBGhr5mf\n", - "GaDE05Z8ywp5rN4Qq4D4GNAGD/qgEjtaDDf4ZBAD/TAHBwxfNjm2nPAdbbbIuWSkkv8NK6EMlKqH\n", - "mOktd+CB3P6Szd1+HPnUsyQ3659r3XLnoi0cvM4usfW+BgxqT0mgHSgn/F6ajdTNM+a8xJQnT036\n", - "7195r0uF5vwi7PIviCQ2E4Vs4Wx80/8tBDEJS4qOY1YJ5aNV1OV82fB3HOimLHd2vU/d4Cv7OBh8\n", - "k3gNFcjeBGh+3lQcDCLZrG1mAAAA3kGeykUVLDv/AGVBMHxAlJYGEpFnv2bb0ADrwvVKxe7+SIJI\n", - "g0dPJdL0s9Hd2mGX7rpdIiUH9ZgtnBO+m3uPNae/YtN3u2p0kkCez2KiPNqgSoEcHM+ePgq7afkq\n", - "0HHTSZl/+QbjsyfbI/0lv1mLAJUd3u7VZPPHSdXK3vwLfAwOe3Nid72slU892DijWVvanzM1IzDQ\n", - "XfN6x6GH2qfaLrHePrJTJxXC/RSxcAol7x2JJ5OA8VjN8jXu0yKirBiYqgcdFf9odG8j4bRmE2wD\n", - "MG0SKuGrJfd91b6B7hbRUwAAAPYBnul0Q38Ahz7YAbwPIqnkAA5sEIcKo2/sVUP0LEeFOLjKjaet\n", - "5YFAjDbL5BIdGqWouG/H8ozoec2ZpUbIZu0ELtG5yXc/5opSZlnqbOpqdTQkLs6gr9dv5GbFvVjS\n", - "Os1j9FIMQsdc8pttosNtygWB8gLxr65El6umAZE5CVU9Mc8Xxg/tenmTduGK9Cd7qRDiu1sLYR2f\n", - "or3KBMo8ebz5q5EmWucvREbYSziQIIycIwJg9OG+aH+ZUEQbjbfHfaiX7yoxGJGP78aNOHP7GvC+\n", - "JwM6DxnSyowUBAqkW8ckgrhet8gYYrt8MIe1MPJQB6sv8hHuAXkAAAFWAZ7rakN/AI9XvmYGr0rf\n", - "QEvrPPTQWEAA5ru3wBCXPJiC8OaE25OBvVl2wRXqp61wQU4HxGJCAxkSOz+G3Yzvg36uCK8bPZTq\n", - "avaOG/H9WxjsuwAl/bIYJdnyD151CiUZ34aErVIixKJ53oKrLeHr3xLgxuH+y3w5uH5lQRsL0Pmp\n", - "0jQItTBkKwlPywxFk55pROuYZWi/h/N19QaFlF7WPobUElLlr+nCH+pVt1nW9/YwVGz/cO8zwmWe\n", - "Fb0OnFji7CYSsi9ScC3a50GjUP7IpaY5NAHv33V57bkO/BD6dnreymTbSmQdcj7PAJkvz610fMqn\n", - "mDGTMB31oxAIE5eWeH7mBZouSgmtxEamul7sYaTPe7mP6FqNCz0h6wLot/zAFwx9/D2+XB0x8mmS\n", - "b086o+gqkoYoHQeQm2Sb3MU1Bz0KHDGo9jCmsBmecxs3oNHV4KaIoLKAAAABrEGa8EmoQWyZTAgj\n", - 
"//61KoAcdmk2P6doyaR4wEHxsIcmssCD5f+3/v8PGtlbWZ+A0oGGFPTAdgmU2TFbrRxlmwUCouNe\n", - "8freV7blHDodFImzwP3saA3AZT6NUl7vDGH/tw5n9y8rP4XGnhEXBHK+6jIhoAYc6G1CDX0mqczJ\n", - "7tbei5I0YSkDjza4rJSbAF6cRoJQH3s2Q+ggBQR0BfH6N3QlPVwd9YFvP6++J+XrbNU56Pxu6Wey\n", - "51asar4AaARXHregTXL4xn/VNt8Ppk2xD3/1jXAVXdqMlS0tYGM/TtrcuTC63Lx21RQtklG6k0xA\n", - "eWm6W0oL0KTvxuyegpC2ySp5v6zpSEYvzWR4IYirfT0RYU+jLtX0t4M/L/0k8xOLTHbouoUPD6DN\n", - "dYYLYlVX5noJzjCAVCiS21OCcIKqWD/YiU/+dTZpdFFNdHEa/MPvUEq7cJD7ANJ0YUweepq2Eqdh\n", - "57SC4Tpg6jyEnFgMaHQLSz1nJNh4lxM1TPouGZ9bmQdDr9WY+nwzRBa+ZLnaqBSYKWSKEs/TNtNZ\n", - "ev7d+EnJUf9G9CAmmiSDlRAvAAAAz0GfDkUVLDv/AGU2nAwHHyQlvUxuENDSO8vXFIAPilnMlQWb\n", - "nTHwb8wkIo6JKOaIP9blrrNXcWeeQDVprB1Bn//+nbSDHls1apJcUyMHUmojA58P91gutTiF40zp\n", - "fDaF096G01gcvpH5Za4+DfUvxQpt/wH5PntJzggww1tLhP1NyH5U2TTgrnA/BevK2aCa9xCuCVgA\n", - "JJZF4uqHE//COeWbJ6LIFJPoadxAxbrAcxPQQHMzEG5G5S3Yfd+YJBLrdO35JvVrsUTYO4AfvJeC\n", - "zwAAAe8Bny10Q38Aj03WPPyvISnWAC7KM5WfLH925SBeAKcvJaYOa5WZCzX9H5nU/7qAFTCgAnl3\n", - "rAoSnKk1337XDAnLfPYAAOSIcqQwF++e4HouwNVAWCEsVyl7Y6DnBaBT2mD1H8560KoMvm3kKNNC\n", - "oxFCc4BdAIXk45JUbGFNGYAjCbBbJInMjwa41HA404yKnJG7rNXdBctnsSL/36UoXvVx3J2tGX84\n", - "+FHk7e72CsAyB49ajd62idmFQji9Jj1GaiqtCIjWs5o6Mz8s5QfrvipNYYD0YZ7gBBGm4AEz17d8\n", - "isscgsp4QI2odbuEJDq1nfJbW6+1HGcN1XfDC1Xfa5IptM5UYHm5zIT4rSPBIDE6l8/NhVxlFP21\n", - "JPQ0DZxnZFvxIBznQbqkhaGZjMafgFoRzC9Nl17x+K6e75RlplRZtXaUIbjAUFBJIQPkoIrT6/O9\n", - "NtkAmnl8qqUC1RktW/RjiJqOyRTTITHqNKvKy/0gb88xEvvGPgzcSs2KpkbHJWmCGIlSWEkuqcCE\n", - "jBn3Y8XOQxMUxEYeLPJ/9s/F2fT5NAnko+RFlv75fWLekZZP2s17yJ5ccFGhZyrkGX6u7xXK7N8G\n", - "Qlz8qfOHvgMQrlB8p4j7qtnPgBPf8mcsM295CuAZxkK+sut074W+0hM24VMAAADaAZ8vakN/AI9G\n", - "UrhSy/Rrhc/LGXguupji5cAHC2DVoxU1gWUkKeMT366GcmuxH5O8lBZJeHl8r2KNT0EaVARyW7pN\n", - "L4uNsKKl/WAzLJ1OZWTQf4NaAfodQGO9KzZS0j6oGvr/urKiQwbP44Tv//glYQyyCFeq+8nnrHBj\n", - "aACu2w1otySh0DYMX412uY6EYcx3GtQaRpNPiKQniWdVV2KH48fVxDy0uLS0SmCZEAWLVNvtWqO+\n", - "q2OwCBr1m50s0i8eRTlSP9xoKtxWC4ZqL77eAW3kYEBJOAywYUAAAAH6QZs0SahBbJlMCCP//rUq\n", - "gBY3NzYDjVIwwAKbp/vtZn3NtK6t0V/4sA0MV4ijJVoTZ+e36T0E9eQ0LOyzsqR0ULZJUDRy41oM\n", - "RdsBwM4wyEJC67daWmuDEXKhZo862uqAH8A0QJ5u5RKBPFpngChYYJdWzP3onEWImG8Yryy/SXt0\n", - "jQ5te76AagLius72bzwZ4AZfLm/04ID6oXhPwqkf1cNsu4/kIt7oCOETiL+lzwHLEnEsdPSz3DxD\n", - "uLGkH8o6jHofDxEXcB6cOS43aUxGKPYPtHCj2gw6RzcRoX5lD5mwqtoCTxk6N8TxyipSUyNnbA2b\n", - "G5NuBUVLHTce3QKY3SdkbyH/wzdOpT3YHUE+FYQwMKCF6SMyMBxp2gI9k4yUZYljUiekF2XIFkfv\n", - "TFy1RUmikOycLKkTYTreTarsMD5JfjZ2FJWrroj/YX+uNeGtKNZl9Zyt+k8u4Htq1bPYEjCrLHds\n", - "qeIuFWmvxTYEQblStjDXmWfITtxy8KvOgn9iV+KlidrnVhlE7Dz30fuHXxxFZvIzhgU9uv6sSC7T\n", - "vZuGMsKGBGTYmSe0P9hLI2VyM/8GUWwG/AITiU4a7OVDjUNRPaiIEt8jt2oImPIY8qcrJ82CVd+P\n", - "mSjoppoeHUTHmeo+koGqjhwT7ueVHNT5VZ4yuGKEDdFfEIkAAAEMQZ9SRRUsO/8AYrbCELHs5dcg\n", - "AyOPuRHZUWtdXLx9XaNQixO/8Cc4Q2MgEa/wKETsHiR8C1XOv7rI3JB0rg46JfjEArbHaTHmANKo\n", - "+czcI/sIduYNFOE3TvObMh/KtGpZSdF+qnDDtY8zD+7RQUdzmkG5zeDj3u4Vq+f3qnKCwgbU+U0R\n", - "dQR9Q60wXqL03p/iYVxkI8jJqvkECuxT7efJI+5rmzyP1yn+WKY2EsjjB7bwwVfe6RxBmzR9Ed/9\n", - "CA95ILUJxNg4HsmCO2Ko+MqZAH3wMlG18kUm2ogL3cKIkVXogjofyKhbsSpKLpFFk71DzB6NrY/3\n", - "HfknWM2yn9yeQB/joufGEf/bvMAS8QAAAN4Bn3F0Q38Ado97WJWiqN4XS53kTA5YWsnJBdebpf+9\n", - "lcN5zPySAC6fH/XzBsBKbxdm4pTiPFVrmGXyhaRiB6dxtlwj8MyI40Do8AXHq41BAunk4K4PTgzR\n", - "rFycWqaL549wB2C5jNCLXlq6Tuytik3ijlMSkx9noeIG2Lc83eWkRkQieksQSO4xI1tzzkdqaNhG\n", - "ExZARu3MauZwrBopslb/ZLdR5ZS0G6p8o9DD5cphJjxJoSV/70/0Gr+woS8Zj0JpVvvpygE5bXQp\n", - "/YBCqjmq4uOCyt9SvCzPelUEwXEAAAGyAZ9zakN/AHZ6+HiwE6fxvgA5rqP9zmI+FShvhJS43N4N\n", - 
"sc5a7qq0DK7DHadXkQxf+APmeqLrIGM9X5aCQgeyxdoAlcQoyNsm6ol85w5z6JV8A3YntmCae+s8\n", - "+8/Yheg1ctJWrSharoeypUyemQeq9Rm5cIkSOS9Ej0hbIHyFhPQW6K3SawgMNVKQ0s1BpJvXDQSY\n", - "x3jIEdIgEtwe7zce/DjcO3RNN3g+SlPoM7cl0qJbM44NIDG9JGXcwVrY/YKNrpChX0yegP2ZHDI1\n", - "MzOs5eWP/2l5loJrLid2mK4Qhw6EGFrIadsV8rSjzgHRNuzJ4U3JdubidEobU0ehkU0P6MYRK/XM\n", - "58mVywGbsw6LPu56h1S4w3zHGYMd1zPKOsnCUhaRfrSZTxvjerNQ22prVPqBstk4JgHdnSScrwGw\n", - "eQcqvIw7gKhonPDKM4fJtO4n2EsI5Cd0iGMjmgPw/PU3FL8ZP3QbYLMwZ81Wd7BLLBDf+ngKiFIe\n", - "it4neyhhaE/a71b8TxeM/ZrgH9+D76dlgPI1ZJW6CCVyIs6Y5gK2plkcgRYa0MwWF+1A6zPtBEgA\n", - "LOAAAAIIQZt4SahBbJlMCCP//rUqgBY9we30eRuAA2kMf/9/gX2SHKs8Uq31+W7Vx4LugxILnhMT\n", - "6icG5WQzdpL8yjIXjBq99nVaYweUdJE3LrdOpsVxNJ3kODVBkposYOoRuOMi/SNhcjrJwShp6ljG\n", - "Qs7tSeRJSYDkvm+SI2ckjbManbEesw6wo2ZffuryaLuWkU9SNALC+2QbPJD4bFy7sTmB9+6VOdMm\n", - "rnLvYN4ZyAJz7OhQG85P+JnxdgXgvSv66sWBs05p3vOE+53H+HQCMTLVgvoYmHNTIYtZ5CIln4hA\n", - "GrjLg53unVVQTiYlSzZrRE2vmtsqac+v6CrcbtgC4HktflvPTsvgqWNHri9NWa+EuXgx/AgGkZVJ\n", - "r1n6gAd3jtjLtv6YvbPiBBo2AhBUxCbYyroAjcvjwUBtRjXTdDEvdYfItmTKA7W3+KvVi/PCtod6\n", - "/3gOoaA7zRdO+8+MHlGl/c2xzQhj2O1n8eJkOu+NcsBkpmxyosDi11EOEaiQ6vfnOvH9MSM+7D/v\n", - "k91SLlwv/nF+5eDPHSLZQIoFUjHjwVoSGCdOLqmIe6tsfTERCeAhC+1bhRhe0612KIL6izjolsR2\n", - "nUgrl1o39HqnKAVqQ/HguEezLTgmGW27Df2kp4E1wRl/EQgEcsMfBPga1ndY4uHPYq84ArNCWk+c\n", - "YwxlHAPVC3PK3Zp2kQAAAWFBn5ZFFSw7/wBXFVHDEfqz5TAg6AmqzzGCl9B1ICKhB+tKz4Y9Km1L\n", - "/vZyZ1OR5rO815FlrTgGoncUDKVNjpKrVerCm+HleHb1b4FhYQG8B61zGq10uLuoQHIyL4Cv2/mm\n", - "s5Mi7ZftErBt64oWYphUyh0Hmn9dYYheGFzLdE9gvqcAEGJDyLZq+nfiK0Px8pHIgaIfsEdSUYcC\n", - "8Otyxta0EKY+Dm2m8AtQ8jjuDmkSHm/uLhgf1uCnztOKFhkR+ydRCeR9tnIlTfiv3gJbsPT8swjP\n", - "0OUm6yT8LhwwCJU0AGI9hN0/kTkz+NeSHjSPaBx26MAfS2Y5NEtva844h4B/RttjqxMsNDiDrfB4\n", - "5xn/Cl/3XrcF40eivyUSC+FHzx3M4BoLQLOKf7iz8hKiUrqRGVkGToUMxkr5192x9xCjbuvLRMd8\n", - "9Pel4WIOhSi52xuSf1eEhC5VVAp4lHpZmHCbgAAAAaABn7V0Q38AdnTaV3jxqK844c19uepGJJSA\n", - "C7DQuTz6pWfCzxcMbX5JwHItpyM9y3YT46z61a7h5Lyukp+nSKoO0zQhT0EB/u6ILUCNvVbb/89X\n", - "7TVI5UN6EFwYYfi4uoFmqb+5Cd0J/+d2405yTsK/f6WH/T+vNB1DYWrW67ctgHOgMHAWDLG9mitl\n", - "16bXmPVSi2sWzpWYg3147nlnaD00aZHqQlrMPzYTLLFwWHOLNqCoWpNLMMEevc8AnQWeykk9VNTU\n", - "NXzAXhrKDXl1tLQTxZG7GX3K9cQyeUnjfH3rMBGDD2zCLGXrMfPVl9EJ/F5M49Rjn38sXUf2JvF8\n", - "D9r9tV1APCHN27+egfFIMDg9OhrQMtjAe3WEfpYS7pl5yHh7ZZ2CedEo/Wf/ygYTAQFI72AaUTrV\n", - "n47d9OSqAdYs7lkgV0864auRyPQeTKK1Sp3ADeIFS134VGBNG1VnrfyZuznYkI2r0FVkGFrAXpUu\n", - "ZJmyKqqILhJ1OTBM8C0VBV2QXBYa2aSn2jj9t40/wJJWc9IGAVR0vj/u+wFocjwf4QAAAZYBn7dq\n", - "Q38AeUc/pR5QUuADgu7/kKjYlIf8yn+MfKKvFMJ4eRJz/DRqteBIBJsZW3T3phi3NzuSw0zOvEhr\n", - "CHz7xEUteyaR+fa6YCBeiCtangbUerW/UGoCobzV/74XB/lXH53NcEw+6x9o3/ZgwG/7l4psK3P0\n", - "EqSwtCrcKAAv8Wi0Z88mFp3Sp19shMF41mqYa8pNsyefrruQONS60LHg/1GySbrTeTWW74lCDwnt\n", - "BGXpwghp/QF087PP7hxkE8lvu8APh5F1FTiOCBSvJFm6yFC/tz24gmveLoV4Rq/qtYWRE09VDCDH\n", - "yjftToPMsyi4DoCtXsPRk5Jxr9Mn6xDxGjfz8uMmOKJ15ejPi/Sx9cR1QrBsU9dhcYifdB+c0AMF\n", - "PolB3N4pBZAASP6m7EzaTer6yZ2sIKcQdlGt9xsZ0SHtS2313gpdJkLEVrHpO5/BTcfUTTcK1+bC\n", - "PwRYX+iIyInP1m6htprdy84ySZ5IaGCpRKFxMCf5w22wXyyon+dlMPKACguyEPTCCZQ2MqEuC+sa\n", - "uB/hAAABxUGbvEmoQWyZTAgj//61KoAXgR9s4tVmwJ9HTza3s57iAAoQf/wjqzjlXnP+29f12EfR\n", - "S7B+4I2epG2qM/uoQ7VlrfXFlhjyX/aTq0n55QXAKa2xUKolKsuMfmZFFc6+GP96b13JiSidvPgt\n", - "2SSGnq9Yw4MfceFmgOaZRcwoMnpdb0UpI73YdP+DfypKyrkDqKWcBc/BGhrH8+XdnpCNDXfg5rMl\n", - "b0uFlQ11yUxnDYOfRwLbdjJA6FYddawSEVorFtY7jkSQx+OUBUgWkKC9rhKB+uV/yqQsvbuFiyYV\n", - "MviBpsZgSSN0TOC5JedQ5H38ENVBLjXnWZD9PQyueLoT4qwtI+7lodFSnBG3zboWdj6P7XDbgKT/\n", - 
"zKkFObUjwhstiQtohzxd5AXhBH3DQqNv6mRzuMxFDcTEo5ut/0/1HrPGOF4R3sJ/eQT+YnYseqvc\n", - "0m5njpgI3qkLmn8efBB4q3zWGpHCxBwC84HKjuugMICuXfcJHKn0aWkn65aEjT8AdxDWE09InGyo\n", - "EM1wsU0JgJ/qq/6MdHWfQW6+bt5xWlpYJ4axi9wZc3Aoz+Rixn8UVM2e/bd31+W37ucz9udquxnL\n", - "2JdNUAAAARlBn9pFFSw7/wBZVXkLa/7xg9HEtDOpc+GkSv0gCD3x6eQNkROUaCyL6QH8m/0USPLW\n", - "nllgC+uXg2X8kUpaUiErsLvwKd9y+trtKwV7xlvkAn0JqEnToCvptE1Sb8eF86DTi2ywy7WE/imn\n", - "jNBYQny1cV38ScnZp/V3phWQAYBG3kUdNNuj/FyVB7DgbQbTLK48AO5nLYv8B3LvBNBfBJ+ym1yg\n", - "YJXKwjm8kt8xUjO2UGKeggZOs7YHWr5Fj8OX4jV/B3/cMzP+f6YyrayA/80F6f9vgrbTlhWdlFQ8\n", - "QtrHKjmrl874OSSPJYH5wfQfF/1NrQd6soxjmSWYI9/FqOPoy6ujUPxQvg1fUda+wK31Cv8gD96H\n", - "LPqpgQAAAXkBn/l0Q38AeBaU9hYCjxV6lA176iBcJKIHTfhwkqkAB+a0LmdvcgdK3vyEsSkCI+8U\n", - "up3OQ4OQId/B45+Mf5P4Fc2VsfnQAACxyzNkvgEEYwZk+TyOR6/VZmeFNYMrBdqc2NNBlh56ISK/\n", - "h5V9lagvsX7yv0p9Hk6RXo3uoMgKhKOv/QgBAqhUvAKDw4DS7G31tehd/myRMmCPxIJ79bZsQe2/\n", - "iq7Nquzc/VDpPXFZHPvOmiyfyrt6Fxc2jLHZJGpvacPTIeLJiSaBxgRTEKBr/xXaKQjc5nLhlwgc\n", - "HSz1WRlyOsXOkob3rY8KoGVETaaIvHEl7sVHsV3QN7iR2rIGzf6YHv+c3l8OW1b7tAMShtcCLifl\n", - "8k1OtS8Z5o7MNTObuLXIONSPGo1fC97qRzqHFEfMZntEMqsFjjWPM6JduvRiAv8p/h0kRdcTeRox\n", - "t4PEdFJikYgCJgtFa00LDpNvd6Vv6MImiivCAgL9L7zEaNCr8p/p5ZiDugAAAO8Bn/tqQ38AfAnX\n", - "r+Rl0wYAC9kEZglKr0YEZPxbFiynbDVLyUoB5/4mwbggJCKqWcWLXkOc702XkfuMANGy7OD7QUCV\n", - "nopFHkp77AuzGvvM2JQndhYVkdbX30/kmHQDID1DcpthKQBbzUjm7wgAOqbulxKDc1OUw1plN1OA\n", - "iXs8Ju+zQDtZelKPfekDEF5iPA8IQMn3LLocZ168PVHW73hdmgfMFTsqduJxZ1oiezDuUBPUKdNQ\n", - "1lGg5KUsS5A9iNuo+n1shJKCmk20FfXGeNEywAjYeaq4bao/dd8nZn//htlIayY083IymAgdHbKW\n", - "UQAAAW1Bm/5JqEFsmUwUTBH//rUqgBbB5O6qXkABRezeefAxp9PjwxeDBuTTFSUNk2voPSz0T3Lj\n", - "1K/LmQtEI6YkskJKgxvIXHGf8LHTV/h2Mg/qV3IQ4zvBygOQs98iZyR5jgV+hQ58R6xIcus/6y5a\n", - "HrkViRrv8Sk7So3LYWmfkLzyR6vcCKhF/sCJsY8RS8BK5OOGU2Ll4Qs1n4jPQwTLDELf8SF2+07z\n", - "zB5hexERnOHmWZ9THKXS8j6NXPrj2p32k0gvmlI4b/Of9evEX9mDBp5GtQHOvTswQ/VYUajAUXz4\n", - "5w6EHuB/k+FBz9pe+B69syJ2X5MYn7Qi9rKpCl2kZv4uAWXuNo7oIaU7hr6elcFz53tdL9AEjCAb\n", - "BlT3p448134hjvo9lj95CHF5teK1w+R310Gc3NQ0eeJcsiYD2EoVrHHjVDF/m8I8JtTUFdJ3xm+G\n", - "muADOcIpcqYbeqyKWwHmgvRze+DMQbkLo4AlgQAAAR4Bnh1qQ38AfBSmnoPKZzTuFWeZOcrkeWeU\n", - "yVIALsozlefbqRZf6f7w7fkPoFSkdlxkJJsnO6qzfbc/Kotbm2yeFrIQw5yspszQL8gAAvMHKSnw\n", - "f4CTQ2vfLY55MADj1baDD7LZtn0UK1Eh1HnwXobc+mdHd/JEl/a2Tszf/EZ9+J7oMl+BYsjWKwNY\n", - "vOv5flnnPLcex/hWFIF4n+hpBybvasl5hI9mV0CeAAyAclftj8N9n7hadcpM/TOVmHbSkJ3cr/k+\n", - "StSwI8gY9k3tmbMSZc42caMpFr6YdNCCIj52zmNBccPNFxW+UT/4qCqtX1gc2j7obKDaWzC1yj1A\n", - "td8/VAjqVn+FzuuEokhhvubRT3RCdxeWnBTCG0CxwC7gAAACMkGaAknhClJlMCCP//61KoAXgkIw\n", - "VJpvAgAqN7f+5rJJcY8tkjj7p4LozjswOy2dTydK33mOBGS+NojRzBOlwt3ro+/vdQIUTIVrXKwh\n", - "2SrHPCPJXQoCjJUPkRODCmqbZeBHsv1r7iIOZPpX66HYYhWgPLvPzAb/Nqu9nQqKoyphhNy32+S5\n", - "qAFvjRKLSjPAx7GoKGUNMbYduhsBsrvVTwhrV8uWAls2mxYggJzVuRUZSL9cSt+tjl44BXjlbo1a\n", - "I7ybNHG97GCzcbSNcg0RA+iqwDsdnrZCO0zsNdWK1qVmER0PsSf0dicSrZwIcxZWy6JbkwQn5TnO\n", - "kAah3wAs6pJvW+a5ZiJHl6sVlU3yCOlrECAESqWu0YR75WfiMXgesBOuXGGNsC3icmPYNzM93us1\n", - "7GQTI6RmmFHGo+B2yAB2YJiK1YN/T0ltUuXfFAvL4UdHgEXOVIqVj+S+YpITMKy740IvYQ5zuZPD\n", - "ahdXF7HIU7xE0W12w+6qkuyZwxUMXLXdgx6svudMor1GNfDCdymcKIidhuuXh7vdQrgbivH7usVC\n", - "zjMqgjGahkW1YlmytCooEIoULx5ux9DK360iAi4u/nAomESdiosanRfQ9jQdJSpo4rurLfeCLF1Z\n", - "XsQAQRTcezHlxp1tz3A3WsYMA9urPBB8pUlDdB63MfZDCBphVx/Ddv1AMvPXFEPu18oREsV3BdKx\n", - "e3lxLWWpytzF3zXttYGgBb90j9DgRGE1uaAWyEAAAAEiQZ4gRTRMO/8AWVV6uU/hFqUNYqrP23yu\n", - "FpB+ECoAQNVnJ92i7ZF1i7u1D6K4L4gxm2RaiGsRDmf2iYWEjO8yGHAqwpcDep1/+H221WMh98AE\n", - 
"VV9Ferf+hy0D7Zu5rX4Hp3s1TpcNcEBIKPHVSHIzaZKKfPXkqE/ga/eepp8Bzdc39OW6g91hVVvf\n", - "WJxrnf77rapWbmivuJFfeO9u+RRykk/agdEi5E/5a475KGQprA2yl390PNrCvoamPyXbETwtbYAQ\n", - "pF9uDZkHdN/NQ1P4rz+zQLJx21eQsP9WBLswpDFYg9BjPw+3VrVEzeid2j5wJBlq+56Hw+Ex6fI6\n", - "1O0GbWSAC5/5Zg+kGX0Yx7/We9PseMWGwXWIVwqI7oHPEnK6wUkAAADgAZ5fdEN/AHk02mburIzA\n", - "1V5U+8CauxZABexQ9zxvy3GIkNn2+19EyZqnRm0DMMsXP4ZwiY8vW/qdBTlATfbmIFDxCTzt76+L\n", - "X3WaNfG+rqTfzj6gLFFHl5IJDtQmIC9KAmTgQM0Lp8TEDdYJnPYGFybq0Xdyl74+130DteV0SYTD\n", - "hgB6230zJvCx8ZW04pZHmYvtJ1LZAxF3BAWKPXcstkh7/Er8zYdPblR7K6t0r3b/sIHpME53VRBk\n", - "ggj1uN/p+iN4KwToxjP8kZ1opB7xpkyOQpicygiGnwjU7EpZpywAAAF2AZ5BakN/AIdka2Wer/IA\n", - "EJVZr+9KNmiS7zXHA/5uJU6D0CbJOrsLPWcfwAUCZZjhlCsnAlgzrrGOONmuxU3En1TfTKb/7Pu5\n", - "1R8PfIYkV/dZFitvMyRPMvzwXX1OcxtjbhM+M0LCh6zNEWJFi2Pi95t8cspIknD4iXNUblA3oEFp\n", - "VGuXt+8S3Upf64YqAxWADhb5zxXL+O/gnWiyawM9fyRrYcExecMkEiv5MHRsJs8Euzdps1vwxzNA\n", - "Zu4bu6ic2K2ueNja78qXGaHz7xLoPIVJv/T4KAuseyOhznfFtKf0Ey0eSBVK9qutGGF83lfe5Wtv\n", - "xb73lHTKLAyiyJassoDHBSQLAcUPb4nB6xWNr9G9gWtqEIp4Or9tKJzZIZ1tnIKZFZGb0ELAlV2+\n", - "pKKDz5nW+syHi871Soc3HtgomT3Y1cp83yQG1GdKkcJPkU1uJVzsVPzbXbSU7/z2Q7cikc4seN2D\n", - "ryQ1l58HjUs0ikCXV/V/CDkAAAH6QZpGSahBaJlMCCP//rUqgBbmS0XBN5gNQAaCJTjyhVwVkMwl\n", - "GF6KXnd0XUyzqjFCJEv0D2xQiJu8if6sKo6qHl+BP/MZw8ss5OKq407INzCjWOsjf2HTKyC5fNLK\n", - "wiJv+PzieOozn64ZK7RRud2QUaDe0kuhk4uCClSYQBImrxmWeEf/X9zH3+ilYhfoZigVm0IoMiuu\n", - "YX1ERVdg0Ld9E6wxbYMiQAGJU1qeeTwc8vb3w3kiJheTA2PNXtrJ98RwtpnhN6QxMe1dw+aQWI7S\n", - "j0oQ9iNx73N93RuNVRxXj/57S9VltjA0RTZBjLvYS81QDA3fBgaNHNzOBZ7dztz/rTxxOpumjTTw\n", - "x9FgnvlMsjx7FYPKUcXD5quVKd8lwTlOiGVI7X1HEv3Hh4EvpYVt6azhUBI1qGunVb3X1lyMhWJ9\n", - "p3muqcicwInEt+BuHY92HoNXaaJJbbQmNX5s3QJbI28Pg4gc2gaUF4SQRcBgM8uwcYUzxEkBS06L\n", - "0moZm8bwMsLYCLj3fgXOyFudpfg6jkYPDeVK811WbzEz8Hcd42XVL0EwE3bwDc+i2I4+NERo6J6l\n", - "d4d7nOIvqUuorZnDPtlYcfSWgBqdP0tQHvFb4Sv9QUCBvXlH2IEiNzo/daaHVtbFRNZ3cag2HOiP\n", - "lMxyt8xYJMnG7di2JiwAAAD7QZ5kRREsO/8AVwwP3fRRACC0tQoY45xe6yfL8KMHlR1wbd4HcPUC\n", - "+4PcnqOzdoNv80ufRyOopFYryJahX+qWFUVKK+nDtdvegTv/PqvENcT8ykEwwQ7z2oNUdaMITYi5\n", - "4tC5YA9FaLSBorMGx3aocAbiF8065MBqyaTkiW7FtGRHVSPubGixAl7hiQRoBoEipfCxkE/EBoII\n", - "omSCNrFRyjd8oY66cDfZt+iBI44uLDeP6eHMEpBALsV0FY7iWjBLaYO1t2PsklOb93SAExoyIX1I\n", - "TiPXiUgrCYe7dgepAF31BCnOuxiIAPWKLDHZLhGOJBLqdemk1EZoKCEAAAE5AZ6DdEN/AIteG4cJ\n", - "hGXgWAAHNd3/IaNiUh/zKhTXYgf+UKkbUvWJoLo7whMXByWkvy3MotNcPaSHeaKS5vKy/hBJIgk5\n", - "CWcdsbd5QzFHyjOIZiaEAA1AziqRPTDRRVYKhcrm181rAlAdaYmvKZAOu92pmI39/PSQjhiMouSe\n", - "XVT3pg0s+/zN7WMQCHqTmey2TTctwD0YnAH9CK4EMAw1jPCCTXgop9epuL/iXjup2S+LS3pGE3iO\n", - "oIHon+1ERGRC2Vp3b2QAstSXzK/2zI+bVnxf0PhgKqa/NeuEaF2SBGZ/TyqGPDnQfJRorCp1s+mw\n", - "tm/3aVbjKRTXeSwl+OCfF6rMqjf/Zw8/4yrjLNmiyOgD8OWqATkM50NFqOShrrTCaHdcxgVW70ss\n", - "cCXKxvzAUCe+4nK4C3zP8QAAAWMBnoVqQ38Ai2Rc7ISR6q0L0pberS7nbElvP1eAuajd6ehFPCEk\n", - "va4007gA4DkP0YAYAumNCN0kma3A2DvFPa+NTDmrilkXNhiNVTFRLzynsy8rdgQPBH6k5DFr/4eZ\n", - "jmJjfYPWB5+2eEYYc9uJ5Ni70hsVFfV+T8zp+ZkLZnd2wv7AZ7A8baF9R5O9oQlCkoVPxkDHTrmt\n", - "rElQhX8Fi0yj2+BVP5O9UNPGQU0+M3KYUTg9yTBG2cCw6Drt49/5M/86NN03F5R9JS9KGOfJjIlA\n", - "koCavGpTFqq7OYU0RM3ilfXBmxvL5QoIK28Uvs71J3h/IvKmg4v/14n3/eoSpqNUCC77ty2SgAAi\n", - "rxQNIHz2GF/lpTynlwsORrYNT1lJMVud8AAQb+/SaHWQXmhJ+8cZTt8XuMgG/t/hdF6GqyG0A/Pn\n", - "hWRq+asN+zBaeyQUWZrjl8ry0h3WPkAZksFb/gV7ABWxAAAB/0GaikmoQWyZTAgj//61KoAWw9mB\n", - "34Nmlq4DQoTYIkneVdOFHxDDrFwsv7yxZXXwNkGuLMduj7QGT/7lr2bNfzApMJfo9/ffM5g789Cz\n", - "1Mn0zxePHMHBL6IHHRVXWyqDMhVLYnQ9xFtc1jml18If/8STBCOf+AZjMnARcFmX1IwLt/ziVSoN\n", - 
"e4GPKKZqfZWytoW7461OuaeZ9dvtxrCL+W45zobgR5vOrVM+Opl+w/eFlupHlgpQBWgJcPy8sZC4\n", - "/O9laiYA63xx6M701UUvGFsRI+RM6anXyjKc7TVrmZ/YQKRjqB6Mejs2G1mTDkBn7T2ZURI2vZ3u\n", - "VXRNsQnGYDxRUokS3YRHs9LEF/gxKSdLEEiHDqcoIHyS2FPM+cIJRSvB7sxIA3hgfN/O4qDK6VO+\n", - "t71oi1H0Bkz1ugONnVTpQr+WeMS5AtXXNBMXU+ycO0+R9eRe9BwSk0V6tHm/HJ45oIYvyWTj3yZa\n", - "JQ6q+o4isbf26PsTbuSAcvQoMnzEXJkqElGJ8Z3rZtdkIzQW0DDnXeNRbj2wQmuUNBknMsWOw2/t\n", - "fD8BErzYLXI65PwTY+6R5c6RWYzF9HNMLBaO1c6cI4yEu1DMKtZW5FrmVuc6hg7VnWxgAgOdFKFA\n", - "QvmmcrbHsqCH4rkez1y5GoMlxeOuW5WKa/JdcefAflYgakEAAAEQQZ6oRRUsO/8AZUEtmg0dqwLy\n", - "ubLYtABfXw0ri+bvSnwBqWW9hB3/jYP94x5LyZNY560IvuBe5T4EX3/71Gbqj7BS5SJLQ7X1JK0z\n", - "I9iR6McwRU2BDEhu+2JQm1RA2fBVxnzCyNr1JVnfyyuumlkNzE8n1UgnkIbS/FMxc8DghB7zqZzK\n", - "rkagW0hHwSjNf+LJf3DnbXyvnzmB1lcv8Z9QlsnPKDef2giSgbZeTNWRMfeu91kckRy0SSKkaYVK\n", - "KUUpf450Vl2TzPLRaNhk7Du1IJzIJRf9supxssXD9v31LAVibgyznyLU/cS57Vr8KEXG+WpKysV+\n", - "6iQmQ/hCoRg82drzuniAPltxm8MMUZwVMGAAAAEzAZ7HdEN/AHUKF3WsfCAA7NAZyuGlRySXJzA8\n", - "WtPYIqCp+udF6BaVoG3w794kSqeP3syNbVlr+uFhruNMOOzTsNGrbATFZMl9DU6mhIXZ1HEAskmI\n", - "VVSgXlz4sVX35JqYrDPP8r9Bsg/O9tAp7LnTMjWlqOdgOPhHpyqf/hmokPsCwqtKfsDhxP/tmX60\n", - "fhM4KsfvpygzK8jmUmY/GDBCISRQeW6U8uaq8guf+cvy+sP09JLJ4HsULhIsm6kyYO04HBdOFUDr\n", - "/8IzlOKX3w/FCxhimlJIduY8iySAFQmALOuag1Ry1Z3p7NpGIGhZp/q5hzsMAsH2jpHXQPdtFNFH\n", - "4VkqDlRDeGqieCr6gwu3hPQQfF9yauq4qf5R+bfPha9tZ3XjpRO4eqNaj2xEQrcb5cIJOAAAAUsB\n", - "nslqQ38Aj1e+ZhXsJE07lvgA5ryx/X3Tt1hQ2T/wP93u+Km2fQtCsS47kHT/v+BMMbdxEWzwYvcd\n", - "d3NYalS7o/aUthPBRfYGmx2hUIQijLOXN4leC3SONeoCputIRor3Lgsy985K8UL4nvf1+pFmRQg0\n", - "eJgJ9ubt7jVqU4S6enDDZ82+hYwxDWOROomkxsOv8nlizRgAHHE1n42Dq5sLIu8oVYp/4M1h4rCy\n", - "m7AmDrR9dbHlpV6pqPLshIJSKr7R6XCF5H/mgt+78ttEoS2XxbrmVQj6DQtTzcYF1gqzE9DaiXTc\n", - "rKcf1aBAFclenBiNHhbAMEE20Br4FIkr51a0ynzJocMgaUhstOH+7gKJGCsTPkykOiVzQeIGOfi6\n", - "AmLkbzIds0NOnV21ExFbxIFAMu1BymG8Kjwvo1cLb7372R2f+Qt5Z8LjmGrBAAABxUGazkmoQWyZ\n", - "TAgj//61KoAWP/AeMmkxh4qDG8hcZFMZjYIY//v8PGtlbWZ+A0oGGFPTAdgmU2TFbrR0QmwUCouN\n", - "e8fq+V7LhZ4IhSGjAEZXRALCc6lvXQaVk4Hy29vGup69bTfpCSIWWGXFW7WfQjL50GRbZZRZHQ2m\n", - "pjAJ2N9/bloCCNQEfrVxCeDkKfJqKlRpIdnOUaiQpsnEysqkLqMfxaCLAtiv1vFXcLPLizzlMPs7\n", - "NIiiAuhD4+CMokPsODEut5yq6fM1zRym2P9iids6rfyvN0EtWlvUXkAIdmS8HfE5DlX5rtipWZ2i\n", - "d9rb+tQcwCfWN6erokI6tARQJu2c+ZSF/sI7qofDkfNVCHii2Msza0cnJEbLkEfdF+gBET2KrdRv\n", - "E5mgO+6ICEAI6O/h7r7DxvTQ9Wxzo3mHNo6898yojVZYUAEyiEUBn5+alz6XfA0d5GcOXFRjv906\n", - "SVSt5h/ZyjXd+HmcrubYPlDuxhjCrkqyrKcbhfJHp/Mq+DI065H9OXdNO/+uDSHvPcKkibqiAVhI\n", - "DqTA+NZM5+PbtXMsqU6iKpSzqr3AN5mBITP84n9JoTkmCR2U/+5h8eajZc3UcAAAAOdBnuxFFSw7\n", - "/wBlSP3uCsGGoV8bqfG+TF6JTvUuRSAD4pZzJUFnxrFOJYnshFJtjPOw7rAcguf7FPJIlPqbN5qs\n", - "fqCPl7TU74m2w4/OJHMnDpS1+crxo620hZORUqqaN/UeMSuSm/KKx2/MSsIgkvOy0fYS1MAD67Fk\n", - "Z5FUhBYQOPZatG+Xc3Icj+kvLjp5v9fX+nJsaNN4CCl0quEK1R//8eZO87p6DKKxlnRfV62uCNE9\n", - "o2MWYwf9qwHYbtyqG6I4xWPTngQnrsOmiw1Sy0bIvHiKKw6nsCsKdLVPqCFU/q5rppy8Ah4AAAIT\n", - "AZ8LdEN/AI9CIO0JMMhrV/0AB0HLuqwUdobO4BdVbPV1Ioua5WZC0IWTaPE/7qAFTCgAnl3rAoSn\n", - "Kk1336t4zGyyPYAAOSIcqQwF8zee7dn7XFk1tvgy6W/qOMTmkEiEdwceoRsnhNmrNp/TK9OoMIUg\n", - "ShyIuwXG8nP6tDCpAEYSuvpzo5kchXf9jICMUEGqQZjLulIdzbNUEecLTDRk1r3gpdToPPcXdXTM\n", - "AElxf3acmkXSo1kx4tBmKJrXm4kNQ2oDIaqLOc1dGZ+ccoProxsI+jQiCldj17rGF1/E4alcIa3L\n", - "dIofRLGOPkev2msNj9eN+tELiQktxoUq9fKnDsRx9Nbc5IkysRYA/KsIu02gpfPyisLPQwjLSjpr\n", - "jTxnZViCfPC6UCMSLVKUvso8AB0eV8Q+lldoHmqd+EeBeeJOkPU3vuU/GQacMWsLnKmVt/65Nw0r\n", - "y1AnL9+YKkDmvNgpqgQANfZvj5NhddHche/p4la1cXWhY3W/jmtWxMTkOC4tX16bao5sNwcVWRvt\n", - 
"UHjkDIOIXB+3akBV5Lzaef6YjjT1MeUeFh/FB0tOMV3Bhvdw35krP/ItZ1RF5hRCk1oYqz0ykGZW\n", - "YkciBlvCsweWM2wXwX55h7SZHtxiKM3rO4Aff+TOWGbe8hXaapPE+4wKof+j5KoQ530gP62KsQIG\n", - "BV49pf0LYkAEd7yVzO9dhYYFAAAA+QGfDWpDfwCPWoxxjdaiaFtca/OwfG9dSAC6jYuqYuZmzKSC\n", - "kzbTtnf9idy9v7frgKuFjQymibohZCHRXBQdujo9Laqcw233I4Za+//Mdf06kxHe/IBTsCsxcSfV\n", - "ksVUEdqCe9dEwWwg//4Ee8Le2gLXqz21e4jiFyBOjP5GsM1hpupcfwZtr5Mo/ou28BY4QZExXJ0H\n", - "FzCqK0jKq6c//ut1tsd+kiOyZUVGRAFVkS8bi0vvjrj3zga9Zaa6Mt7yQii43DdcrobbVIWdc0QI\n", - "3+rsc8fgmOnJ+GJGdWYzpFLd5zMjS5ofw5IMBt0GmHVcG82Z6YQkqKJHzQAAAe9BmxJJqEFsmUwI\n", - "I//+tSqAFjc3NgONUfiwAKbp/vtZn3NtK6t0V/4sA0MV4unWIJlE1N72EjQeUPmvxOpceaVXIrAK\n", - "21oMRdsBwM4wyEJDPiji6fXmMlmmsCvOtr78Aj8gA+xKnVDFjoVlH7PPNvnMo0iZJruZeFy1B4T9\n", - "/2iVnlLy1r3LZhoykeyNXqaKEANWeqYl2HjpH92g+fHSONko5D2m4SRKJwFWFllUBg2RTQ3etVYS\n", - "PdQGNCLeaZwhH8zjnIe5Vuu46VBC79Le/PF0x5A18FileZQS8Adcvcamp8leUQ9dML537b7ARaSt\n", - "9Lyu3Sdke9BouNe3+hTyxzxAi1Setn//aNMjVtdKZIT0wLvPIMCsfe3gvhpNMtez9cWJYRUO4qU0\n", - "Dlg6h/pUIog+BzidDDvn6SZ9WUgEXhGZOFeOBYowQfwTGI3ac1V8O93aTpJwa/om7scQbOrwAjjK\n", - "gaYt9yqViBt3FWYRIoJJGYqmGJkf0tLvcymA+Hyayho8kg3J33tLzi7Gkd8xVzsn0AbjvoJ9u5le\n", - "OKsB4L1kcStddnytXouu9GStBCQSRLPeb+iGeZTwQ5uYY8D5fTAcb3C6Ob+B7IWRbbytzq93Kz0y\n", - "yYvbeUq1qJCNW3/zJeXeH+8yV69x5FRyM+55j6UAAAEdQZ8wRRUsO/8AYsUcQvOGOSSADI46r94B\n", - "/W+PEO3biH5wUahFid/4E5wZcJb1S+5KPsyD0qQEL2HibG5BPsDLysut2eDJfU6ijjP6zrYmNEWR\n", - "huQfgh9NsMVuoggiphkYt9ccXxVhYHn++9K8YAnkm28Kzp0jUWHgD2VeIoDjCfJPNnBqH+CERm3s\n", - "nubUQ9LmttVf/+MNJAJgtOFW5A6IBAcBpJtd5kPS+zJ8VxzguhOiD6Pf/zfgjMDUsehmT57QUanw\n", - "gbdNgBf1mSXZw3Czfs4swXmaj+42V39PQblTRJ5hVxxBfyBMHdtD+eP+pUlQP8pBAAnf3v75+Q0T\n", - "L19oeS5dx79IIwiodA3vtFf2KOiU2gODZqY3kJGizWNAAAAA3AGfT3RDfwB2j3tYlaKo3hdLneRM\n", - "Dlhayh8NourV4B4kYRi+kgAOdUf8hAGAI5XCPTeroAwXn8G2yGEphnv3FPeZqmLNmvgLgUkPciaQ\n", - "A3x0WVLvMk+lZn6cJdklOXHEnjNKsClw6wU0RbMDBk1zQUzYb/75rZ2h0N0KqL096XGATDutyhUZ\n", - "RVkyTgfbEgHdPAmzdroStgpcOUEN4xVVZX2E+XrryGs2/tIi+iUaglsBszkGSHUeEuoEpHc8PRHH\n", - "tDc+6s5rO2oABm+Gux/PUd+4yoXEBbF4DtdMIooAAAHGAZ9RakN/AHaNgkMVTymoPnXABzXUf7nM\n", - "R8KlDfCSlxubwbY5y13VVoGV2GO0t+vExf+APmeqLrIGM9X5aCQgGSaQJX4OQoECqyNRzFZQDLhW\n", - "KA4dfYJp7oYRPF8AMOzGYqm7AO7w7FtM2J0yD1XqM3LrKYS1dGZTAzMM0YXyhFuS7+8HWwRTCnl1\n", - "B1MtLMYaA8qvJY/AATH13D2takXBcx78I1sCsI+P57X6Q2Nh62/bggQuV3uhAAN0tyrIgbNQYVBH\n", - "gFwoUmXrxaEApAv0P2E40tM9SJDDcZe8DyE7ljCyxGjQA+gKJHzTkZCCQsmlxDg5It6wsdQ6cusN\n", - "DyWnlyoq3MMo7ugMYcm1YMEY73l36Y/R5wo4wUzuNvV2tJ3rSYBCfXsVjc5o1oA8OllKUpgpBG5u\n", - "9AavXOqCqjA07sUF9WlQ9JPrhiXa9bThYRp0lNBazKKlKwsBPK9zJ1/OayuptCCUOtFLyDYWpp2k\n", - "qNXWH8r0IpnJjxnQFcNmI3LKk+rH0vqX+48vd2BUqTcJ4rwX4e+V6oU1+lJyU8fmS4Kj/iQFUx5A\n", - "ntiGKLVWwqfkoYN2YexrEPVBTpKi81wf61aU8NAxYQAAAjdBm1ZJqEFsmUwII//+tSqAFj3B7fR5\n", - "G4ADaQx//3+BfZIcqzxSrotcVc8CLm7cBBc8JifUTg3KyGbsl0UtvUGR3t77PRffuzjjVfcKeiAp\n", - "EmDpLoqmMXTQU5wmHksjapt36fasfEiGyN1dOKyOI9nT0TFFL0pzQSss7Ux5GajOaQUF29zSIoeo\n", - "7hOusjWiFyZylISVuEBU8nCgDYn9P601XpFko2u3FAuYp/svCLJOzc9W7b14FY05eVZdhfmiv0Wm\n", - "d+i5ZPIv9mhB+8Cb50V0LQeFfsyfPeAABtfp/HIPaN+amWONE9vQ2YbC1JsqKljPbi6Vrd258gHB\n", - "PNyXvESqATfkK1Gnk0AWxo7XFr5y0Ce95pJr1n6gAd91M5RV5lL/XAgE7sYG4524aA+cXAa2XPdd\n", - "1BugfbN6YGWbktwAoVIXoUq7TnrmhBrw2FHa1aE9uMJerl9x/Rs847iKP+iuBUD2VIUOVa/G9Po0\n", - "ksPo1bHVIsITIKnrhXV1NabDgHAc5kIv+PJk6IroGA19oMw2I1d4rGiaYQZE9dmK1VRARJ9VXDBJ\n", - "Vlz3aoQhCyQZvwzvxWhVA1iU1RO1TWnJsppajNeO4Vg4/b+BSviIvrSwwqmjaRr8iuCpVTgz+ZJ6\n", - "95zLiSdnoIFqQJA1Hz4YR/KIOmAfhTTnHcdDelso1m8Bx2oHlzAOiYwR4NhSSRD6EhhCU2kXf5vn\n", - 
"vYdShk1Y3/pp+Wd9yZwIwTneJB0AoI0bbmfrtbbWj1oAAAFQQZ90RRUsO/8AVxVRwqizyog1fzvw\n", - "w3oFk0s5kH60rPhj0qbUv+9nJnU5H1hbksC+yivmpdt3FAylOp/Re8NoooEKQr4q7MX/kjNCB5zj\n", - "aCmG5E3TxVGWGCYMCsdEF1I+HuXX2a3wLCwf1iqCfznNMRG46GE6nIgxc91oY/zfMduLLCzyb8AQ\n", - "b20W2eRODsXd4+7XC1RndLreJ7Km543AdL1iUo99hYdoASXjyWRNv6wvJrmyFngIDlQOrLluZf/9\n", - "T8Y21pcggXpfTtvdj+B+3lZv29AFHkL2xGPZvyL4UyVUgb3U1DWd/iySeGzlK1IbRNu7obP1czi4\n", - "Rchm1nI/pS+cSuamJbhlQHIreF0u2/zcrSGkuOpbObSfAY//5j6RVfcQovw5wL1RQN0tcA1GtFxu\n", - "ZpovaLthGUkeOPh8iV5bEpupJR1R79Ew1sEkTDugAAABwQGfk3RDfwB2dNpntdq7wHtHkfExb8Mi\n", - "4AOIW+6weDVD4WeLhja/JOA5FtORnuW7CfHWfWrXcPJWyNJJfpx2maEKeggtR3RVEAdA1a1truYO\n", - "N3PBvt2C5hri51AyWveiUQtRNh8OhcT8b+NVPo5dLHlfN2wr8ZipKDuUP3k1md+EiPqVCrK5TuMQ\n", - "knvfHHEV8fXqrrFiHhWYrAGbSJdOrXgrQTN4JDv0LMwXs1Nl1nmEdfSgT5BF3DohYi4r2xGfiJcJ\n", - "KMZ1oPHaRBjgxhu40ZP5HqUG5rQWHD92UCH/Terh0cf4e0554mxHgDF9CBXD2Ey6LaV8LB9Jb9nA\n", - "f7tFFMQRIVaLiP+uig+B5OoeaCY5+GdEeHuY+ZE9jNToZ4yOUwNfysZaXJBrtfqEkQosI3EYRZQA\n", - "COu9BHjZjXsKjEmWe9Jj9yWusbXq4WMANyEJEPNSeDcqy2nLsc2OqSE4CgyCqy8blbRZqycUiZt/\n", - "3NpFflI5dk/7eeQ8Uo727U5FhceNm/3Tv/0N3CZNlPGV4f+3/HHJknpIjibzMw4AkTq3Lkxy1XZ+\n", - "FA9yAR3cZ0/eN1EscyudULe5dTvs1EvlYMWBAAABtgGflWpDfwB5Rz+lHWcxYALocP/IVGxKQ/5l\n", - "P8Y+UVeKYTw8iTn+GjVV8vbhgCZ5cI/70wvHdrfJYaZZyRIawh8+61+/vwo8HAkEyAQL0QVrU8Db\n", - "Z7+ORIRATWUQyS/LIyP8q4/O5rf7OuybqgrrJ5JQm3dvb5EYgnYLHCULt4xtpfvTsT5gEynxu9HL\n", - "Km20sO4q1oqcF4MPx2dj7xETa3veUfVJqfvwop/9NWsmPrdhY/wz7rinYt2HcWm7+ulSBZtWIRv3\n", - "yMRoNM+lyCvZDr0PaN2HfwYWOYr/NgyLM3qvI6TujkJkGWBIPuiFK/SHsSPx7iAMcrZ3CQvQC1rq\n", - "psLEx1Lx0vtWsdQAcjEYe6l7VHqUFbgcjcHAYPQIIgi8NauIxLhxUOQnkJo1mXO/e5w2N9AAHA22\n", - "RlXXsFU92TGe3GmYdLlI4OC3IklyabPhxs95veQzY6n0a2BnyANXxWrQG1vVVVAYgtb88NEdo6By\n", - "gCh1aEE1VpUTP0of4shaZpNk/2gd6T34r4uIClLqdADAAdaA4/epPc357p2Ro8OkrT9okATGaQDM\n", - "AYBiPC2kAQBkyn5ImAAAAdBBm5pJqEFsmUwII//+tSqAF4In0o7iUdIU6DQAMu59v/f4eNbK2my3\n", - "LFfU4bVvmOXvurgANJp+yhdNshfKZWyf1yiq02eNo25TtXkBg+c9UZquU5KtxkSr2wTyRJb5fWbg\n", - "+NL8Fosje7XYkSxYEiB3sVwPhHSvNWh2d4v6fN1lP9qvuUnfb1Bn+TdruqmJdM2vx9efbO5Th2CP\n", - "KiH3jeuRzoCzSIUG7cY38FVzT4nUIJdz+2KjjjJ0E7ZNKQ6lROaPqjFN4utrXaZfqGFX2nWmlL+h\n", - "PxS7plcEcSC1oWpbRWphWgodqD5c2VmFV0yO9NkxWYeDoEeaPVORAB/gqWAbIHdoZVHMBBV6fLyv\n", - "D3u5FppjGB4tzB+WC5jnXJKg0Sk3SkInESay6cwWUVJt/G4Tfg6wbMdEkCvCKlRosg/RTpp5P6wR\n", - "Z2iZfctuN2EQi36vtriULh4PVI/bw9ZXWlyhMpAYPlW3C1NvZrlJMNaSqGSSnh5cJMfrxHquXcAN\n", - "CTgojRhZ3tMe14Ny/HV3UfnpEJgrqxN8KZxlRpYS28Q96uqEu6NBBsBIIz0ei/Mg1x57c0aguL4j\n", - "dVBDXATm12Zi0uXfiRBRiIror0O2CDrlUQAAAPNBn7hFFSw7/wBgSQL3wIE2Tv5B6OJXPcoXMcSb\n", - "cE8qv/1v/uy5HaAJNUQCTSWlcVovOwe/GLZOdN2BNEgb1OlzNEinzyASzg3GuZ9zFeyJHe/zvxXW\n", - "qHgQlhmuH8QdE1M1s5tXy5mwAyoAiCrzupaN60ez6jWL/yRvGdGiPt3qJJLeMG60zAMKa7QhUJFJ\n", - "FMWUFrcLW6iQXx7VTZR7Qo0gz/aCe+BxT2h34J4bdpQTH59SHjOd2X4DMr2kpW5buE3EQBEKSUD8\n", - "yEiNy7MVRtsZHXt1V4Pb6TljTGXtC9pzGwEXtgadiRP8dhtDjxgpVN3IyoEAAAFOAZ/XdEN/AHkx\n", - "u7J3fsEfo6cXtbkNOd4swcOB3voAJyKHu0c0/MGiiYXv+2wca3XUwSOEG+s8df2rHPxj/J/Armyt\n", - "j86AAAWOWZsl8AgjGF9fWv1mQf9jrWNuA4APvfeLBFbZJZm7otp6Fc0DFqB0XCbEvLTkRU5ySc7e\n", - "Y4CD3ziWyxgWkLgxNxAV0V3rzOqUGhFxcTbBCJI75knYyulzgB9+SazwgLVSR2N8nND844Y7GLCN\n", - "0aeRWZgNIAWJkPPhP1VnSRo1jOpV+axgAXL8ExpNwIvLk+O8lekZ0/1o7sI+uJ46XyI2SuA6uJHd\n", - "bwUKNMI2qDKAM6f4kKlJLSQWqzXAi8hAQzI017i25Vpi5npQJ4TsJeyOHRvmO1wY5ZnIEZHyhgB4\n", - "IoLWrdA5opbAou9XxH6m1F6osqepeJLd97Dr7+5BqWzoHoOLhOxNwAAAAQ4Bn9lqQ38Ah1fDGltb\n", - "SoFNBABy4LNe514R+dnaDTYn5E46OmsRrJgYyAm1lSXdflAXI1+CFQXE0A4eKb0poyZSLaaXfRBJ\n", - 
"r/tA3jW8xYt/UxFDszVrqnPHP/Ny6pw3mJ+pwWr+YYAHxNaLyZj85nxRNPFMUkOr96iCB+MslYrg\n", - "cr/vUoZCrrFka9nw08yFJlyN4Ky9KHUYJOXDrBIiz8KQQaHFalCe3rENKk9raHLB9E2PdI37xydW\n", - "9R3Ktqa3KW5rMJCOoArO2/3trkkCh+/FDlbsei4VdbDQ32DjCaAkDFjCyuqOJNsi8nSI2KDSRFCB\n", - "83l81kCObhPemVMTlMBQzSDvOtDFUtuVwHtirD8AAAFqQZvcSahBbJlMFEwR//61KoAWweTusUEY\n", - "AFR7WLigAceU/KgvW9LBBRTRioW652v1Xpv5tYMFhkRmmlUca4/8lM9NJwOZFgbdLq3dhRjr1SQ+\n", - "iitgTnIKVe77qt/yWy3INzcVxffYfGucVy2ypyvLSUZVvVzu37Ufe4d1uKQAC1EE3Wwzkx7sEK4N\n", - "QwJyCdTZZnLiyrlEXcLAMbB36CvMtmCiaP8XPpa1U2RaJxnBB9qYeP0+JCORflaC8m/hyWfMppd0\n", - "XeCFuAYTEakC9vO4HVF02QH4GZZigg7j7bXnvstEtP5QgYZViZcOoAaQGKtWm3PCHoS8mKWfCUk8\n", - "ZLC6z2a10V0U2DavVH2m02W1Lc4/2WzrwUTHr66DOaP+urnPdabeHdXruv1HJ087InGSipJtxGko\n", - "4rppNbdlP4z6g2o/ksCKcSZ76uS1diKM/39wzVYDu1tkCD1lomve9NoQwUToKqCn30PDqMAAAAEr\n", - "AZ/7akN/AIdka2XuDkeawxOj/BZhZtP+kNbRABb4RmWT8vSOMSH2HVKuz5/n3pn38gQM6YQqY5bV\n", - "v8KsLMWKt//3BpX7BUiSjA/GsXEpiGachc2o+KqjjRfujy3SLc+TvzNfgePwT9w0Jj9Y8j6ORxA7\n", - "13x9/iM5Lx1s2OQQyRluiOYKxXDE9QjNulPCcMLJFKpvAfnZmzl0pzzHw/ANcBEDhABHQ9ftCkUs\n", - "Q4pQOQF20mJ1++bXoRcUz/lR79ACwohpzpGuaQCknCVhUL3lnnyQzloB0PAIRq1VnOd+y8D18t8/\n", - "IEva3L9FTrRi90eT/2pNxjMaqrOmFzrhjd2kmSd3YBlll+A3KrjDn/HtXx8SDjztM7Km7BEd2LVO\n", - "U1pVGn0+C8gCov9gxoEAAAIMQZvgSeEKUmUwII///rUqgBet471BV4xl2QAFRvb+6Uilj9hVaCt9\n", - "oXOXB19FM5G4bNDJAOl9w7HrxMOF2dPOUf977Rp9NoBObCR9cN42Ht77Y+l36qfp5SrWPFz3DG9k\n", - "Uks1s5yfRvMME5RxPYk9+qohbe5TR7z2WNWBJjaTvhnu4485WU3BaTyIbA4BRRdj0/JwsbCXRVZy\n", - "OMmFdXnFdxhNGZ5JMCQy+ip435WTv8KevLzG3OUTxX5d8x0gaiQZdaPwNC9GVrgmtqTc0z7He5Hx\n", - "p/UnXiE+WgHU095CwXga4AbeOtQbj0tjxKUoS9sAoJ5fyTlHv9FnU0ujgUuoA3Kj0ma5qF69zgnv\n", - "MTXEIqf8zuYuInk435YB6s5Aa1W77q49/ZLR70JdKU9F42nWnuaGIFvaX8JNp0NTGvA0s1VSOWIl\n", - "YVdpY6hSPbDqLYXO/LE7X1D3sWpexh+/kcA2B6pYDzx14bD7OD1f9pMDWxIrW6BpNH75M54gOMY1\n", - "SxoTsfh6KVoyFK4Yqd6lPKCLY4O17tm0vzqLEva8zNeuM7b2yHKwMHpqK8FV5yaEer9Zd+uSgIqd\n", - "eftECExc0GDPrda1mDLPyRR8iDjZRvRS/EElnceTaWiUEonB934ThxItQqnJINdKSyNdNwx44Jgq\n", - "H9/Zh55FLA3sdVDr+1aesKMfNmYnbwaje7GN0y0AAAENQZ4eRTRMO/8AYEUc98FD5/CYkGD6VZTK\n", - "7qaMD8JeD5Yvz1s+LaCSFWcn3aLtkXWLu76WBTjEp2boTz2lISGgYIiIhTqGBdSAvn4GaApcqQ2+\n", - "sy0LjwIg9aZXDdjP9AWFTV1H8wY3dWCf+Rn8X8p7dsAFRxXZ4015PG0t6STtIq5DOqARSPJ32oCq\n", - "OenP2L2rQhT0bU7kBXZqDOvuedMFko4K8dbR3EOKtstAjt1gHGNubjQIVeNhJsdrdMtXEY7juX3P\n", - "NuPteAILXrR8S3R5mIOtuZ+vWEUdS+Inr7FnZsbQiIv9i7KDzU2m3LJLNdjmArFBBLgFXYHDvQmL\n", - "9VT51Mb8gx1TyNar/CPWDggAAADyAZ49dEN/AInJdfYNr4ilmYSAMFB4GADpypoeWWXE3q20mGL8\n", - "wfGmH6ZgcbtTXJWZn5/uB2IPeQFG/rqNYZ/bmIUcKhccFRuPa9wOgu4Qnm9oi81y+ChWQK1KoKDK\n", - "TWWDeg/SDhV8w/q9dFY0rcekgnjPKbKFgzK+IO7hoMF7vhpMoVCqvwMtBaesBfF4bzxIufyftMba\n", - "VRaJWuZpM22/FtH8FxujQ6EjGNr9PHZg3rsxXbkYHRqZvH6RGypNdfKRL4serPMKtCeuCWEKaj1Z\n", - "h+pr+ULdNvwpLLHfA3OCu3Ql8v/sLDD/O1LVB9ug+l/wHpAAAAGVAZ4/akN/AInJdjcgUcZACEqh\n", - "GvWiTtr19IbQdv8WE1dBOa+lNipi00vM+C9W8F7IDH0aaS+KKFaekfOwUNG520lVemVKNYbjnPl7\n", - "LimE+s4N2NJ5SYT5+XRMb+vTvKCkG/By5wQO/WbZo9HorEm10+Tu4CVIj+2Ky5hDZl+kA6mkBK7E\n", - "3LwAW+4rGYiO9JH1BLFQj0ZOJq0ybrdVynOYOw8TudsCI+I3fiT5nmYCkIO1N7h++s67fASBLfgP\n", - "CYo7yLNwfifRM3ay+JhoRmwX5tGJ8l9w676Zo1wDaqZ0Q5guAYSxSJk2jHShR6LxlZmIVJnq7S00\n", - "iBOM0mxomzMhjpxeX6zqy/aA2SEREi4ulxZsEvlIWhLQ5YFv6LMkVEh9RITRQOsKGEls7Y4eSRWc\n", - "f23FGWOVxL2MZUmPGVh++Xygx19XCiXwoatt/s2T7zGfLkQ2IBiMKXoeDb7yiR4q+0v6UjACWT2H\n", - "kOIRMpG/B4KQPsfMRT0Rk3cAwV9dNnKm4XTlo9P9TmyT71B/Greq+KvhEBDxAAACJkGaJEmoQWiZ\n", - "TAgj//61KoAW5ktFwTkgtAAhBassVgP2a7WSOTniW7GlpUC5YARIimzpboyDKn/53KIxVBS+A0NS\n", - 
"3NuuWMzq53zfHvhoSdYO4dYooBUDN2VkLpVK3v3kQo1FoE02X3cyV2j6ziOTJORgWGzqU5k0XKJO\n", - "1VCPDS1gJclQYem5NlGAENmSiR9I8XvNQLGvpLGF/2+aU31xCZzIPp4tUxyLu/gVqq+6L5DezfDz\n", - "gPP3+vv4JFttE5Nyc7LysmCaQfUhi6zPymHmdLjs3bZdma4hV61UMMsGBNZfYf2GUkV1dVZ9kkfz\n", - "RyUYJPFdwjA5S++T8sc03o81MYXnXYkO9hGiG6RRLRRV2fPSgGhghnaqxRhYVQiuVS0ENIpjxqqc\n", - "KBEaAMs1VoaLKEOrNhZ8yB1VLLV9KSiM7/prkkNKRuNLp0WeTv2eHtXhIdAfhKb+ic7Pb48CqpOl\n", - "FnnbgphlxDaS1dplrA4VxMNzEL/27xNMQzhuRvnSDNb60j/kSJHw5x2JG6G/VwCoVAfFrZll45AB\n", - "Puajv4y9+7flMd/pR8Rg9UAn+cey+vNCcCbbn7FNSWq2hl9cymk4fwW6iqBgiFEQ7YZtyDoNCyYz\n", - "KAnW0gvHCg+5n6+qxC+xDS291Y4JfSW927ZZudU0tXxvupwcKf6fDXxz/bqsOMvxj6Y81+e6Dezh\n", - "B2/8nCpk1Qc7N5s0JoStEQ8+K2ir0vIXayhFQIgAAAEeQZ5CRREsO/8AZTZTJbuKD3PiQhYpzA/Q\n", - "3Iqsld8XUz3sHppFsAHZevvXPBLN2cIUd+YCbEEH6MplVFEcbuDDV0dnlBcrCNrbp3+CAOdBsr6h\n", - "0YfLGDPxHlFlUCi4qTS1o0TT2Jzkq8/O+TU7SSImG1EjEmOGpKvxjn7KxERq2Pbd/0y1sNHk5hiQ\n", - "eJwHwc7Z19aIrWes4h3UYQqHeU6kfCpUHVgnGubU2A0Xjg0UrouNSumFogz0StLk4fuhL5slF3Bb\n", - "3NpP7YhgiVLV0FNM21/pfbXvRQFzmliOaZuScgePqa02nvOdEHEpGVRPLCGL/tvzSkZqhXResmQg\n", - "1qZ/TxlvqjWYqPRThBIk2nP66jbd6NLagdWz1BtbrwB3TQAAAVkBnmF0Q38Ajz7dDL7wKLyRAA5r\n", - "u/5Co2KbB/AnQg3XvWeaImUuto8KuobiZ5Rpi0jf/+r5lFprj/mYxpQ5OwqjQqFG0eXwqi1D6M23\n", - "HLH/3LvgYXkbAAGr9uWkQaEU+TeJ38WNXodDC29t8Y0uYEpwNzyC6FqtgkCyDYDpd/nESpdVRRJh\n", - "15SV0TP88AKwZsT7yWH2r5gpJv8AhXnnWmKJ/WMwiS/2+Kf3ikj614P+BDohXhMYGO4GSZ19EkRI\n", - "RjwO1zoy3Umd4iOMuBBPzevAs74sU7IUdkUF24rNAstoyqnAUgY510L3SgPXbZmJYMv+tRpT7ZuM\n", - "oLxE5ACIQ+eHStmGZgh2P1nvrIaZRiBxoWZ1B+DDOtu5OZpc7LbajGP/oy8HbEFyJIcGXHGB5VXY\n", - "HnskMmabuu5xyFIJcVaqbGg3TlqrbBE29OX6xO7K38oavU/okVlIM+AAAAGEAZ5jakN/AIdXv9ZL\n", - "/wCpeCQF0zyG8897iu+TVNq8xXl3pE8eXm424VBKoADmOQ/RgBgC6Y0IzpqUKPVKwCZafdEIuhUv\n", - "zhgtxewRpr3F4VdMy9NUqqvPfGroLPxDW64Af18RtCEv8t7amX9ezvEWK8AgZjHjHXeVi2k8dp4r\n", - "TuMjdngEOGe6y0V0qXE0vJudyGSblaiStnW6rV0e34JxbdN3Qbajy6ozlLfOkq7Wqx1iLXxa4foY\n", - "IPBIjzxdye8gOjZW7bP0axd+wppVHkXrrvuxUf9dp18AanJIIFv6MCm6ujRO2wyu4ZfSbZp/KVFm\n", - "xvxpBAJyjKSdCoPxWylEDyms9NAmwAADmUiy6WUOIsiAC130X9MRKfeLHi3miJh/YDGeINuX+P+e\n", - "NWBXxp3RqAzo1eISPcPztmgXUHCSN2VRpnCOFQoF4yyryK4v7s2U4a7V5e2sVJBhb7kguiVFACK3\n", - "rbLSCnWI4OCs6u017nghnGW3Juq0rF80iqmo5QCt19S62wAAAkZBmmhJqEFsmUwII//+tSqAFu/w\n", - "HjJpMYeKfGxaFh4NwH9VzFzipiNnWLhZf3lim8qQP0NcWviT9hCfSjxxrnYEE59yPQn7u6+tCr/u\n", - "vn8/iyWB73TxWIDTyqwOWzo0R8Wj7McP4QWP8yE0svd//Wkug5+3cHmcpP/ONbeBn+TAQ0VzErlc\n", - "2hXFLnmGW7EB004qvGi/S7JfG21T+V5Sx9Nre0PuomioWltV0uJSYiMg18UwZktQhoyeO+qpPgky\n", - "U9/xX6NUrUyAfCz03v4wSV58lpzV7BxftApX8ZGWBx2zWQV/YeOCEWbmbHqvN18Jd5FxK1iHRqe+\n", - "nBGg6SyBQEQQfCMxCo37AXM212ulRN9X2fE3P9HkhvkaOxQZ5AElyFJ4BlaM9J8bcUgOX6NS6Cqb\n", - "n7IHMcCIPjAIJ36atWVr0EheDYyrwatT/sRxqfSoF0RgoVqtGqstMXZF7XACu2N9LDV5Ss0B+mSl\n", - "kJJqGxc50wazbtpofP341QOLrRCoQigLO2IFkJyqTpln4FgoWIMbx8x6cKkFmIESXv7mZEx6LOrL\n", - "ggZa/EdzllkBPCO/+zBjmey1Y55MrbMpoidNDpdQ6yZ4UDU0ai3HtghNjtrUaVDC+dCrSCASLB02\n", - "bO819PX27qwUTWW1MCrVhUzQkUkht4Xa4bdnUW7zTudPa++EPxUMVY36vPDJoCGilCgIXzTOV6S9\n", - "OVTh4+OA6S/XkcoA6ZjbQLERX5kZSQMoFJs4bPot93titzpDSKAhc1QMx6eKK6Ol2IEAAAEkQZ6G\n", - "RRUsO/8AZUEFdKFRxHYcrgnLV1IJewAc5dAL6/Pr5YWcZb4ejev9b/lpY1ea5Xk1AlTe44c3rPkF\n", - "DXI6yAdEC7kxPh5StAse03AARSF2nro+Dr5bfPJyYF/ERJ9NScPmUIVihvTCsyh5qmuoAH9P7eCu\n", - "Y8rdH1hF/pTSa+Z1tzZc8gwGtgV/YsMtlWLs3VbLWxt2KTDW5Y2b0HA6zgNn25rXu72r6iiN5aw7\n", - "sjFipq/8rjgHE9K0EK2Opn+0SPK2Rbo28aoNdC9V8VxW1CpMNxKjFOs8YmQmJE6Qtkw+Uo5mh3ic\n", - "7Ng6Xje5wAF7a8Iyr8DMIwvMZnnVp6ilQ1B/LSGEPncviRIHH8w83Grtt0CsL1L2isuyMboY11N9\n", - 
"lxQPpwAAAUABnqV0Q38Aiz6zZgMl5b2XXQAXQ9yHCqNv7FVD9CxHdTnw5pqRTLAoFiba5ss3lqXG\n", - "QCf4/o32jzmzNKjZDN2ghdo3OS7n/NFKTMs4yX0NTqaEhdnVRvrbcGvcKo0NYMgzE8UNwneueU22\n", - "1vpuKbOkae4P82iS9XSi8TlOPcF8mmD+n9qfVTXzL4r0M/s5xxZempvnxqhz38EgmSM/Zw7kEyiv\n", - "giyuP/YjNhFl3FVcOSLiQTCj+F0nLUE7lia+UkuO/YNBXwUKZKD8Add8BG6ZTC4bD/RSktc7uv8w\n", - "NB82AXgnpuELTB2xZFOLAYJncjo03/3uAK678Cl8cw8fzlbnSpp5eUkHacCUtAY9LPrz/OMf2bA9\n", - "vBE2eUwrxz/W0Sg0tjzkUrpnJSF+xYsA2fgRolT6A0NA++mVN8PJVhaGzQAAAX4BnqdqQ38Aj1eg\n", - "HO2BrhbSJp3bjAA7Lyx/X3Tt1hQ2T/wP93u+Km2fQtCsS47kHT/v6cxSu0EEWzwOVr17m7uMIt8s\n", - "rOS2NL0s+wNbNsQiUhFGWcubxLdtukca9QFTdaQjRXuW15l7gz2QnuVPe/r9SLMinrQ8TAT7c4JB\n", - "GrUpwbYY2wvPKUw4NOIKdjGz2TGxM02Yhqm+YQD7nu+MPeXg/5dBf+XeKfPK+RchTbfnRfx28pUm\n", - "+MUq+ynmpWVmmfO3TbD8gZCbZRUeK4LOH5lP3nvVvkbZlQVhN5vPlxxNouZsDfsmprxmWrHzH3vb\n", - "E+c7VsDA88L9wCH+ZmQGzxFjyOQ8cz4P9rsZSuU8vQS1h6fmk4XXUosrmweEGKJT/Sv5qb0OG8e9\n", - "voRxFaPrroiqkALWSnA5n4zcQMwfY/xXX1aR5rslt9ItB406qJIsbsrkl8pXUe2CwOVm9B72bhd1\n", - "lqsCRNktqyPMF/Ek4JsxscPvDjbSqbQZL+uT8zjgAAAB5EGarEmoQWyZTAgj//61KoAZQB+OVG5p\n", - "SZHABUb2//v8PGtlbWZ+A0oGGFPTAdgmU2TFbsuJ6mwUCouNe8f1I2ythN04JSJ5lx+ik6KpnC91\n", - "1FD3eD5Jit+kJIg5holbnldcijL50GRMV+Tt0L65TPBxqSAUdrQu+eLUTHPpJCL4CV5RJau8pEIv\n", - "uK3a7QA/UMQ/nrDjeZ6jqf1BF3JjbyaeIc5drvnYbR6lQ0gBIzp/QRU9xrHm8FESnIe42aooWDJ9\n", - "bVMccs59QBQd45WisW0MXV7NFtyepgfK7biPJN57MDsWL2A4LYHAXH6f6In3GVsSrYQ2HUKGlxpv\n", - "Yf/Xvk0pBnHsuIEsslXTjxwTTzuRb2YT7QCJp6yHiUVL67n8RfvHMNoHfUzP4rVgPSXcPL8FOP2d\n", - "F8GxovHNOmsOSUyc+t9OZXQFF+4FJNSN23FsgARohBEJ3c1u0ax3ACLYlwfCd3/U1mT29ftZkWMR\n", - "uj01t9v2AGHvgKM29X2Vs/ALzLNDd2OM9z+AC4TlcpgcRujIhnjHf17Je/8RMBqJCZtdfrFmz6AW\n", - "Z/aNIv/p/WX6adpvStFWxoDAnf+Tai9COS20TO4GHDviQkpMo6tbNTk4tiYWsmvBNq5u/aO08r2y\n", - "Bs1eH2kAAAD6QZ7KRRUsO/8AZUj9pUTz7rNMoHjJ4gSsLw2wABNFEVCVBZ8at73oa3C8UmeDMVba\n", - "M3uHP8p2EFDXTkl9EiChbxZZgpuvefKfc50lYhoTJ/7H62X0Z9NX2I7S32WT1XJeJtD32zfVBu3K\n", - "VmE+30x6+W2pKnyMM0ZejDKLq8WyIyi+9rC0QVVyU0N739nDCyt6aqRfMfSdljqTnwOmgDB5pHyK\n", - "U8Nf/BZxnIET5uBVX/VcS4bjmT9sCYYwmAz5vBy8cv5J53FYPh0/wF7kP2myhm8SfTnmNtpTej0y\n", - "JjLbrdGSBUAu+lwbCsr/YdOCYrxvvrklZP4j4s5VlQAAAgYBnul0Q38Aiz6zZf6skuDOogA4jl3V\n", - "YKO0NncAuqtob34dJ/eVmQtCFk2jxP+6gBUwoAJ5d6wKEpypNd+AlIf83kNIAAC8trXyGAv3zzzV\n", - "tAa7kzCHOXS39Rxic+qZEHcHH0Hx0iIZnH1UNeoS6dQYQqolDkQpOXG8nP6tDCpAEYSQsJzo5kch\n", - "Xf9jICMUCBjMQXeVS1i3FdA07mrKCBowVzEdee9WvqvXV7KuMTufiL0hA8BHvtD6VFvEZ6eiqgvN\n", - "8RNM5cYXQ2i+4Lx4R2QlAIN1NNxqM8GvSjSh/rgipqY8DwHJh8p9Jbu0Zs+w86pgxJN8m/cvWxRZ\n", - "yFAtI7sBhDbJnNXx83ll0o93YVJhxi0TxWXPf6PlHZeEyvr6QOF2VVafQjsZUg34P/p6tj3lkAer\n", - "aZouLIrbfbTrpoGdtXuXR2qC418s780GZsUBVTlvppC7dgGYqQzB5daoV61BoiIg6tQyG20Yk/Ib\n", - "TtwSJmeU5Eiu/zRo0bpbU2jgV79WVCB/SVzxsmoD1jJEhzN1FHxsbajOijl9Vp76GofsezNr+37n\n", - "UWWhPPzCk1rCLQgaI34ekcMUWq/vBK2WDe7wKACe/5M5UglN5Ct9Orsd3SfYPc0336usW56marFA\n", - "xW2XgVLc1GludnoFyQrT+oASHSl68jJc1j3I4WTIeU/p+eW8RtUF4AAAAR4BnutqQ38Ai1egJmdK\n", - "YqnGBlYUAF9obzNVJ+s4Wyt0Rq0YuZmzKSClvCu/741bUzMW9+2RqBxHf8xROd9WCD2DFO6m3iiG\n", - "ZOgLMC6WQsGlrWDKBATBQkW8M70y/ztO1ZzNQj1ow5FREW75+T8qWeYnaEkP0sDPfhS/8A++EHpT\n", - "ONUZpoNHugOpCj8EFvE/MnQhkWbqDB+V4zYJeD+V1h9PGTTPeM5Ykyq4ZMi+8E5Gka9dd2CFXMaQ\n", - "M99mRo+FOH0+y87A4U4JusoMgrnGwBHn7tNdR1Jgk+wKYqmIwBj2jGPnQFJXhHhE3ZkpIjaeakM2\n", - "8MH5c8xC359KRjK1nfiZHGSkxS98YPps7lGGiAJ2WdM/l0XaVpItX1VPHy/wAAACGUGa8EmoQWyZ\n", - "TAgj//61KoAWNzc2A41R+LAApun++OIZUz7EikV/szjfxvYPLx+f9K2/F/he8DHawkBMdV2wRLxA\n", - "t50GIuRUSWE/39Xo4nAQqkjDTJdufKMgNIx0erMAcY2QA5ejjVo1tlzncJOxCqGpuGwA+5/4IKyu\n", - 
"bmTzdPecTw0ZdpVPq5j/sb/uUTmyS5oriK2QJUn4uMhurpWU0pM90BFHxmx/55iJQnC/E4AiRjGv\n", - "TSfvy9eol7L6q3/AmWDGKQmta5h6TQecJSS7keMMTmFMkcgh+dQEUTFbphGIZpTz6vxfkWPPyqpQ\n", - "VmS0gectGBeLssajkGiu1ivhXeMUvGnpqjpc6XSD8FJ8sVdfwdsse9JozsVq/t5YFq5+AnEYcopl\n", - "mlIiLVwif6/glDa/FvPVZyUrYuYY9L3TA7eEHe1IcHWSOPxpnafEFBrVGoeZPrbfymiVcHOQ/3CX\n", - "aGrpVwdWrmOHr8jLuajUxWOW37ajHobcyT1hYWMxRTx80fZmsfvsrNw/Nztdx7LidHGE8jPZ4gQZ\n", - "DABlByR/bof6mTmjqkfbsR1PCXy4RDNnn9nCnaSnb8pCApsF6YsDTv0+UmVzx2ZPSdm2LhZIqOim\n", - "mhiXHWt+ZE1dnYkLwTdsgNYEeAUTjY5XG25CAykSMfKGwGWeeOwqKmLAqTmb7mCXXxxpy4+bbELo\n", - "RAxOLFOR7z+Rlt4VIVMH4QAAASRBnw5FFSw7/wBiyP2mEJvZyVx6ACpM7CM8ZBKHKR5j7ndOem+L\n", - "X5lQTliSlHrc19blDxI+BarmPxVVRFr/CorqLGvI+vHNUfF9L5rOth1seL+LchCRD6bYXJMlctoQ\n", - "KBnrSfN8OsFA3rCX0rxhgXIKgdEDuCNRYd4XCiw0AyO8VPwgQ3UKQOwN4T9AdwOVZht3xWSjlGSY\n", - "LTfR+DOcni9vpFUI/V99yTFNeriW/Ezi0Mmb4Xp+UrrTAn+/oqePQryHATZ97i1I4TzdZJ6ol421\n", - "ZZiGDIa6I2z+mz36WJISXYfn5PcaqZon5evy7wkHdXdLSXQuyy6RoW3UMK1kv4eYGMx6MEUBV881\n", - "1DxJ4Az2tfQhJ60iq3lK6xGARpoGTWiGA3pBAAABAwGfLXRDfwCHPtdry+v+2nyY2Sk+gF5YW5HN\n", - "XoAL6QRR4alJgXnPRJGLu1H/XzBsCOVwj2OHZ7/Befz18ioG7PdTUWTo/DFmzXwFwKSHq5MESJ/K\n", - "+czoaBaMU0SilMUvvgF9NaNkzEcYOJjCpUUkl+lvc9iWY7aNcNT0YkO2YuPLl1ZJa6XpXyzgvJfC\n", - "YABMMMlHP4hWdgac8C4JyYJle4OEiXwhanMhhDIkpZpmZqqPP6iXGzuSTb+0ZDMJHqoDGqJmkb8S\n", - "IJuvyZGNE4panvJTPVd9f7g4/aXxMPm3Cn3wfT3mTthI056NzanOEWKjM1qGy4olpTOi0cV3zUKu\n", - "VGl1k7sAAAHXAZ8vakN/AInJcXImIY9AsY+/nZAB2XUf7nMR8KlDfCSlxubwbY5yyAvaK6FdhjtI\n", - "iTEMX/gD5nqi6yBjPV+WgerMVdQiwmsTWCh4ZDRMTEvRNiTK06p6H4BM93iWfwAaKh8Gz9Gaukwy\n", - "InHLEZ0yD1XqM2twrrM9K/zMIWUOeN0Z6Qpdges4mCaPjYBUMA0KTxEuHmES85gUYlt0s0Ks9Nu+\n", - "2hfyb2t0rmyvRs70WgBBgYrdeTZMCwmoCbRHPK4oxsSlCang/p1gu/DmbjnwYRln/v7ufz7R3gdP\n", - "Fr7XrHKEZc+f98DBxQMF82PBbmDGtLAQXHwptz6g5mqHfaJhvvgj78jkqTGrQ4WXMBaKzHGNvGYe\n", - "XIR0bHtcMMQd0uz0UHs+NS8bhlZ93PGBn0DI4S7X4qFOiND2PCIg5ogjbfFqU4Kuh5oLH4L3vi2E\n", - "bzWP7DaofhwjMqjCqAvZAgznNJDsvnJzQxJ6Pqjj2ny04t1drdQRUisSLN+PcLenLQZbe401Xg2H\n", - "yhW845ouHrITGSqb9EOEeoN97gj42PjsdYRMVLRDVvCV2BOAqdLbEmICPHZnyy75qPsejK7duPuc\n", - "fJ9rEnjynB/HxYz7zf/RM6xyYbzIoc3AAAACEkGbNEmoQWyZTAgj//61KoAbj1lLPyvb6PAZgAh9\n", - "7f/9/gX2SHKs8Uq31kdycpXc3bf6XPCYn1E4Nyshm7SbxYTXwR3t77AgzFtBuE6fBgZeY48yXmAW\n", - "rqOr3iMlgArjVOjemrjz47grY/T9rKmhvhaqPi8pvZTzkzZCl+tV6nzXVbBFw15yZW9xk2z611V7\n", - "GITjv5GH4Oi/06B5IbjEMVKEcRpvt893HwIyUBXniM9I90uh0TBxOedvsxxE2iLZsr/m/GNXryb+\n", - "9as6btju6GU5FfXHAHKy97PxI2Rac5Rx/FoPiuKEecRx7EQrDfRmlggPPP63oMY4jkBeTzC7Drwp\n", - "8ik2Z4rhoAMWlcRPfXCI56oe4Jt09oRInuaD3ww9/jGDjhHIXGbNYM/s5UG1XuYLCqaLxESIyPG/\n", - "eNnETthXX/QZDvDCFX3YINANkqDvHlUQ+vcUvksaWF/g1aVcMu45c8BoP1coWBAVWVE6iyDMwfYl\n", - "RYTcnNfp26mpOfqiSJnYH+AFj0qGJttgeZBuJCzdV4F5EDreo0WWAiq/0jdXljJ+ZxDij/UazQOM\n", - "0ct15Q7rTOqLKy+lpOVa/koSWj06e8eyy0wY1FBSVaROGYbDgXze1QzYiVyP6+WTk1fjz+Do+J+/\n", - "TxVlHJsfUOz0tbPJ3R4cSjRVigTxPg9VAYynpzzMlIr0/pCOGd4XYyl3SGTwAAABOUGfUkUVLDv/\n", - "AGU2ltMhgssRVFnYDYHdfwUIOpARUIP1pWfDHpU2pf97OTOpyP7SrW+j72yMHgCy10/KQJvVenOE\n", - "eMrSHUfyq6lVIsdEDgl0M+/NXx5VMpg+IZB+I7xozsY2f0ARjiAjA8ZSqG32YEqaGwpGp+vfKL3P\n", - "hav1CfnyaUmopPCa0Y5ww/PZN4YINPOwE+Gg36kaKP/ME/B0d8v00CzvLXmI8pIa3TqrGIa7PF4X\n", - "8miGO6oXkRH45ag0gFdgkGj+BD1PvtIptIkuqTa5jzG/NewDN9cCfws/hjc474K6NoCTyr++7Tth\n", - "LSIM60DcVje0csuhEMwOmCNob99l/AJp/9hMVsVsEaxUNsWBZFMKnZoLJU/ljkNlTtF1zcUwJoZD\n", - "oLTT6FmWVzlFnyfjiJdVIqMAAYsAAAIPAZ9xdEN/AI8+s1VkrBucudR5tN1L4cUDsugAOgW+6weD\n", - "VD4WeLhja/JOA5FtORnuW7CfHWfWrXcPJlwit0rQdaNL8wYmpMOBxVMKErdopYTnWfb0EZST9ZFP\n", - 
"kGeAI5wBNyE7pmk7U/hz6/Uncd5yONsvInzdtLdlFGIUuwPsZsiC4nxcPKJ4ER73zqMcPC62dMwB\n", - "YeP2JTSzcWxmsY8AuUeSUMff3wugzCWo2dZWIqj8MEevc9dnI6e4RX4rfqOmeKfJ7QFxuPllAOzz\n", - "FkyERujhdmr2mdRExctZgI01tg+iF/NwBCqP+hQ0BZaq12BgDPwBcWyuj8PXGo/75aroqbic3atK\n", - "78lcQoP6TccBH3q4TpJbdFKZCXZFrS7Hh71ZQxzuADlZ8DDRzGHyvFJs8+7LX0Z3SVEeli/7hzNR\n", - "3en2BovQV52x/rwTox00ojUHS89/I6QK5rr9xZ5z1Evdog7ewBETCofR8FQPxE+2X576ofb9SYpa\n", - "RU+FFWJ4WPQBj/u1ljXdmoINHOgs90YcpGG37DHSgRaxKh3h9samVWdsr/7ZPH7Krx9nfE8zJoXc\n", - "5Frf0sUOO22BhUTf6MatKarbA54SuNAmIi3ejRZKQJ4XCjhpsLBrmw33yy9Nk6OT0LCi0ELysL29\n", - "OvbOK/J+/iRz4bP6v+/3ppYXG9MzSEeggmS96wm6yOsevJy9wrAAAAHWAZ9zakN/AIdXwVSZADwX\n", - "ZeAC6HD/yFRsSkP+ZT/GPlFXimE8PIk5/ho1VfL2NNL2pqViOd6YYnwc7ksNMs5IkNYQ+fdC2XMm\n", - "GpZcBQdS+anJcAkZpOHFxqdIo1pLhI3h3bcsWXXBd+BTXZhbA2JSmhm8EWBGqSBNaO0U3Qcdcea5\n", - "428f3xthr08dSK0oFN+HNErgBuKfL3JZNShDHaW66u0MaG1B/cF2Go8z1F6LGKUAmsy0D/C2CM25\n", - "q38c827dgYTnZjZnTFxlPuxm+JuWvYpOeWyy3J/wjV/USVL+4BKz61/Ccy+EH/JkQUqRmUOtvYei\n", - "XxTdexyug9nI6kyTGc2H3hy0C3uFxKKFKo9PfiwDCQWhQ1+vZIsII4FYexn+pQbkz5kmdlWKB5Lx\n", - "ONpNVggWvIuTYEFI34NTLTOf285YYkebB68ywIJ5f1uX/OXMZ5RxH3gjNZ8mKLNX9suvs06qOt/Q\n", - "e2ZfZ7Orgt/l3O7GLxwWvzugIsO88I1KhpZhgYDdYZ//1lVBcwG/tKVYjF1obqjtyFctY9LPGIag\n", - "318ehZmIvkhW9djj90e+pnWknudbQDv3Os17s3l7qFADdqSGqYyGaSU47a6O12HCRSwmepV1bewA\n", - "AAIrQZt4SahBbJlMCCH//qpVAC8LE+AX+ndLRI9AAL65x3/f4eNbK2tvWi3seP5qm31GHdf4edmk\n", - "0/ZKv9BuxjUGH/qoYxXDUlaWZFHb65x0lomfbckqRBtklU+1LGTmYtvnPAbKnUSAh/jTBATZpFND\n", - "l6V6ofQ5PTBcFjOWwgI6YqalXUkmqnN6g77O4xvodhM7XQWhsA44ADmvatn61wvReF9d9MqoCN9N\n", - "Twpkx2kbbrSoHJrSyqidCsv+e2gnLoWDEdLGn/42++dseweQBj40iKRQ7paDrpDRwTZVjGQJ+52c\n", - "gaUSUp5A/cAn4FgESmp/sZ0NpfD9/7ZAmCbSUfPUar6ndxZ3XG2DXWcNFu473rzFQZNpJnXg/Pfh\n", - "QCQDuu/iX2Vi2NjGs1QVI3BReUxvD8Z/YeLy6w0jDh9dcJGJdKoNjb9Epdy5r0lFeFb9L8AWhdEd\n", - "sGreMPdTiMRlq+JOqjdogseyQTcuDo5iesxIsb0dhY+P9VqSJtTxyPO42dn6TXPZDgt1vROlp+Ic\n", - "VTutbib7FY5U+jSckVQsLzLRwDuIoa+HpEcHjzuwHMaHrKVljgiPeRI3Afdpqx3nHgy0MFCOhGEr\n", - "Jkw+Dadh5qrWjCGOX2K5HPLV0E5qw7krTDhpWX8sTsYsIqvxr/V2EjIFiKwnheBvunmhlbHNUKTl\n", - "ykWRC9Afa8QE+vO8sLJHYNqVh5kOrsn0+NP1Mm4JPbYiahSDJa4o8TJzkXFBAAABAkGflkUVLDv/\n", - "AGBJAvfAgTZO/kHo4lc9yaSVZkgaxkXEQAgySaAqoJy8U1XmJXFaLzsHv4KqZnckX0gP1AYFUr5X\n", - "3Zof5zltHp7OQG87KhkyMuJLOz4diYjf3ctsH2KA3/S29L1hP4qjZ9kfgNEsjrH/nSlX3ikiiFcQ\n", - "/2mu5vwlzQMTIUj5/0pAslvbULpI2rwxcgfjtpeW3qe/Q0sCZXyJ3L7VhEaeyKZo/ALUAi114xdn\n", - "Gao6fyKpZhWohGCsI53i8XO3Y7Dq+aD4ONx4A265BL770fTZiNNw+oM7dwTK1vcPMdOTVjz4fi6j\n", - "bCMBPzMCGM7CsAz7OQTIKiUTlOi8YAAAAakBn7V0Q38AeTG7snd+wR+ioRwfka+slSBm7w4HiigA\n", - "mYoe7RzT8waKJhe/5/xyHdk2lI4Qb6yur2vWdYx/k/gVzZWx+dAAALHLM2W5kE06MD+/WY8W9vMg\n", - "jgsWx+NCob+sUo3r0m3kC7Z6vE5pa/kp8NVK1XizBU/gSaY6/S/NP+nzZeAUHhvnb6LPnQnTmhI7\n", - "+CLAa1UiK6P+lwPbKP0S0Q5RWiopmhls/AKTmwxXB+WRWyrrFglLMCCi/H7yBlZCPn3f1nUi1WXW\n", - "txmtCNftDVTPLfu3fbw+YSszpG0LQoe/d+Hn14JtNEXcVveVKgdRtrJ2SZSzkDZoD5uTokEopKbG\n", - "geSmsxJSe6mDenK/tstnSjFiozTKWgyJb1mTK9iBWStV+uPeceDypkgatRgkwgz17Zgn457UL8xo\n", - "RIb3Rzvhn1PaM6KKHv4wQMqvpqRXKRm+SScKgBhgUzc706tHx+sk3QXrFbfmTj3VwEqpASdMV8SQ\n", - "Rc7Pl7VdiwexHM38nPcgZguGyvH4NF1CZay1mT9d+wee9MfU3VHZJgMp057sUGFJIJZNmQAAASYB\n", - "n7dqQ38Ah1fDGltbSoFNBABy4LNfpqaOuQiA03rsvInHR01iNZMDGQE2sq9jRvjWYcCsjv8TgHDx\n", - "TelM9UgK8aIkbW5xZBO7YH31DMzHB/HcoCKmBUni45/7i/CIo8gF1pGPr0DAA7wV6D09MIgWLTIz\n", - "u2RlgzWHXLOhQSqpesq6gEgghz4eO+szzJWiaji2cgnbFYV7gS1iXMpBIisJc8i3U9gywhFgtGxt\n", - "IPW/7TiYEwGOLwxyjZX1HkROuSI8lAAdZBpungwbYVpPKSngzu3PnOIcBqes7c29MHD8jRPn7Zrt\n", - 
"720E/jZ4jB2yT62h5AEs+TCYeJmiY6lwGwXm58hIVqeMFafCwAYhd3vDCtfE6mymrvYwtLYQ0YeE\n", - "Ebj2MbA5+zEAAAFwQZu6SahBbJlMFEwR//61KoAWx89GABUe1i4OfaowcQHQyqHCv9PnwkHOB5jh\n", - "ZaY1nqaJvfgMHLxnx0HRU319XsFiIgZ3fycxZ7MoTbod+V6rFy2y2Qtld8RvCt0Ug4PVQuLFLU9x\n", - "N6gbeWntqj92UVkXYHO8rtnoyHbc5vkyDRwK85+1rEknOmV2fCPAJQWJQHZKzqn/akJ6R91HlWya\n", - "u/8GgP8q7KTtX0XyZMALsB3jT/UhmW5AlGIwNHeW1rtDiMG/Xy+69i+m2kTOjww4y5o0/8WfwLLR\n", - "RKlhEE1LYjJQjoy3+hNy7YguxzdtR0GOg0UsPQLFZIBnnCwGmFharg9MSkzKoZck80tBnNzVcu5F\n", - "Ot8W+bdDLv2E/9UTXci1RXlM26z5jearPa/9d/CciU6kElsImbzJ5J2YpzVs+pvW89XbvAJMExZq\n", - "wXD26iUkefzti1p2cc2CbM5qN5CGCTCmR13du1Y9J/JQwXkxhEAAAAFiAZ/ZakN/AHwUpp6Dymc0\n", - "2L536BR5shJlFypABdlGcrzfdaw/6f5GB/atQKmEnLjISTsAvG6zfbdBMs7bm2yeFrIQxXuK81kC\n", - "9pAAAXcBlvswH72knWeKBsU0Ht1g5h3YcKtQv4e82ah693wXobc+mdHgPA3TBKIFWUv/iM+/E90G\n", - "S/NmTeZC+lgt/zT/+HMt/QSFK9C1+AMdH9l6Wmy5eJzA8pumBNuqAArwclv8LW1AC9Ryj7J7dIqZ\n", - "2nhKIYQ08cavMFAGExrDHt7RiTs4Auer+jpijDT1MWhCFcQjNZn9nbOp1MdYUZ3batlHR94YKH39\n", - "SB9iaEe1H+vDrSDRsP3b0PfVLevCUtQQ7tTMju5YxLigI0SkXHby6oMGwH35DOmYdZ/QEHihEbbH\n", - "ljlaWypqm6TR7b/zNBCPoaZiHS0IlbTr/gzMbXxGasP7GssB89XtUV2jZihKJYcij8456L2VAAAC\n", - "WkGb3knhClJlMCCH//6qVQAvW48vGhnpxPcAFRvWsRQfCH0ZQNKlkI/Fmy/VFBZqjdqwlFWyRDRU\n", - "ATa/x8nSCThm/LYIboN0iejGj3Uchm8nyLv3P3+HOOnCw7+XGsyycSpaT/SKI8hu4RwjrdDxqaYn\n", - "k6pZ6qjZtX+IZ04XS8X44piBkZKHHklQnddyez3eJG0JjT0fN5b/c72jAD+sOeXlR6iPKkSUzu0o\n", - "3ha2oHN6UEDmISbP1cbB3piI/SHrisHlFNjIuHiEdkqSzG95tlcEE5RmJMFHyIZtmV+VUnHUg//H\n", - "WOVjyT0+oFlaS4c8th8dtoQJgchjo9u+OPpSDxEJgWI6zeeh28ogNTGzlwRqjfRSsrTItvjA1MD/\n", - "oBFhKLk5Gm5LLSkMpDHu9T5I2IaoH3PKDFRJp5FswrHAqK+C6EMiKJRw3UfQ++e71IzTL0xpDNJL\n", - "z6AeitOHT7WHH1q0lcaxtRKIXyzlri2FOeAU+zEh7DbcM3wvbzCPYrbD4ePmP1flYALif0DM+F20\n", - "woqO1ciEp6KvfcdLwkVhOi6HukmunTXGsruYaqjkaLT2QlUIMJVPTAaXGvEAsJSG/0vfsDXKkk6Z\n", - "sB3ElNrSO3yHej1aIEgW5xnCNisEQsWn6TKnOYGilPN4ZN8EB64V0F8PWNB9Aq0baX+T8kKesmFw\n", - "2y/668NRP8ypn4s+0TEew3V5nLH+An+XxWolypflMoVnWhEhG2W+IIgxfWfPuSgDmqBKtSemnfnO\n", - "mj2z1HJ4yEmqNoBjJwYnWfK8e0PHHb381Mk1zGGJOgWAAAABUEGf/EU0TDv/AFlVerlP4Rak+BQA\n", - "rfH1MAekqKZtO9rI3YpPu0XbIusXd4D2mikBBjNWCs5ZCx1/nIkAW78LpHSyCScRX686DgqeELvg\n", - "+6gjEvz9oPv/Q5SyPMBeMNrb/QJ3ato+Qw19nLJWjl0bduh+HilMsrklIYKHCWBaC/dNC4s7Xl/r\n", - "RCzM7ZJuRKmUY/D5sEAdr/H6TIVmiD0u2jiehC8y8Gw6flB5fdlWyz5ArpMes88RS9cHH1n4Dp5A\n", - "9YiKoxa6XsjMVtwy/Q1CE1CcjEE8nX1x2wi3FF+AiuFwqQsSRlHtfUsVksDBdXLvE8zjbyOIuIMV\n", - "pnJU22cEHHqRAVAAAQz/a8I3JUwtCYefKDlHQuITIdlhxtkj1S9/MOKY0At1R1tnioLMWN7HUVCo\n", - "b6XS9uoGwS6oOJgKcTFbR1vNa4wchWq0XCPds0DBwQAAAPYBnht0Q38AeTSjvudgsbkOLNHOwJSE\n", - "7MIAOT4Tae/DlzyAOhFcKHSt+XmND2K3krM1WAe1ksxoXOx8R5ib25iI4yoXHAvjcPvcDoLvQIYy\n", - "rfzkEj8FCsgVqTty2M7mcrrsvBMmGI/tSEAq1Wpq/wSUg2I4oZj0GjiChzewD+uw3YnWAi/Ntf5Y\n", - "Cv2dU9qEo9e3jPCavhxnj6HVQyqcvxekJ6cEcAGQvRh8PwiQyys4LYMz+Th6jmnZO6zDQlY1h459\n", - "aXiX/1NPDVjhvbOibPxdXy1nW8ZFN/ZpmMtUtTAz4mvuGfLCJYTZv8r0n1cztBPRieehovEAAAGy\n", - "AZ4dakN/AHwTrqiSAEDVZr7cfUIfCi6SEtf6z4BBmn/qEvCbGFYoG0hJzipIIEfgPxGLOPb5hgYo\n", - "3EqlxYfhyi3ADlPB0rSvUe/2K1c1bOHHkBdbN7v2fRCe6cTgBUViIyBzKbW8+YVzs1NjLsftvDLF\n", - "Jws+AVbFUOsz2XZO6+tJqS4okplORVfI8Zh8pjE7ly6+HI7Omo301kEp6VZks8VHiVKJOuTRsuFe\n", - "1lak9cDIgZS7IV3MkEjdmu8V6wPVTOui5KhgRegdKpe7dvKwiZROacSHUyEpgoiQ49NAkgd9ICSC\n", - "nOG96XtcVUK5qLGXI1ECEXtJcuaFVMtCmmOBBiFL8jC1MpHbxQ+4k2qRSUjP3JvFi0NfrsxeXbrH\n", - "Ebg5vBmNpJE6T+wdC73c70xC+Mtp+wYFzu5kfTKcL8d+Nzu4GlIr338e6SWwNSpXRGjfdLp9o3Ic\n", - "2PzMtQmrlpbEeUDp1vnkaZoqSF5M9xanIk/zohgoPX5++NN/ebYvr56WROjUeIUdsOf6nrJlmboT\n", - 
"DZEat6r4aY15lVCgiz4Mpb/mqSazxzrszmdRYRxGsW8DnzAAAAHfQZoCSahBaJlMCHf//qmWALFy\n", - "5oM61QiAB+cxK4+jNCOHXw6RALujtnWF0llKsvjvaSIz+44BdTBn8Dqmduydu0Ab2yYLL8rBa9BR\n", - "bM/WBrO6FCt4pfpaT57HiAbORTevnWHgnUCdwsiqbddvhjkiuJYbgCMD0kEP1SURu/b2Z5hWsq5s\n", - "eIdJwlVUmffx/GFsHH2OVg2kldaudIzyWEsMXsnZccvZ4+1TTMECSDKdUtlhUW9AAgPUraaePKP1\n", - "hatMAsKbsEP5g1nzjTlmyHjs7FjRbwjKng4/qsqVQ+s9Z8Le9mq44VPerxrlkKxdRgf8PQXTEpxP\n", - "gMR8UP9I/vRSJBbzTafYsMhPytfC8ESUe9ySga0pNZKSvC+bN1h7zO9OEjqF3rsnXJU2SZN7NAbS\n", - "01WCPkWQIdWN39TZ8BwhuM2E1/XfXA9OxCI/7PAG40Z8M1rKVJPTY+iwZnIQA6cEF3rnJVasn/JZ\n", - "rircnzzi1JQr5NiwthCEkD02k7GAoyHtF8lIKArvw+GqH7Ox1Tpd6DhPPJm2hmyijeFH6E+9UCJk\n", - "Iiolc9K3UW1rmUlHlF/p9jHAvsiiJUpuG/KCfna2LEYj9yn6P2oNlWfqq5P2HNtctaJeVRZv9Qb/\n", - "mNVjyjAAAAErQZ4gRREsO/8AZUEtk8LzOoS4AAhIFC88oI10PfUAs3UxxCOOtSzHREgn4/jgVfHt\n", - "0r483Tf2Y8D+zGlycQw2lUV6Nidlo0k0sASUCm4dEwF8Hb0+IzseFE0dYexJdLqvhcI7IIUIH6RG\n", - "uv8cjTXFD8CTksvYGpGc+uBYXhlwc3/jHhNGtm8G24uHniey+Zy/NtEpSl5dub3bE324kx+/N1gF\n", - "sU/CxkQF6UQWvd6Br4nL+i2L6udCLqM/JAVJhScc01UR/bE+NX2i3upx0qofgxfWL8unNZ/BP9Vc\n", - "CvVXAtxPw+0JopAnWMlwtBFG9wd+oP4zOIJ88u/VEvyZQd0JJP1Y3qhYk13Deyiv0C1r6ci1z7CQ\n", - "UwYqgUT64pT/hlIvHeCzEZxqH+WbUbEAAAGYAZ5fdEN/AIteE+hbrZmAAHNd3/IVGxTYP4E6C+Wr\n", - "63le3xAHjzqOqEil1tIAAUY3LvF62/277H30QskV8sEjceHvPe7bE0mfZ44avBY2gS0AAAMByRDk\n", - "EKOyh31Y2H0mdsy+zcGsPrGm3pHtO2riBcgILxHO0F5398HG90hK8UgtDUfp9CQyPOvDSyEU4WTb\n", - "6/WT9Z3aca6tb4C53W6p8Geyjq/mwbvNpnCVbbqIcx1ZT2+dencovmeYmPlI7jrhk6KwLYEd+5gO\n", - "J2YeKk4iWai6BsaO9+Tb5P52jBVHcSZ+Vws5QhTxkBSpdHlWJRcbh50V4ViVltwUN//XNx+jx2bk\n", - "KsfglI41FGmS2xAJtr8ZhKDk1VRRL2tGsNB5nztuRXCFd8q4MIuVVWGjim0ntcxZ/R18mzJZN+sI\n", - "qKUvfsxoaeZp+oIaU1hLeXzgcHEe+3/6emdZeJWoDNhUqhkfWzWzVZbEzUKpDBS9AbVIA5KR27LD\n", - "3HEfRMw9yt8eYILg7m/Rm2ubtU8u6V2QuxVXq1OHry5oY2TAAAABvQGeQWpDfwCPV5unds/RGF4o\n", - "aWlq+XwTSVpG+igacFOApaqyNJIXSXT4q7gA4DkP0YAYAumNCN0MwD7HSEeIsv3Q3L9kZ2RagxvU\n", - "jle4yQq6Zl5W7AgdlZnaBngH/w8xYsqWx5t90zzi7s9VyRY9jaNshfxuJAZcRgFILNTmQNCPoCtl\n", - "wyo5Ht91VCy2qSby6JDLeTD096PzM4KOK7/I+amuefuT0S/QnDNs952oi11JV2mbadqtKDqJE9x4\n", - "nX/OjU9PBP1uhsFLNkjsz6ZHlTOcsZvWUxabbw0HBNFuLXWIYqtAYdWN7c/QUoqY2IlVBR//v+NN\n", - "Bxf/rxPv+9QlTTeUOAVhzyU/kQACorW+VEL2KFNUPF85LUxlbSGEYQv/98/fAQAu6hKRw3yoJoPy\n", - "tyr7S7Za9gGurMYseuvuasNoB+fPCmp37VWgm4yNZQ0LM+8CPtaQgShVMs2/RIG2cXksHuYVqEB7\n", - "PJtzP2tl8EYDen8RohIb2UO5d/Xdc8aoi/Nu4IzGq8ApuZIxjC5J9bUYtMDEDA6eChGKPjb20vqg\n", - "2PRBI2fSXJrcSROGTC4m+VsF+VagO1LnjrakndEAAAHtQZpDSahBbJlMCG///qeEAVH55ayIAL6z\n", - "9D9Go2JR/VsPgULYIy+HM1JNQWUio64eqKV59gHDbxQ77xKGvVi/RlMeepNHF+Cplpp4rKqgivaK\n", - "14o0jVVjKwdzXmYfm8QJck76NrSj9rXzMi3Th9DbQ5HQHvlFr1+Ft6fGVXaubVoF+Bx3J4nvsWO+\n", - "FhXDphKaWh9geM/3PqX1TK4zqhRL2wKgDCWdLvIi2s2e48RSWR1zksj0SjkMINJfgjA7wVj0dW8Z\n", - "NZGlcRPjgkoSgpomI+x9/l7dJ5fHEj4WOkMQMTJnj+KOqaXfgtXbhBachZ0Av1Z6rh+qw/iObJOy\n", - "7q2gUdlftEWI7In7KZjqqg18Bg+z35wI2FmknOyXdEiDAPaFiRrhqkKOLfgLssw1BdohiuTGWlKn\n", - "NvPL4EzIbAUeS+0qv5cFdXvRjnn1zOMYTMpyN1CZYg4pqjj8mGtGdm1F7w0Xo4Mnm3hRmvZyyOaW\n", - "yf38s1SCwyOkhQcwJhrAAebvkxMWrAUWrTq9K9PdCUqFbMVB9+93aovoux8zBfM/WLangtLLXd/D\n", - "T9TcgY0eosWGZeAhQk2sxNC3bgvMT328AT2T2XCg2nG4jsOakPWfscwbc0zKfItj/1eXvyR2tk+K\n", - "fpgdg9dJ/OdcXINTUAAAB95tb292AAAAbG12aGQAAAAAAAAAAAAAAAAAAAPoAAAnEAABAAABAAAA\n", - "AAAAAAAAAAAAAQAAAAAAAAAAAAAAAAAAAAEAAAAAAAAAAAAAAAAAAEAAAAAAAAAAAAAAAAAAAAAA\n", - "AAAAAAAAAAAAAAAAAAACAAAHCHRyYWsAAABcdGtoZAAAAAMAAAAAAAAAAAAAAAEAAAAAAAAnEAAA\n", - "AAAAAAAAAAAAAAAAAAAAAQAAAAAAAAAAAAAAAAAAAAEAAAAAAAAAAAAAAAAAAEAAAAABsAAAASAA\n", - 
"AAAAACRlZHRzAAAAHGVsc3QAAAAAAAAAAQAAJxAAAAgAAAEAAAAABoBtZGlhAAAAIG1kaGQAAAAA\n", - "AAAAAAAAAAAAACgAAAGQAFXEAAAAAAAtaGRscgAAAAAAAAAAdmlkZQAAAAAAAAAAAAAAAFZpZGVv\n", - "SGFuZGxlcgAAAAYrbWluZgAAABR2bWhkAAAAAQAAAAAAAAAAAAAAJGRpbmYAAAAcZHJlZgAAAAAA\n", - "AAABAAAADHVybCAAAAABAAAF63N0YmwAAACzc3RzZAAAAAAAAAABAAAAo2F2YzEAAAAAAAAAAQAA\n", - "AAAAAAAAAAAAAAAAAAABsAEgAEgAAABIAAAAAAAAAAEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n", - "AAAAAAAAAAAY//8AAAAxYXZjQwFkABX/4QAYZ2QAFazZQbCWhAAAAwAEAAADAFA8WLZYAQAGaOvj\n", - "yyLAAAAAHHV1aWRraEDyXyRPxbo5pRvPAyPzAAAAAAAAABhzdHRzAAAAAAAAAAEAAABkAAAEAAAA\n", - "ABRzdHNzAAAAAAAAAAEAAAABAAADMGN0dHMAAAAAAAAAZAAAAAEAAAgAAAAAAQAAFAAAAAABAAAI\n", - "AAAAAAEAAAAAAAAAAQAABAAAAAABAAAUAAAAAAEAAAgAAAAAAQAAAAAAAAABAAAEAAAAAAEAABQA\n", - "AAAAAQAACAAAAAABAAAAAAAAAAEAAAQAAAAAAQAAFAAAAAABAAAIAAAAAAEAAAAAAAAAAQAABAAA\n", - "AAABAAAUAAAAAAEAAAgAAAAAAQAAAAAAAAABAAAEAAAAAAEAABQAAAAAAQAACAAAAAABAAAAAAAA\n", - "AAEAAAQAAAAAAQAAFAAAAAABAAAIAAAAAAEAAAAAAAAAAQAABAAAAAABAAAMAAAAAAEAAAQAAAAA\n", + "cG1pbj0wIHFwbWF4PTY5IHFwc3RlcD00IGlwX3JhdGlvPTEuNDAgYXE9MToxLjAwAIAAAAQZZYiE\n", + "ABH//veIHzLLafk613IR560urR9Q7kZxXqS9/iAAAAMAFpyZQ/thx05aw0AAQoAAjZrf0Z7SQAFS\n", + "RBmrGveunhOj4JFso/zYXaRjQ18w/5BhxFIRpIkBeRXl9T8OOtGMbM52JtIMXIY7KRr49/IsKi0w\n", + "jJUK8Z7XIFmlAjIU+jSbWER5LmeK+6/diSLijDB3co/ebDgChTdnt/smJJAlFMJhzTUcdwoA8NQo\n", + "YBnpXwCtHd9MDNyz4x4zrqfgfXAXtVDOuKqK+ZIROmkudESU5HAc84NxG9mIFkHTHpfRFX0vfuvN\n", + "v30XneTe8IilYhOJYkyOcVBz9L5D3N5P2RHbPf8d2Ia4qkwGurGLJl8PxjFsKE4dm+f6WYtxh4/M\n", + "EbibuuIVHuFVTrhDBdjGsnlvGJ613cHSu4frv4bqhIfOz9nOKI/zhLw9zlvfAkAek0G+jTz8be7+\n", + "o/ndntGdno6L1LXJpdgGJYFOyZwDpk3suJqu9FKdCFsjDfQ4s5OYpZkBRm/h6ksvqs/jKOI7H7Eu\n", + "JEDtMn0Px1875SS+KLSHaHwtTCNzTTTEE83rjSnRcLH2qekoCAzC/F7u+tWoo8/5q7AU8ZwbFyde\n", + "C0AcLGLOTLX2dctD5sMzDYlYtX/lYiEND4SUALBVfbetB5IH67pM/22hp7cM4zkyUfekvXZeKUpq\n", + "ihxpjZ/b0GfRGel+eaIkRAMer8l0HHBl4xOpdwEUiGEQqacmsmAKA7/Wn0I4FZAkAeHbrP6JQw8G\n", + "T6oLn8jHc2YBwe6YY+t5SuugRFwnijdFTQ2IYMHZ9spzZjJhn/lftFm13UY9ay8CDty2j8dXZfss\n", + "pdN3RSB6EMFrirN6yUkoxa8UPGBKHs9MUFO5MnKDgADHT4JhBGInxUASlDV0lsFB0GH9ED4tkRc6\n", + "7SnaMmZwf9T2i4a1NSsheM+jHEQWr9fgPDBABuIyToLYLrnVeLXqSC8JMeZigh4GOpQKyiIsG8oa\n", + "f6kiBTwG/5RebTqU6O7rrQLj5Wd5YFdqaacUZGByo8AxJ60NHIoQcxeNjsWAj6m8SKd2+g3en70+\n", + "zVQW9HkvHI7nnRF3FhwhZYu/LvproEPyWSYykJIx75ojR14WE7oWSjYs0X2AFiwEouayVGii6owJ\n", + "gdlCmnN8HoqT5PPnaOWG7mPgq/3meUuz982ZX4+4VMage3Fe0K3cqRdKLTge+gs4pyQbSUIdrgo3\n", + "4P4R1ejF0wAW1R8YjLZz6fQUzzzchgNN0t7aa8tlO2yDCmII5BbaYJXJrRvBm8Lb1m7TLILNalgu\n", + "RMjYD4Pf/P4iQqWsBEdgB3p334RMzrBfcviq+49N2SRQlYxV0SbSMdybZaH+vxuw+VyvLt3ulEcF\n", + "rmBwnxL4kpGATPv8mogAAAMAUMEAAAI7QZokbEEf/rUqgAYz+kaAoYS6oZnCZBWChU49QzRvBVh/\n", + "3Pl1tY/3h6ui3wW2qKCfpdwQ1h/uuKhRazpong7+Xsbw5g3mv3E7I0N68sUiey8Dbt0hMUrR6zYj\n", + "YtzMQ7gEdgcbbOEgu3H73w44JvEzvgZ4iO4Q2Kwp7BHY2uxxtdUENoG1kHXqnnQawFSCHZ9W6pRZ\n", + "ZX580jW/ekv7tzX5SLrr2mknIiIEL/9OqO/hdKRWyIS92L0VbeMgboQPIpdXZEemH8ScfWR641oo\n", + "Kb2ZqixayrynX4qeQdDAXvtKdnTPfgTsOJHs6zrnaaKb6SpoCg9ffzFUfiQ1YwLPZpLhwkJ1F58m\n", + "QtliSU1LCArOxcL0CdX1xv0PO1XbIga8mvD2ON78HrYIlpd7r9MIJUgGiGlRxLTUITjvxtxjLYBG\n", + "TBzSQ2Mqy08Y4xvBh9/AZrWGoBvplKVOooBAXsS/J3OngcAaMApnGniTlEgacIB/4ihqQm9Zync1\n", + "WrLEldONGr9K6gbteZcFnK/hoe6B53agN6YwjF+Hm1IYltzK42eiNQbmeo0nT6xx724Sek57Pcpp\n", + "/+64lZEYNhMLw61j8cLCmWJLqJ9+OlV3Tu4kvqWM5A7mBmXunK5EElFvFoiaHvfKnFzVKUZHVN47\n", + "dwwOu2bQK/GEFcs57H1A4Ddl2JAlJt4ZWgrJx+vzAgyhhcl1LtQgQcd3rX3aPisDf1CYETnay05i\n", + "xe8yUL0AVMzI07+lqERP6auGU//nlrslfAAAAS1BnkJ4h38AGAsZbANezx+IWo4Ni9MoMfKTC08P\n", + 
"cqaDTyueeuPLGgFgW9U33kZ+Bw1xhP+VnfaIAfTxYvkb1WNMMRMsh5PjwSMCmaFIlQvFeKZwdgkf\n", + "0eHuoCcg/XQXRqCvEyyYU7Kr945fY16Tu/18Zd8NU8RAJRLFspmBVoIPZ/aTPmSIXDq8KOSzL6TG\n", + "sWN+V8RKxGwExIfHZpdEvHu1tOeg+pVzKTracfnYiBxxlkuVIyzOz2mFv1LQ72jZQGocAdWS14tD\n", + "EtCsmNljiTGQDRggnoajq8kpnFHws9ZMWmcsV4dQvczexFmx4YibNvvMPauj3CH/KK6FXvQFumid\n", + "ftiga3Uno6si2epmOuEVTuVQwXsgCmOyejpjAiAjZuUS1zq40WginD1EPNgRAAAAXQGeYXRDfwAh\n", + "r6zZu6OyBrfB5mVsAz3QNRRqvrwAcnFznD7NXanOaWlAADNOwlJX/xGmO79sH9XeNRT/FnLuEPBH\n", + "1GJhJV/Xt2R0YziQPpgXV9BLMr5IaMaU9R2CpgAAAPgBnmNqQ38AHhCAmS1kGlkSnBkADoOXdXaF\n", + "NGZr+Q4fCvQ7bHDsrrZk+gghfDnB3EgAw+hgyCz7QjPCBdm4Oua2VioU2d4nUZ+UABLNnRNNghIa\n", + "znH4EU6++iAxhcURNicOGGgil2sQO5YirsL6J7S/TznXYcILcn91E9qrSkdqAKeiqMttbt/NlBlt\n", + "zFtTLIQV87eeTgQtRSaGjNkYcjtT9zsSroMxdQkaS/rgzWfPKqioru5///iiFvV7FHhGNapsB8Ep\n", + "xA6YqLEIyfxd3iBKiJ3g/96H/WMQrMVl8ykLYh6g9L/mEknpMxDRuX+/d5vuR5TJpN2l4QAAAY9B\n", + "mmdJqEFomUwII//+tSqABipnkgGrJGhoF2xhqIGFJgrTiV28TOHP6iMSZwA4LzauSvgcy42/qpKz\n", + "PF+GKWIn2EJeWsQWOqhnFWAeu8Qy08RHEYzw2BIfhXKPnsvQ1D45gRUsCZjYq85tliORVeVqHlvt\n", + "fzWrMqI5f+favhs74Q/1bo2ebSMVUSFuP3HPqFVDjXrf/wjJSgWTFPNzCZtjDghfnhYgAzPVh4sd\n", + "mfpnfQi7UGcAu+X0SPRW+sCzjBKyZsabYXRLvCvcRgXcWHRJnqJZ7DbIL5Ahmra4MUmiAdrDqxi1\n", + "yixz8Ge2MnwDKePhHbASj9FgVyabApZmODkYAk9x2eNsu3NC/GWuEsOYUEJXb3NkJ3H0Ehpogb5q\n", + "/7IADF2Rk2r94PZTFE6TdqRa+DeKrhf1PoBJxN2bNx2sA7Pci476Sn+ZpPsAPTlXaikJNRAhO4tD\n", + "lakPd29Edmfvk34bCqY6rFMuCfUJ3yzCy+VRKB59CtgS68dVzaJO/FxZ2Of18yjXsScM2fL16/kA\n", + "AADDQZ6FRREsN/8AHa60qBaQmR4IRAA6Dl3Sc6VtGJbtr5vbN23f25BY5Mbt9ZodJaqeGLgSZDt5\n", + "tMt3+exLq/o1or+DyDOaUjfDuI6HO9EMKVIFrK5bBNySwYGQ9ZOLXviohcSZAskgQCT8YbljWqgY\n", + "W5O+m+Ip3OoA9JMxAp4EiGRPR1hmuQDeRomyGX7bvvzp+lmhQcgx50Gtf2FsWph71RE5OIfz3vbU\n", + "YPJzvstNoHMLjQVN28uexbTk/wUswGjCQ8u5AAABFwGepmpDfwAhvaAbJNR/9ddNI1ZNZPr5vm6q\n", + "XTetXH7Eo8GqFltKJbOb+WxFxg1OZ9LY7Pm4G1n+FvJzAc9iMK3kbM6geeeFIdRl75A0UZYsXIff\n", + "dQXiQxB/kP/GUeJS/ghHdsFXhovY2ei0jBYXhl7XCQdiM+OxqVpdBNYdLY+vhvtTydDweWAQhmfY\n", + "3fYN3w2o0+YtvleCAQNIu+tN7OfSeOifT7EOLQk4YDYkvT1QcI6scYDf1en6ihiP1DSq11Clzx8a\n", + "ja6cddGuoMqDaNkxCF1dzf2Jvz1VA4BpWPjukcCUvSBL5Hjn5IenmZHNevhC9Ri5TKMMAK1OUZos\n", + "eUJttkHLI36Z4EqqgVQeXc7fMR78LG9GpQAAATJBmqlJqEFsmUwUTBH//rUqgAcd7WUAG1wL+eMP\n", + "5NbNjI1PanDtCkQqkSzemsYEjSdqyjDQBhMRhcVkBjrLnQ37QRY6anUo9HtaOXKEvV3Oq3t3zJnU\n", + "VnRnO4+DsYDha+hVjf2RQfz8iIHBAMZBzDCidKRjdK++FyTTJT//wjjoyDzrLD81EvvOEfP1hNq1\n", + "E7Mf/LNi4VzZp3xaz5k3oYD4Uh8itElOoUglEcP1/ghF2UcJA9hOtkSUpVhA8+T8Ytc1zpVMfYyg\n", + "QqbyRa4EvI2+PCgNWtypZmPOW/fUb8LPNYTg5GLhzbOmSjYpenEUzkib0QksNLKbj/E9aHrV1qHX\n", + "qXiny+3UUPxYGvj/pDuYRozh1EchMNkv/eHEkrQhTQjnyxDirLtyAwkvICbz8w9UK2AAAAC1AZ7I\n", + "akN/ACK9oCBuM4cceanCEEWpV8cuy27lpLcHp0RFJ/onjSEljOG8VqS2Rkf30kIRre+KMlNGVcvp\n", + "cL4orO6Yp5KjC/RRBwQz/yE8UKLNeO0Y0FFhQfICXcBtO9ndieTXXlspFHuGf4S6CeBKlAO/lDFn\n", + "Bm6rf4RqP1vvLrD8KUBlig+AFH77l/U3BNsHxmcjURJ4rz9SBUp3dWhkBmKNCP57UtC9bKnqFyE+\n", + "YvACZ+sMCAAAAZlBms1J4QpSZTAgj//+tSqAClE1egBKEwbZY3t792fWy96pbeQQCnoXHta8keYB\n", + "6YD4iyrisk5RAGXAP8hftXkqsIp3gIADtqeyulunIxMvA+tHyMYI4mH7Ktx24JQCDLGwr+SW5Lfl\n", + "LFzLN5Z5EpfMBtjuN1e5MGJfkKE7RLofReD1fgshPg5Hiu3eNzKNtXPqCUQOQrANHyjLVDHW1On8\n", + "GbpMg//3+EW5h//MyUrV8C3bm65GCPAdr+IiAQS5PLqRpJaqPFXYImLzCfEF4IcxGqfKzcnaOGUe\n", + "P5zhUa+at6SYruNLfSBlr3+mvyhAAxPUBpQBX3a2ZIbz3QLaxiA/KmUnrCDmuWAQmEAoRWFYDkhB\n", + "vSu304LzlIj5BSPPqNvyTdiIsLpzAu+SwxleN8rOU8p84R24aRhgQwchoF64pWQkYvhDlixS1XkC\n", + "+1BFsz/ugThqWNrj6DMWcUAmd8tN3JWA8raGQmJpBH1Zjd5483GFE2+DssYAdvIzFktdYvwqJy33\n", + 
"xqAAiKb/jZmChnRmwaKmyp+usNPBAAAA+UGe60U0TDv/ABgTM0cFpiU9S5COo+Eq1a5EDpKRq+6p\n", + "lSs4dhBzMdhHGYju3Syu9sir+n5TA4S4EozXRjp4djOH9s6Ebl4mnuRqUkAVVyRRxloLXXdAVwvm\n", + "Kw2kt3nH3KtGiXPZtoKRlLMwsYrakek54VGjJMSSK7z2j4bZfzdU5fWILhtGELYhukSGMv6CXtq0\n", + "ugZLCx24z5CJjXHZ6aJugoOXVvLE5AMKcYDe/LowGji7OLeFgeB849mfSaUGlnh7jxuhBOU+fRS4\n", + "p0ITI4vXzUUR4XVTQrOXBNie8HQwoivm+WRv0nW15Zl5mZ7wAnqm6XldppA1IAAAAMIBnwp0Q38A\n", + "Ir2gIG4zgb64sxYLzhi9P+r7lwy6Wa7RRkAjTYM9mY6ueOaRzgw6T2RlVKQ/Wnw9OUPsoB+98v3K\n", + "7Ai/8Ku9oiX4fIaC4XxFxl+0lQDznNsd4UfPo3AQh6FoBHug176P/7mBbtXW9HioX3mZhTRXJOlh\n", + "Psk7HP1i1klJ4f63KMPuZvFOjkq75Z+u+/aiOQvmn6+lP0r2vSaqs7nxNSGwPqSwNXaUgQz58aD0\n", + "pB2v6eKf+Yy3eGu8f7HHrAAAANkBnwxqQ38AH77opN4Quy1TZxAAOg5d0nOlbRa1oa+CUrbGUKO9\n", + "s1K1K60LxAZlk8ZQWiHU0UUuQDnHAAyjelIcwOj4NipQdTlRBT+HrLVCVEK5smCT4WEyhlST21vf\n", + "pS9QIx6rrJJt1ZwRk3fLMy3lh+GbSU8p/deKiRgvPKu2y5xljT8HokdUfoJBN0b+9AYNdPwZxzfv\n", + "wRj3rjB+XbCQdH7rLOmVBWtc7YBBcmnLfJ50Xx9vsPrIGyT/orCu88gDS7Q97WNMWaRoINuEV0SN\n", + "7lASQ8YC8xeRAAAByEGbEUmoQWiZTAgj//61KoAGg+KazAhO48Rk+mELCfGa3jedcL7j4gDd4k3m\n", + "hfDQA786lCeWa51/s1J2qe/kkvnBjg4L/5tqqnPuWzD5CtqsuCrBZfD9tieYn0V6h2QRjHTgf2S7\n", + "KbBJVduRkgXz0DCyLCsDRdQx7ZVeilFNQPYHPpL3dFbV2ZQLhZ15DCVv0ijUbfdtbaCxQWk4hFwi\n", + "4Cl7Vcv5eumMKNjbBf29eX+p4vfxRMeLxQVGLH+o2FLpf2SZwh6nFX8ReHwFB2aNAZojees14KLO\n", + "dDXVOKLwRfawG/F4iTHLNjIHr9KJ7RMP+ZW2v4UodTEwj2IkfoeugjPYygxsYBEN/HIWo7Lp4BiH\n", + "W+sGNW6nzMrLHeZnfPrIXJzjKMZ2dMe3r2TPoxLKTVgPHlFgXbB9gOVEkvjr1YtxEt3sHivjr7TH\n", + "zrmzrXSS01xk914HSqt/CnYSKPxa2MF69g9I/BNJSHdHCdNGwRVm5U4w/DYDySkJOTHhPK5xLTdI\n", + "6pomON2J7Snu3IFO1cMuZQAgHAwoynkWURtTVoyQbA1o0XW4HcVte0xmLSUrxW27KPhiReLpDIah\n", + "P07+6UwIug2Iw2yxWwAAAP1Bny9FESw7/wAZUxOT3tiejYgyJDRrCYHaMUHhX+buBbaoqZ/1iUWs\n", + "Jb7slI/imiQ6OnWj09SEskbfc/zlMQQ4SNXZauWfHJ95XYh7wMFGgh1p51IG9qMewyJwQS444Zn2\n", + "viLgUg5+yrpXHCf0t8/9jDlbqwjDulbT62pdxpAyxuynsO8RFT3dUKeSE5htp/jbraDowEdpXZyE\n", + "hG0WYkl+RbztI/PQNZCwZsz+nvpxvKr5XHM1hBpXHcYTolc3yg25EknXG5iovx0Y9EuSqthrt+Xw\n", + "mK43mYVJUVC/Oh8GeZYMuS8/kSjScKjb9J2cbfyAxgmK23G/LX345QQtAAAA2AGfTnRDfwAc/TTk\n", + "s3FNYSmNHdPgDfXQC1GBEwJGCqSU6MsmeFhDrrArJ4DXkS7h5Olwl5LsAdAjNSMWnsyuwfwlhiS4\n", + "Iu9nXiMR2gsFQTdJfxAGWv/oGKrfOpY9OM+oH5mmAEYRbo0uYIZjYyyv9H1tg0RX725ktocEeT9I\n", + "3B3Tp4qYCOAxN7JPiw1LGqnL098ntFu5ng1+yPoA7ayjGtnhqUNzDdxHw06qdCQZykRFXaAS2mFv\n", + "lmomA2wH7gnlU4hH+9/QtYxMog0PKOypGE94HJSUfoT7gAAAAEEBn1BqQ38AHE7WHA5VnN1RP/m4\n", + "B17wBGTsyVXKs9N7WlI9AxsJJ7v9zVkMjf6pvv+Cg6JoQ3BLOK7r3bcONYUtZQAAAddBm1VJqEFs\n", + "mUwII//+tSqABlJow5npTNmtYD16z8AGI7v0s/GnfyqOWKggEMwd90EmHsgCWksYKFE4Qru8Yv50\n", + "LqOKJvWMLHGzKIf1mWoops1hD8q4hCLJMEdRItKEcO/AvOw75DCgogAQMHz94YdBlV1FB7/3PGw/\n", + "kvp11c7Zd3bjgbTV5f9wCrj5V98Wrk1QkXKTao3xn1WeAORpyCtFJo3KIIzvry0ktsvXmShsZdHK\n", + "SF2Q6qY6Id0i1QRrrPRdF2iq2m2rhv1eY7FLgTuR+kimJsshiQFr/qQ4tOO2msQRBI4huY4JSA+L\n", + "KftHgweMeBwJfCg9ocoILqar/ZxuCC1Kx59hrQRJPfm8amRIkwU/k+wKJNYh9fLLSBsxlrg4XoMn\n", + "PzXBXS36HS/Vq/PUU0Saj0Ks8oGCHCVcz3eoIxgiU+QJY/DixHlF4+MYR1JrL+dYLi5XU6rOa8uy\n", + "cymZbC8fCrT8nFmCuYcD3DNSzmKt2Ypk8ahqcNxMHCCE377w4QcAAK8hLicCDiuo9KVio6ugqDQM\n", + "DiWya9QmBn0ClIbSCznyVdfSZyODo1gjrJ9IiCMcnWI45hcgB0F/w3f4fUDX3TFD/vbMoTmxwMKV\n", + "hWEq4XvI4IEAAAE5QZ9zRRUsO/8AFKVUcHl/E43Gt6o4RZvBs+iAp/X/n7d7Pz7RdmO0J7CPEDVr\n", + "YOGCwg4aa5sRnK1DwPx5sIYzP38566ezpK1+yb8tpnK38Otysb+fPORXq89pSQ+5zLmadq08PRPq\n", + "ft5b+CuHdsaohxgMdfr5HBiNNodd0VK8TNpXmgIXzYR5RpK7ScM1kMS9Nv/EnJHMV/HrvGwgTDTj\n", + "k64XWbP6seQRZKb98opQD+okWzwHsAFj5ehr/ekl0IlB4NOOkEs2vqjJoc0vIcwkba8FSFkLe2wm\n", + 
"HNG8c/q9E5Tipy3avrHlLTvT0bjPkjeD4HLfC3isImW2RvjzyyF2TiLuxINvE8y7u04RbyNnhNhC\n", + "J15BQDsVja0XtFDfnnr/h18foOkLRpLJ1yQTMBboYsOrVzSZ9GDWwAAAAM0Bn5J0Q38AHQXz6rvN\n", + "uarixND043ZCNdAAIHUCWbOjp5TUpZdEciERk/s2Hj36k/1QHuy5AO7bU6FcTtkwLNXpp4kEhhr2\n", + "pj14tuqcy7uq8XfveV+qzHFw516IWJuk3fnleTKVnyg4EmdGVkh8uUm8KAFIin8/UzurGkP5FXB1\n", + "JS0uIqtx2mbD94hCpeHMsXHXmWbW3GUD6bwQzUCwUdgGFWWOBIzHIH3jzzxIIZ0rnTzx6fd8zSRM\n", + "hMrhmhy9AElVESMBSl9RUVwHxFBAAAABSgGflGpDfwAhvaB1qIOto5yaJpOYSSkbksLCkPuZStd4\n", + "LeT7CV/DcB+jLm/y8AhlFfeod4crFEXxelJR/fWiWC5cEAQJB3xoICKkbqYOm6EmFwfhOJrnHL3F\n", + "i7egoJ4YJywxTcfWExKLj/7q5Qta5s9pQnji3v49xEhquy1bNbsP/0r8degDcM/eCvveCCuWJP4W\n", + "kmgZOsTL6w2RcANA9FiGFsZYFgwwIJNSoi5uPhHUWhw8DgpZUJJwhbcwAlrJ/XkpDgMQdv8+KTaK\n", + "5RNrXWUI+DQboZuQqh0EP6Ucm1iy8BiBubHVtPfvfM6aTMlQH2sGDo7kxk+QnIaS5zzgTFrv32D9\n", + "yKVtBoqoPJ0AuZgM4FsUTuUjy7Mb8fU+FNoSPESiOFS3CYbvMWBzWtiplx16c8G+2sTGiL+yia5h\n", + "U5UjqF9tl+DCrXkPmQAAAhVBm5lJqEFsmUwII//+tSqABlvipo+ln6jP3YEZZAIeN2gdAdBG93Am\n", + "88+PBAP+pBG1b08i0fIFrYTfZkz4SYTuxIQ1JlthBpef+blJppNwqif1piWVs/t6bCj9Z+mNxSeq\n", + "fY1/wgLfvSZhz+cH951YQ+3lZMxDj+AnlpOYgaA5ONYw7fbC4eXvAp07e1QLTwt7AKsxs6j/dp/S\n", + "ROqifCEiS8aS31tyrNd0WUbq8QssOlpj1+9+m64Uuc7+f7EFYNlp0SQRRU2ux+5kBFuUthOQf/99\n", + "ODAIvGEvExgFy7U9xycg96i+XWorpOkUsmc8UuZbMVhIEf4MYVuxmTzjhiOVDlxwcksj2gNb3xa2\n", + "pmXlh1zp/jlUP6lnJbCcR5jJhGaBJ/wuH3P+rOiJDpAwjSIE4agxxO9XGnmQRqhYjiBkbby/Qs/C\n", + "0p6IlpvwhBITpwXRBm1mH+MtJEskEccmYaNT1YNO6b966q1ndwWmG4wqG8yXMOLAMIGnxTjTIpRG\n", + "9a5Z9Xdl+HR4ndQhvFfQ+mQNsGUdDPAaOtDr9NfsDESdrHz/VFsWMxlbozv6ME9/FBsTE8SLTZxK\n", + "uKA7LtdEmFdsikvrVwkDRWs6mlddIWSLEJey878D400I9Bm2F1YzYF8hIer8urpKTRWH3dl5Pnql\n", + "OkpPyvm3RplNwN8DaGYvFB3ajEHHx79ej7jTTF7j2dZAVPOuzAAAAQNBn7dFFSw7/wAYtYg8t2YJ\n", + "aBl5mT7LoVquTMWPsAY8JEk7n2Ltj2VU9Y6yhnUjGblNmyV5I1tDP1WCa31R20KBx8ZAPYjEjgAl\n", + "IBPsF6gwEF1mGQPgwIt+DQ7Ltrn+WWljoOZe6qmL3ODaEJKUCy9wZy8Qi5WMsDYzpEybVU1vipuE\n", + "rsjD5epFom/S3CRpP+JRc2SuBGV9X135AtKz2dAbEFqb0f/DUfvRpyE/xar90tpMsUisBmDyfPqC\n", + "QCIWsyVA62u0XX4SHuuo3VkmdASLaLWJS0hWsThucD2h8t0xx4j3t8tQeFkAoX+vhWm72BA6IAOh\n", + "cP5AynBLYvgLjkBSaw6ZAAABWgGf1nRDfwAgt5i6arm7oDsF+i9EHiOJ6m6rVkYAHTQbG9yseMuo\n", + "2+jJx58xpeovc881Wv+6nIPwZiRTONb2IQaBwPwYP/UAnKjoweUWtNn8yjj61Yi1F5n9oYReT9vo\n", + "YNykd6+UIhqXBR69VB8JEqms6DNcB++Z+7S8cRY1PTjUFRAm3tXpZtcqOC46Yje8Z3mZdWtke57d\n", + "wfIWf/bjH+PQoHPWtMGigrlGqEUElC6TETXz+nB7X3pF40yVazdjxa5pCPS8j1Bqo/RmILtftGxN\n", + "Yu+1c8QTzG5+3qHYIB5lZeEW8bNhQmHlV1zck8pKhAWM+UMUo8Yo1gMDIjGuUuNGCTYOoVand7oO\n", + "JxBESUm+840sI50gEtqO5mhNaTQVfGrhYgQvynil8I63rBmEOncCHtkN57Vx9gduQDjk6aOyO6bY\n", + "qsBt2jiwg3SW9pmMOjEKBDS6IfMiAxcAAAD/AZ/YakN/ACK6K1xrl4Eswd4/m5m3eDoe6aKYRGzt\n", + "qScyJrEz0/YMsioeM46osJc2N8un8CXkVjpps6zgsf8LlkG70ab3ccrB+um/wXzisesiYCwJDgAm\n", + "D8ODYrLA2f4XQyaEvxMLwdPggFdV9SLGW7IaDs1Gj2MKL95CD69ggFd4PlXdr+MMXaKnRfCfYej6\n", + "jyRkJ6YHIJryGsscniQRwJ0d+J+1KTOriJZQomY6moOkqhpxON7UIyt9lzU6HlHOyQJ+oRH5iOIM\n", + "+hKNz7H8znQxxv6dKCBY67rZbPlwYKywoLx2OIjAEQohlh7LdbGhKMy/zzEiJYFobhp2mH1gAAAB\n", + "WkGb3UmoQWyZTAgj//61KoAGC/pGgJ9CubE/Hy/U90CEEMEEbF2Q4cnB3oAeksXBYLQl6DX56J1l\n", + "w/mHq8WxaGt2MnAvQ41YNYO39iE6FvpuFKpW712yS65PLr83LJiqo7HZlMfRzKZN59Hb83g9Yzjb\n", + "LItfty44d54BI12++V5xh28HT7V7r0Y3bFC5OovybNWx1HQWDmvmM+uWQT6BKmA1pblkm0jWUuJ0\n", + "KAyepKH6sPnyIzz9TF/cTcVBDLcJ0ebq4QoNf0i/efDFq1nH+LtoZFDiLpeCwZkCLTOE+JMjcVxC\n", + "aWP/XfyRHhNANFDKtoVePLPasXuBVFa5xCh3bB99SWFmaQdxLlk9zHTMNOyCWoiRa9OkdBShrOe1\n", + "dfGrU6t4YEao5nNo7umRhNJMptOYWcUtCbSBQmV/4G3c/zgmpJb1N+5bNROg3nNApsFhNWPnDxXX\n", + 
"YEcAkKEAAADvQZ/7RRUsO/8AGBSepWN8xnNsxE4oE6H3s58lr1m+iqw+EfUFRD+Jna0+Uvzz41Eu\n", + "ATVBokoBIC1dZOqsBeTj8Ij9FIuxNitjsFqDL+DuZwvmGihDa0HIS79MTSVw/f89Ulk3p2M2jbij\n", + "TpCkIItiAXbWCZspatvMx2+GoOmu0/Pjqc6iwrXWXyi9/N9Jj+yY/ClUEyj7sTv82Y9nVf++GCrf\n", + "1w5ltOrH9rRQKpUQaVxp4gxcgxC4qFFOgMxs83r/WkZSqY9kO/9UmmCqExD/ljnRMUJvxp8FxL1d\n", + "H7PGv4WLI5AeltB+MOGIOr9NYMAAAADwAZ4adEN/ACG6NY+qIzQfcYKCb0AhP1JJtQboSZcB2Ux6\n", + "0kAZypUjTcd/OmJjJuZBZL4W6I8Qwzms0HJLp8KRrHdk5GfU6sWQ2Z+fhfAzgzC1XgPD4QBqkDkc\n", + "T0sPX8iasgf4/DARkJP486Pq1cqH5kOYBwnnR907+n/qb/xaeHwouVk6h00s/qlqepq0S1p/xGR/\n", + "GdINVBgCemrU+PPAyI+EQBjfU66sma3ahiVaLQtsD7mxr/vZVvwLqa7Chr1J9NZveiHKnAzIMG16\n", + "G9Gmkk/8FUHgdrIbZ2heuBDh1KQSBCztE11k+ocodRJkiMj5AAABBQGeHGpDfwAhujWPq8KUOIXq\n", + "Yi8pfsfzwlVQDEG6igccpABq5mcqZlBxZf6f05WsPP5oiGUHFHfSykAR60y9PVPsKziKYov/dHwR\n", + "Kft2Arvz4qT56TCewQ06i1++DP3k7arAvxqk9+C83xiDX/XWrTHQ1+jT9fNei76g+LJLvs+Z4UVk\n", + "oEaQ3c6fXvOR9+Md7sWQeZnYPXpC/0w6s38iG8bM/+n0jsTdTFeBwE6YfrCAsv/ybSEXYS5eoPM3\n", + "f/HRzfWrUb9MZw2WEuoxs0K4qVyNiDTxcyb1DdadbkuzwkaFG7T2ZM6Pebp0YyXRqckmxx6YTGzB\n", + "LlKwKmWHeooj6Lm9LlzVgQAAAaFBmh9JqEFsmUwUTBH//rUqgAYrWZggqZs1s6MH6FUT684nhne8\n", + "ykZKf89h+0voVegpTcVlgsFoS6xwNTcMDCv9PiwISM3bG5gmdpPxwsd2af4u9VMbVGyE78HSQ5M/\n", + "nbkySYm5CPjed6c1fzFNEjUv+hlxYNfv3cPYnGT/Yav/5erFhxatniKB++1xw2wwwm3hwteUjAt3\n", + "Bi79ySg16ijYqJM5fa8+vosVJZysXRlnbW7/ITdmkkl3c8ndruo8FzJ7m8m8z0kOYciXI4QIL6Xh\n", + "qroOcvOVcWB7Uug78ZH3AowGQXzMbzVMrLD5Q7gJi2vHbYwWBG8EpVzYFtaj2m+v5trtiq/wJKtt\n", + "WosqXvVBFnxrWYQFjXg41D/ASyQHPzn2WsqemfWG6/EDepgeax6MAFQfxyDScuq3fNmr8jf0net2\n", + "tjnK9AbUeZfaZDCLHpnptMZuk8clMx5Y+UVSA4sRK6q5yL86vVu3TWQ+TGs9ZFdT4m8kNBPSkwSz\n", + "rQpsGSml5JPzqe84pJi6yJhqfYRsb2q5mJ8tkrUntJCF8lR106wAAACuAZ4+akN/AB1RsSI82HuA\n", + "EDVZr5mUHFl/p/ZTcmoRWj4TfRvTsYw8OlDJB7dvZ/vcXyur4LGUumPqBQUBQHfGq57+bI/8tRzs\n", + "Z+nHU7WH8qJ9BM8/NBixjH12m2oVcRb4XvfrX32V+Y0hU+0j88MNPEcdX4rv7aeeep8jA96PadWJ\n", + "mSmtmcZfJIFp4fz7nGsOeHvsRUbV0MKDUYmKN+mrh03bThLfJGXI3U9Tnh+UAAABmUGaI0nhClJl\n", + "MCCP//61KoAFm+ceSLbmAtKM+jG0tYuAZBSWLg59auQBOS8BoT1gHMsjZkIU234iG6WAeSbLJEu0\n", + "KCLhFA+AqaJQGzw142KKgdSAFtORqvq8YepvegTzCCnS1DU11oB/GUVDtDnboQEryLd0x6NUSSMN\n", + "cECL9Mzb9QebAeTbVcgtE4xPKr7FEgVH4vbNIioC6rYN5svm+n7fErwoxd1c4B0MbzpTJ9ypWCIt\n", + "jDqP/6ecCXKe8Ac6gqcpyPRaKmFcKdx7byHCFs3Y36UHxsmpasB5iKonQtfou1T7ViPEDD+TNshw\n", + "6ncI9FQOyx3EYxNs7CdmXQjjuiQ/hVztgan/8HWeS5jp2zgzBv5BXUEnWn+A7+FBONSn2LL/uQ/w\n", + "xRZTcRa0x52ow/V5cvgKu7FATp/RCkX/G+w1Qnp+0VyZbVkCutQ1yOnQYxf79Uw65C1zWPQdQMP/\n", + "K+VS6vPAs27IKeqUeSeiBKHv/3isIgE+rjxQbN9Lh1YW9R/9r++mSeHrs60NzUtdlXFG/VIZkaKd\n", + "XMkAAADXQZ5BRTRMO/8AFlm8HmElw5CLBq61UEezfOfwLuaBDj371pFQE2TaGfrDL2cPvWN1QZqb\n", + "tmH36IVd+buOk4nAS7OK6LGtZWekVP+ro0ezqUL6LNjplSKI15AkcuTQweCsbYhrSLoTsRiawYgs\n", + "mv975sfbTCY9L8bxROvDNcwG30R1+JWvK+o/hwf/xA32LhBb08HGKIsZFejSCR/ZACyPMiASYPKQ\n", + "KnKHiabUDVxwGq+/saT475SIsPn2KAHPd1oy/JYI5la+DZBAp1lqCWQj4yUkciIB5BAAAABzAZ5g\n", + "dEN/AB8V9DqLglnogAnlbAbcaeEM/+Dr1d94BLu23/b924ZA1vKLZ+NWO2PdXQ6go3Sf7NA4nwhe\n", + "Jfk07l2+PnIu+kI9sd8bYLUmTTByKGfoyEUnQqTPIf5dfjB+AgnVTc5y8pWcKU354gRsJCt4lQAA\n", + "AO0BnmJqQ38AHxX0OouCWHEND0XeNAIAEOFUWlDAA6yKdnA6h0XJ5AHh6k3PwK41LuRgTA6dFitc\n", + "eGcLOFImUAXmZeNXd8BBiP4Y7WDb/nj/8t7UR/ChuIYJmbMzvyMcttz9Od2nvufuLeTpnnGxlC5D\n", + "sKIQ4TiAF1Zf6Jjc46nP71VK4g2t6fmiQijizaslPXbGXByTezIrwT4YraOsiMH4GMwabs58JhIR\n", + "tYealSfNunZO0jU9FNwqBbfEknuQIRSATwmWr49+JU7MtkfWDJ9lAsDVu2W/43LTVqxccM6dY8NC\n", + "EBnYMhV6U9uYbKYAAAGwQZpnSahBaJlMCCP//rUqgAZTWZgI3NAzNytjReukCJhCqRIQrgVE5TFG\n", + 
"RpO1ZRhoAw39KCX0FTF/pEpCWlYTREK0RX8M+i/Zkz6IOh5zRR0GMJniH0SeRA8U+ZBIRrL9Hl62\n", + "8kZwKv6q5Netv/8gTYt8wrrWIwWANbXHJaruY4G39urxvB/yx7ozBV54M/wmK8P5AgF0ljjPQAUZ\n", + "DnLEHwmopi3rWM++lGz+7pSmghGU/3PNF3AxzoRutm1cdRdLqAFKdPRrKeDtflDHW39dHMmsizA0\n", + "JAD4HEW4vO3o1CbLX2IxlZFPJGuT1QOtzPR7lO7pJCxfeGJXFchlosXXXbYjZoXRMBBKcHqbIWa+\n", + "lcjl1FcSEXbk84/WCNR/hEiDPBQ56Zc4Yg/Uu5te5H7B3WBkQkc5+tttienjQao2TkWT/tLarBIb\n", + "fSMA+83k8gbv1oyeFIIWqR6ZYarMVbzfFtnH/fWhWkYB/el6Kk3P0OPSTUOVwdEnhQ/ztu0l8Ij9\n", + "PRLg28jDAaygyMt+MtthW/hM1h+aETPrMcrgZoJoV2dKCm8mLdDu/CmksDfLJBRBAAABQkGehUUR\n", + "LDv/ABi1i6Ag4bMBZUwXqVJnyx2PYc2F7FCjvy82YHTp5//HJrbZhCcYERymRfl1ah1T5z9noaM6\n", + "FqCYiKh/nb1NKcv6lay4yu1An9EGWzEXMRaTXWcwehWRMZky6GX2Elv0mAOhcWIk8WVG2FWKKMhd\n", + "27a8KH0mx5CnVDu76Igw2moc1+yPfDPZnRGymeVWDMSj1/TY3hGgb5hmSfANHPp4nyrFETtH62Dy\n", + "FIZnfZ2tua96PI/858zqXLfYaSaEy66elRjPHGSUQ+kLj7sT6e2TgQoh23asg1dvl0lw6aW2KtOQ\n", + "yQVjdxBZzehiTDj2VDDo/FI5LuGH/jfe71B2giPdfSUEN0GwZPmh+oBJ3YPtBDdEXjvqGtPnj9YN\n", + "o2RsGDqkSW3oa8BY1cptmQPEHp1SMBrX83w6xtQW5X0AAAD0AZ6kdEN/ACG9oBtcoOCFYVPj9Yn2\n", + "v/zfoFr4rWL2j9A7ZlqQHr0ZVpbLuAQJB33EyTSBNnFvVuljxMl3V6GA7Dl0BClPwL31OrTpG1l7\n", + "a7ghzL0atyS5ApCJWtp2wOBNzezTQ3N+Y1tH+luIT/i1PP0KLgniqnzZyMrwKfZeXoYEIl7twi0H\n", + "PJVeAcAdd8vPtJ2LywfKZ3u1S3on0S/4f7cj446r85qt7SkU/lr6c/+gK5erYXiPq/kf9oXoMNwY\n", + "9h0XgCkkY0ibuAMW3BGf/tJy6AGuO11Q5hQVr9nNkIcjB8Plen8B0nqwKQkOaIEp5QYqYQAAAQkB\n", + "nqZqQ38AIr2gIG4zhxx5qcIQ9c2Osw5+uNtUP7c8wH627Nk93kOS5kJwZOUsa/GuB8LSJPcgk4rv\n", + "NNy4X5Kv65LRXZpkjxKOzss2V4BAkHf3fdjwk53/8IYs8s8oIvwVKvgR9wljv8Ag07Nf+XJo681q\n", + "NbSzOUK6bv18ql/byQhgzEpF9gyeKzBYpIes4Jq5ygJqsHenGCQnuZZGCejK/v7YZig/zrXj2vhG\n", + "gCib7VW/rlAZYnZRYtYW6jN8+34R58oAelpNik7qpp/KkHdSQspzMHjVSAa9yHgI/KVEUfAeaSTC\n", + "N1Z3u1GIF1TdZRU1zNyC6xbuAxPXtz6Ez91WiAF1zBDEIltBAAABt0Gaq0moQWyZTAgj//61KoAG\n", + "e1mYdETW3g4OxfplN37UKMHTaFqDxb+9ytAjpKDc3XnMw/MxT04D0MH+PToJ4KWEuN7AocErZRv2\n", + "Rz2GQBbpS8lS31542pk6xM8YYh0/yeF1AnMnBxO2+HilOPhojFg3EW0klIcf/AybMYAo9NSuBD9C\n", + "s4e75EU0t8atdvYkg/yfik+FMNyFYTUg/mi4EKL8VgLWVSi8mxQ1+/EWE53/+fwb7K+j+527pMW9\n", + "VCj1B/8oEXG8oxyHRw/TQGPoBS7lGz9zLwh8gXusGZBvY9Xy0pnRdJKDkZLO/YjZFLNiCRPsHTqL\n", + "i2GYmJ9itG9pRnevDN9cAKQP0fgHBe/nvlXFVK7JMen+RKub1gCuPtFfO/y6rA2fstwepz1bap4Z\n", + "wJXzTLHNbeZ6/jnjul1UTQDo+Wyv2+WNy23qAxLYAQV2nquSCySITwJSTVvg+SdePIAmj5UPClGF\n", + "OrJIf0RX1xfSrhrpF0W0EhW8ceypgG4+dXb+bPwXKBwbO3GymyW89X2WJwubd13etWWTwju8K204\n", + "+w8LWTwxqMyJaP52mExMi4W5Yjr9AyAAAAElQZ7JRRUsO/8AGBMzRwWmJT1LkI6j4SrVrkQOkpGr\n", + "7qmVB6agtU/P7NMI3vz5LIs62lee9zlMDhLgStRXRkKeHaPAGaY9hwFwZg4RZnlEijsKiC6r+GA3\n", + "jOJMGPR2G+iEvFq9JqYdk0b1d9ABTX/7oiMKav8zTfVNhhkqe32oj6u1ioYXU2U/9Y4cH3f/N9Gx\n", + "JhjbFALTGuJMdeB2a/pmxPSRSx2DhwUwXe3BT4iK5IJF2QdQUjRydlTK56i3AOElSAfT6NVqnLr8\n", + "mfbO/AiWtC7ZCdSKqLQrBheoCisxuwRDc+0Qj4IlPLBawyneGpiLaece3KMzpKTos+5YxlSYlKtg\n", + "/Me6PG+fH2sUI9B09T2Px/9ucFTXTUC5j4ELLv01D5MY2VAAAADfAZ7odEN/ACK9oCBuM4G+uLMW\n", + "L2dP1lfTvDhmlpluM7IE4yEUJKicqu4KM5OijIBGmwd/fv/FYUE8C16mNefQ0Uy/D+0+Hpx1ZFAP\n", + "3vl+5XYGW/hV3tVz6fpDmClx2VYPTKI+QsHyxc+qQa6raGV2rQAFnERDWDAoPELDpD0DBzrtQ9Gj\n", + "f1X0zbjtJNpqrwp/hRbaIrr15pQNp8wHXKVl3vyz9d+FD2rUtkJQVzj6V7XpNVWdz4mpDYH1JRGS\n", + "i2MURr0RotwXgP3Qnz/8L/EyxM0Sb/CNWw8xQFPmbCgpDwAAAOUBnupqQ38AH77opN4Quy1TZxAA\n", + "Og5d0nOlbRa1c67qPfhIW7P+8Av3GtFE0HFQCvcwO1xKybwlnguY0Nqo5bzwqVZ4m1UebapfH7JG\n", + "d9M94gSTzLBzp+7XrhnquJ9dwfh5fBCyLWBt8xSfTcJZr1HXGrAMOw+Jv+pCMMogCsMVlWbHeQuT\n", + "mD3/yuQp5lDob+9AYNdyDEIT/fV+2vxg/LuQxTIX08ne1pWMu28zMsHEcHxols+2LTEYzIWCi8BU\n", + 
"K3ZtJRE3rAjZxLOQ4w3m2m/D157HitClmlKcP9jJchoyWV95Jy2gAAABu0Ga70moQWyZTAgj//61\n", + "KoAGg+KazAhO48Rk+mELCfGa3jedcL7j4i4wMKqReszSNQj5h17BpSVMT9hX+zPhBrSs6Vj7HyaE\n", + "qm6lvw7kPbwwNhW67XEllpB7/AB7Dtmc/Lsrl2N4BzMZzIFVEJCqVkWDwHz0DCyLCsDRdQx8uGEg\n", + "Ikolt9wM9AgzvQ7TxR98jTrIYP8SP9CCVhDDASOwwiUKcH0pWRrgAYwjw8Gf7OlbogYj/no1BpFx\n", + "lYglvem+TH822s9SIsjJ3EA1IN/sTGSWgAXqwMREDl6rGx1E4un7krghrGWUm+/7j4jDoGqrYrQI\n", + "g7E+ktnqOLNELPNyQd8WQ/umSuXC1xL1umwA8X5+yPqMMHEIeQL1fzz/JWAXyMH93QMSzGumbhKw\n", + "Zwg0U+25Tvu4PnK5VQHbV0zvOU2Pj+MGf/nsDxqxrqZsD9S4YY9rcTfMxz/MkkzIgfRGQF/OgLHr\n", + "joIjF7P6XCeWe+XUgCwqZQG68PRNzfXkn+zUJpMMk0jjnoYnDkQ975Dz0Z65i4o7OdZtwLEOfaoE\n", + "pB0fo5td4PyA9vYIFlRo3xi7uvrQcih7/M7KbZFgAAAA9kGfDUUVLDv/ABlUeHLsmGHl+OQZEho1\n", + "hMDtEgrgr/N3AttUVM/7crMT5dwlm5uvzGVCn6w/p670sqgr5PJ6oiWC1npINQXp4CRzsctCmXzn\n", + "Ugai5K7NbwfaQcfbZKrjzT/10H2u4nhhcuuZyNqUHfbG94mETU3kKDy9A89Il0BA9I1A+R3yjNfc\n", + "+Nz5BwP3DN+ZYjka/GHLl0y68JgPyPoe9w8jyG5IXdu2vCa+LYvH9kU234z4psgT4qxlrdkhxxyP\n", + "UJXN8nPpx6cXDiQznv0L2owqy0csZbCzUw4CVJ98G+4T1R39bjI9WT0YHLigorskW6Eh4QAAAMYB\n", + "nyx0Q38AHP005LNxTWEpiZ1J9di26t3EruDGda0AVBouFN0G1ywEJMXJZuIMxrfHCac7PtwdnQsN\n", + "5ABPxruKApfvrd4v1WFO3Cl2Zd1SOG3/r1ORn6HwtueiSFcG0RNU2EL7iLFK3PfYpxwH299J2sER\n", + "9fENVpZ0Q3jjs6HsM0edV/QB07Ofn+R5vOS4TYLqhcaZAnuosw5RlS5g1Q8CuW9BZXMHWP4TGLry\n", + "nY5Y9ez3m8FrqVUEclyyvuywjGI3odTE+j8AAABPAZ8uakN/ABxO1hwOVZzdUT/5uAde8ARk7MlV\n", + "yrPTe1pSPQMTQCpdw5z/lBFmnGZwxWyqh+3IqkDkhpoxeW8ZCVdNB2x/1RnvvpDhcO3MwQAAAbZB\n", + "mzNJqEFsmUwII//+tSqABlJow5npTNmtYD16z8AGI7v0s/GnfyqOWOrIj7MzWLMA+5yFNFLu1hTu\n", + "dlbGlkD8jL3ONezhs0gurnHp2pFLsP3djo3BgKHcLr5q4kg5WMX28rT11jnIH4bHAuJDI0/Gub5+\n", + "542H8l9OurnbLu7ccDaau7k+AVcLYmIJfjhEaissSRpn2usY/14Z8WeJwbzUwclx5b0pufbMDj2m\n", + "E4jonmtfVQvsVKXSLVBGus9F0XUey7wsw1/Hxpa1Dj6X89JFMTZZDEgLc8SXNlb52uC+3SYuA3pO\n", + "yIZ3zYRDkwb5/sIpC9s/jtT+DR4JrFHAg/zOLQvdBHh2BZ/H88Qk1FOi1nkBwtogVwTsAvTRwaaM\n", + "L+Fy6Vw65xxtt2p06IrGo+vGB6Ev7rBsQ1lA5dJTwIES1/HSnI96cCqyJNRkq8io7XoKHq1jP8jJ\n", + "K8KCILcbnjTzWMILhY3EuZ8pRzEGblkg+ofcWDech+PkwDbk4flJvQ1eVGNBBbzkH58MbHNkp5C1\n", + "pRDfsnIb9VIwGZIgexRK5GP0EM8ZveKhcNpqg0C7EdFVGM7dDkwAAAFMQZ9RRRUsO/8AFKVU3AQX\n", + "TKYCKlUskM896ABcbpuBaq23+VbIBAleYM+Uh2fmC8hKxXufvA+Jyd8ERfcMKq2QBuOeaw8cG8nv\n", + "l00dW9FnZ2ewlISmCmZ99L0bw0GXPORXq89pSQ+5zLmGTJWLpbqXg/Gg/k26eFQ7yctp0OrjpANw\n", + "gpKfTmSwqfpdIyAO4i1HmWAczC/dxtyvK6EJns7ev/M+uhg/UBsLPdCc4ktjYaoFvgpYJl8v+SaB\n", + "iW6/qJFs8B7ABY+Xoa/3pJdDPx7Wo16RIr9F0VKx7gY2CroKhVZyesK3QK039pTJworswqeMoYtQ\n", + "SxUGWdIlnZAh/LxAqJSAgdbCea7vV7Jw7UJ3RZWLCaN03DO0g6FTEO0PNlB/y2w2d5hCS2yZtMLR\n", + "726poAjDu+5lgVHjodzIR1vHcKS57NpFhydymmBuCPgAAAD3AZ9wdEN/AB0F8+qoYAk/JkWPAABe\n", + "eS/K4R2z8W8rEZ4Es2dHO2B1xqZeWERk/2j9D35SD32hnizfkl5AQkKu7sKMRtxB0qUTg/5Ai8ci\n", + "ewPsEvh0cTnE+UnVVZQsy2FhpSkguxSgj2GzhV7H4B4oQdASRatW+4ge9XWWDwbNzKDfs2ikSZGn\n", + "ZK2J2cdk5ZNdF/NbhHS0c6vDp3S53pob/1OoP8UOX13YMuZJYtnSstfaINj9HWvrLOMusuMgy0ge\n", + "hr00WpqM4G4LNFMeeHMWs3VdDioqjp1BlI0pyKTUMl2eH+Urm0ENGx6u7gM90gDkOBdN7tgm4QAA\n", + "ASkBn3JqQ38AIb2gdaiDraOcmiaTmEkpG5LCwpD7mwoBhbYx9hK/huA/Rlz76MMOi96iXfBz3DSh\n", + "vG5XYVehGnggzBAkHfGgYDsO5F3SWLpvAiWuQYgw379rpdMwhqWoBgIHHe7UqoU3PiKCUX8CUwon\n", + "PUuq8JY4AYYztu7mmGelokJyoAJS97RU/X6H+RdsNNzitkC1d8I6jDPIy7qqN4tCnL3rY6Yesfv1\n", + "e8kTaN9S190RCoZyxCFd2JzsfgZhniY0nZmfUb/Ilr3HhSfAoNjT9YPJpZU0gCEN/XEjzBiwlPnv\n", + "oPqWZP16sXNdepP+5XR/WuewqnrAjpV8x4yn9rFVK/AamriL1xzzEUk66pD3JF3R2TNlp/oPgGf2\n", + "3Zht7rWDs3F41xpI2UAAAAHmQZt3SahBbJlMCCP//rUqgAZb4qaPpZ+oz92BGWQCHjdoHQHQRvdw\n", + 
"JuWMeCAf9SCNq3pRzo+QLWwm+zJnwkwndhEvWHQ/SujctvY5pe+lS1QEjQXzeizSF8k6tO14eAtl\n", + "F+Mync2FH/YIAKwBXgDqn6AXOHpWQcynHtaJryxWYm270/11pJpJLJP1UcyORiPI54DPlbzdu+l/\n", + "jiFd4hpdaoZTSIPUh6A6ClqPxEqekFrNjAxud2WiOSd4IE7Kaf//vpwZ0mh9bmck4Z3rAu3/6Cvy\n", + "KA3WyoqAFX4UT0ZjH4z6LrUYRBEZElMEZc4snCHRyZf+tjKnoDXWOrVFpzxu69dV7GJ+V1irRKox\n", + "Pd1LRXYUoYi+P14fumR2pYbtX+VBW+m+c7NAd8Z01d3TTKV7Mg7nTZdtCA/oFcETl7++5b2EIheP\n", + "k2Fg+5ToPyynpqzSsvv9vWMyfYTJnDg6PojbFsxSs0nRUvqnP5QCdr6QHBhWXFOG60F0RsLzEsNc\n", + "wpNcPfKeYjjdCfe8YUIVjq0PBSvcnC+B/ETQWaX7IFbWhPaknWILlx3KsiYwYSMVn5rwfQd4Jkdd\n", + "9H+fdht5f/EJHYCK5IGupAjPxHpu+QiB/iUSmCHkkTiMqsG8twzlljjsl22n8veAAAABCEGflUUV\n", + "LDv/ABi1iDy3ZgloGXmZPsuhVsylb+qqNi7GSIfQ+OHuoRwObuWCiDJsleSNbQz9VgmS3f493Q1l\n", + "fk0LSjQ0QBKQCe3UmCkV8vYYHcKN9CZn1L0i/3IstLHQcy91VMXucG0IQjYMvd5K4nw1TsRQ+zNt\n", + "c33OM7wT4gTiFbFnfUP6sORkbyxKD8+9VWHRCKkGnoAnjqhwkHV3YzaNKz290rB0XwxFDvsi8iqf\n", + "z+DNrf49LxpvDCniJY8b921MDAhjoaXQisEELwuIkEG2MG16iA+xn4KZIc8cifkUnLKYTAHTEosc\n", + "/geFGHZmG9d/0Ad4ehB1+UFj3eeT8gc12jWX2ySdSQAAAUIBn7R0Q38AHbXz6qhgDdTYSzAi1h3K\n", + "16Xr3JTVUajJdHP4n1zwK/61yxZ9pP4QSRtJbkJZWH6vivN5vckWYfjVoaQoNcq3qWx+bI+OTtrh\n", + "UNznJnNVmMngQpK+748FuR69zyCunCVVntkmuIrtQvOCVbqBuRz5Qxvz7t49H+VL6IAp+Rh2gf74\n", + "0j/UPUfosZ/ElbvCMu7rvOP7cWI+JN6KUOE+/AXQCyHGSkSvvSc5FsX0fFal2fQXaEkH67EHfCc5\n", + "xhdseiByl+PiqAs8A9zuy4qmXDeeIj+3Yojnw30fZXbmjymzKitBenCylofDP0QjYedpgwNVFWxv\n", + "pKDrpf57i5C5JHBxrkMOZNs3TkoKjfQLvKDT/j1Fvw02tHitRU1MR1mnPja0zhtM0e5b68dpKMZ6\n", + "9AO+761c+Ba/40Js4HhAAAABBwGftmpDfwAiuitca5eBLMHeP5uZuF9cX0/VXhqHcuiBABGdnZlB\n", + "vvbdh+1A3f4uQyVZizhw70/9zDh2nx3tQGn11M/7g3e0ETDcFJMpuy3pyqZj8OhCsFXcJg/Dg2Ky\n", + "wNn+F0Nd65xqPmrT4IAWVNyWgNuyHhWrg80hH2qe3n3QFTH+AG0t1LUQWRwdt8cDbAi+8IGZZrTn\n", + "QzKAGB5g+jkMrZS2t5af/14Dikh/TUO9x6vp3udUZwfEqX9x43nyKd2KkcrjEt0VxTQ1LHt4TKTU\n", + "ov9g2wymXIrIg/m2cGScMEoY8xa4E2v0IBu8Siv364Oh7cF3cjWG+ZJkZ6xGCUsmpmsJt4n9AAAB\n", + "cEGbu0moQWyZTAgj//61KoAGC/pGgJ9CubE/Hy/U90CEEMEEbF2P5yKT5EQsPLolJYuDn1q5ANTN\n", + "SJwpmVcvZVK2Tco4v2Comd7hwZPuuXhX+lvh+l6ZtjrC3czf1ZVbdumb3r3D/ioYe7qcFNf7aS5r\n", + "2YnlPFx/ox3Po4uR9L227Pa5JPu/JVHojzbyIvC2hUPLYoK3yo8EFTOEx9VW2Kka/dDqBAClQEXM\n", + "coaHOVrqvWOBlx0SmrR2Fn5qD0ttjA+wKyG9Ww/+/fxdGsIy8lThxbGnpYEDoqIDxAPPdyC1j/7C\n", + "x1S6SZ6cX8TWD+edELbCVScHr4twowGayNRkN1sGJ3ChzFZqefnm592USWq1KVPalCkn+IgAbkI0\n", + "gf8crEnxuQcz5L3ov1loEzryk4ptgt40vN/cUUrwi49uNdXDzDlba6ntBbOYIPKYQqVbRsWX//V3\n", + "7VjjZzb0fU2VitbTbNlERmPP5obsCvIRmiOfAAAA7EGf2UUVLDv/ABgUnqVjfMZzbMROTbEr98Ov\n", + "G6hTv8LwbEOVBTuoZFwTL9eOUuW51yt7Pk5XoOwvCITHjPxM0+ACPLC5p8LXGPLXOMFwxyKNAOm2\n", + "+bVnL7eC/eonqWYHV7ElnGiaPE4DZvhksvIAUMvT1hgYsLWg5pHxPTMEf4vPc7k/U4gx+qn0dLIb\n", + "xLE6WPqhOli4SJOCHhekKlwgxlnM6S8wIxjTrZQVP6tyjUXc7nRDpn5+4xHTB5JTQd/Y+v5uYYim\n", + "vSxL9Lp9+sJa/YqUqQ0UFcQR3Tlp/PCrTJ5gUcQmlTDSjEV8pdpwAAABAgGf+HRDfwAhujWPq7Ze\n", + "gCJPvLBRhSSbcG6El3BFXKqbl3V6+XLJCsWmxwO7Xskzh85D3/GGBbxCjXU3okqTeEYfyjkOl+SH\n", + "4VGFs6uGeBXI6FuyUdCktochZVIQW+D6bukSQtQ9xBoZWqRH4hlWFBiT6bV+GQGerlgKyeaNsqD5\n", + "s+IDfM/wce0dikHUV0++Nr2rHe3jcRRrSy2FHjFSMdnyldmaj1iFauYYGv6d3l/8LPJtc5g5u4Q0\n", + "WerxF6DQAN+WlQUAod5dWuqnUKOySujKDQh4Sh1bNoaribkhCngsbjiJUpnyDzJfWcRyF47YB87L\n", + "Omkfy8ijCTvweGsJYAgScQAAAQUBn/pqQ38AIbo1j6vClDiF6mIvKX7IDWIXdy1QyeJm7hwAhKrN\n", + "5ZQTH6lrtJ9D3xtslHyvy2ywnd5a5/owLJHRc2EtkPadJ8Uji+G9O7CT6ooBM3rAgAWaKgWADHof\n", + "Rk55HzZ+V8DMw4S4pnRLudTRFnX1DyLXHV3VXMnhAeP+ewFDtdkUHGMhcSI0U8KajX0wWNdBGeGb\n", + "D8Ns9BH8mxfhSu/SqyYkA2AIdaTRVyL0w7XOVFH3DXljVqrcwMdXPvGgiBcw6chMaLbepo7nSmh1\n", + 
"vAbwAQYruBhNTN0eawky0jofbme4HocI40c1sz31wjy2n2/uelK4XikXYFYmVtl4Kdutz8YAAAGb\n", + "QZv9SahBbJlMFEwR//61KoAGK1mYIKmbNbOjB+hVE+vOJ4Z3vMpGSn/PYftL6FXoKU3FZYLBaEus\n", + "cDU8hX8r/T4sCEjN2tKC+to/+IoDOzT/F3qpjao2Qnfg6SHJn87cmSTE3IR8bzvTmr+Ye4Ac/+hl\n", + "xYNmjmRG01XaPV08JLNnbV2zuL5cn/7CsR7I4pKAadGKE6UheVLfqn0i791ThTaaO2OCRjsSWF8e\n", + "1o7SXLcWHdmh1WCFSlfjet1S/FkIphxf8M1ZQjLPF96/W7wlOpiP6jEis8o6251YpmdqxS3VSmv/\n", + "s9Bv3ISLvkMspiZj+iQwr28MINay/7syEY2A7ZiKqNUJX069yti8CuYwd1gGvQZSlufV+auVaTNU\n", + "xocXs0XuFW0e/AWENf2i3yxrLFTHW9CCBeoKH21CafAHq6hi+H/e9DkZU77nSidgvmP6DIx/XjI4\n", + "Sp9anaBxYwcylzQtEH2XN+nrwpDPp45KYG9LI0xieadJ2QOTHIvADfNhP/PY2gqE0NQ2qkvQc0a7\n", + "Xw6JCi5LfZz745MNAAAA8QGeHGpDfwAdo0DVwAgarNdw1dyEo22Z+2voCmn3MepWOJpNH9uE22Fc\n", + "UAf4fo25DS3VGYdH0kZ3bYGxdzd+R7awrh1yiW2ItRU9+fbZ+7eJ43X/1GQK2tLeuYX+rXNnNYVn\n", + "3JiyKGKiuk48G4gEpBGTo6LBxeBZg0OXhUHfR3yB3h9X56ir+g4EbNusZoLNQh23BaGzc9/s1PO9\n", + "1PPSEqrUiAosSTAygJNCJGqMs5yCqcS+EZopY3ntHhRp/rTMQhL4aAxAb8XQkEJtEmWrzD4p1eX6\n", + "QEZh/6hTVX/Gz191R2H/Dtkpg79J3GkssFm0vPkAAAH+QZoBSeEKUmUwII///rUqgAWb5x2D2a6r\n", + "t0Z9OpYFG2tABdnWLgsFoSkhKeOGdpZQLTxZJNtdR1o3VEUaCsJe7TDcWLiNBjbFk4iCHCNTwP1B\n", + "ET8aIdy/mqBaPrTdtuT/6FMRex7yXV0X/b0t3IdDKZDeFLpQzjHVkdbvbm3BNwCciVQUNcJ7Sjbw\n", + "T4hbhPp0oEDMMYqhG0FXqi8cqsDNhwZenV4L974lIjS1k1BRVCVuxIwrhHZ+ZNeKQOVccqtyU7fb\n", + "1nmmkdbnAEav9V5tnQTxoYHQvrZLL4f7C+LE0IOtSnKggNbex2Xp0FNi9T/+fjTgmF5bW9OJ+WCx\n", + "leyLvNiQF8k0bwSPMh7702+7OB9yXypsT0VFN+3fNlolLg4yJ7ye2ijeDcs0TyR0KI9OqHHwk9VT\n", + "lv0R4DjKMuNtxv3yyDdQ02ld84rRe/IbVoqtujoBlwArv27SRkTybmrwQddynU1vfFNgJ2tkTxsX\n", + "EuhAyTUDk1pdyrePvO3Kyjq07E+ZdqW1unVDCL0p2PAM0Bdj+ozOm4QJPGRq3YEQjJpnk1BNx6E0\n", + "yZMxRvkyW2tYZosgoDR8rW5jEN/sH3PsICgk/jLYhgpsvFfXxjf0NPxMCt81bgYfKxBAoUrGuF/8\n", + "Gb453zLMx96NgDfHj/3/yVULmADuEWX3e7X8vwCYAAAA+kGeP0U0TDv/ABZZvB5hJcOQiwautVBH\n", + "s3zn8C7pn/fvWkU93yxomewKAdw+9VXghKzj8nMy4EQ6n26QhvOvN3ZOGl4wrl9GlrTzWwgssqXz\n", + "oLBd9XVA4LrC7D/kDb3CEAYvcHCWxuhsk3WHFeLlRhwB95RghbDR4boSp+CQz3CY9L8bxC9Ohf/r\n", + "dy9+xoLX1H7kyaZJ3YehTdM+5Wu6Hpc4XocPo/ogFns0WlfgVPekkiZdh228q3p+OFEAyCsprsbc\n", + "bh4x6zwYau0C11ECccZga0PS18ku4j08dAfMYirHksImmVD9Aw8yto6D9YLwntF8IaA+FPG9VagA\n", + "AAB+AZ5edEN/AB8T3aVQEVcYwT0kXXzzDP4yP2lC7bONTcb6acU9HQ87UdrkSLI4+OHKFlU0EAFz\n", + "P/GPhcZ5NOIVfnz6vsVd3DH3XZLg43PF1cMypwOcG8sbzfthjMA4FQSgVvJe40X2MhECJet9t2G/\n", + "XdWa+YBzkUuLdbRPBeGTAAABAAGeQGpDfwAfDtYNiNYeWLJ1JGi8AHLac8oZrJR5tDRFy80bn36g\n", + "01RfxVuWBDFeUQUU4VHoswV2zHbq6MzAloc0SM3f88f/qXApn5tj32GTO8MmdjG+5h2BlZLr7lVk\n", + "BcTdEueULRCVgGF4dFB9PX4Y3jYyGQfKH/BWnAEfbs4hEQ8ebrGB8mSRpcKz5q1oNG7pkp8qNfsq\n", + "nkhG1h5qVJ826dklpNvhQDQdQnVi0zusZWH7g9GItx1/0euTzo8U/z7D4DrbASMUmgB0DC8TSqJd\n", + "xZ+UMAYbubxMdW+iPv2N1tIKXHdcOVBHhDDt1MeY4rBQavQwdjpFZBiUMt5ya+AAAAH0QZpFSahB\n", + "aJlMCCH//qpVAAyk+dgiPwCMdRFSufgoxGSIR+/0rSe9Cp9hy8WpEfkfjpu1RSHWd3zlulcFC+Nh\n", + "XPR//hjTft5KlTxkfWUjrzSX8Q8sCzZTRHqzVvb/rscPsXHQf0E6taB/yJXWDm9ZR5fbjX3mwQRc\n", + "72p/7Nk/lJUO//4LM1qLgtlckFFvGA4aviZYHpBb9w1OJg/Jqwvkkixar7ua0LNG3ane8+4yu/5g\n", + "n8krsqxREhrpsaI39b317zkKj6KVaeKiNvQ1KBsts5QsX+yTO1tzmbv5PRxGS8tz2hKf4zB8fbWM\n", + "XhqB6Gi2mMVEo6jXnv5vErjT3e551EcovqLpcSnuFBTI4jT6V7ZqZq5zqsmn23ZqFTbBnXJfy5qg\n", + "Xc1RIbUSG7SAPcicWIbuNtZ4GQS+WKAEZUxr++6VPQD3gpW4BeKCxEy910wCA11VXaqCgcSgS5FA\n", + "dwACIPfrp0NhEyPCvA4qNFC9NitDM1I8HthEGAjfRL6imFuJfW4+Sk08ZcO8JNBK0/bkkNG7XFo7\n", + "Hs15nZek/o+FGsRiwki6FYqc1HBc8skTelrrFiYgicL9M/ehriAlP3GGSVQdD58oSyTAbR/XOwHh\n", + "/k7736bu5rnUg2SpAi/FdrWUFq0zx+C7UUDgbK+SgABs/nsA2PEAAAE5QZ5jRREsO/8AGLWLoCDh\n", + 
"swFlTBepUmfLHY9h6nZJebQZXCAk5QrW0LEqJOc6Tf3RfmBa+BH+trXpxDsoWsYBGGxFB6vHNSw7\n", + "QTuHxSINvJ7kINONdsnA7unyZfe+/dUQpBab4cd9DfyyBJrHeEf61R0Nfn0RkLu3bt6BWIYQlYtM\n", + "K9Nfs/vIPwJSfjpXcON5DPtNNDffXZk4RydlgN+S/E7EUmDtA6DaeTT9v6cz5zUd9DSGZ32drbmv\n", + "ejyP/MmN69TJZPy1fo/BndGgtSNNbFsKVeTDjxqdcz9cfjIrJ3P86/aSSTu++gY85cN7L+QFkn5k\n", + "/lX20+90kKxSs6X+x+u7me+jslyG1ZQaBGKwDx+RwViiPwDARZocg2yGxzRByDsEM59E93SHlUl9\n", + "GT+PqBiUfn848MoVbAAAAOoBnoJ0Q38AIb2gG1yg4IVhU+P1ifa//N+gWvitYvaP0DtmWpAevRlW\n", + "lsu4BAkHfcTJNIE2cW9WUS6DTP7xEfhthE/Au7/XTkrYH5bPnHuWMD+L4E2Ys7TDv/WnXsb8WMjs\n", + "GVKLefmxcZqtW10iMABVusPZiYCVoxR1g16JAWeZ7iIjTKxZ0g1yWUY7SYbSh6LLTrvWvhE7lU5U\n", + "CdpswEmIpPdhoFfYojayY1ypJuWbbU1PB5nvwD9t85tVUeFQcQm5aN4kQawNooLXHpvRUW63Gqd8\n", + "iY0WiZheEXu2JHmP8XM7t/dfyrk3Fx0AAAEdAZ6EakN/ACK9oCBuM4cceanCEPXT9ZV29ukUDhUK\n", + "Q43qY97tIKPQ4ZLk+xSOgxBfQxL7yIrZscfkKmKCSoYxQfZ+tSzvOZ1GhW2ifFuVzAIEg7+77ixc\n", + "Kx//CGLPLPJ464HVUHGkhcx37PQ+kbQrXlUbN3cWUp0Qf4LtEibFhZ+LpSZJ4udEDKi6Q/S18Psl\n", + "/qmdcccWROb1W4f/Xy9V+lMS0Du/XhxzsIhWccm/rlAZXG9J5NMLdRfS734QHwqLqFpe0KPTU/Mz\n", + "iY1ev2MPDzHxs95uiDK6gRc1gvD7TgXhVki57ReTigwP0Vcnsm9mMNHj3Nt6/RMhlMwCLQhy6qqL\n", + "YC7Z58RnNbEutfWZAa9Y2SYIcplB+x/e/c7TAAABlkGaiUmoQWyZTAh3//6plgAykehDX8oAigHL\n", + "uS7e5BiYpAhLP0Zp72qQ9WFfih2hD6ViubvwAAAy+5vuYY1yi1tJuPfBi/DL0xvClymIwqUp5EK2\n", + "pijOf291KPaqRN5kbJjB/2wfKr1+XMiKLX6DysREeFfQlDwQLBucvt+vNOXQokOSOb4yTYfCyIZ/\n", + "GHqmX89FI8GoC7SVJ8dqrGOCOpcjHfvSY2QsrqBh9dhAV5Sl9v/BQKeopbgb9Qoepn/uEMh2fyEW\n", + "JmX+JgRFJalJclAgIlVBNaF+FoinY0YPKhqMcuoH+rtaEk2LTWu4NHdn9ysTAkHlBR2G+58hU289\n", + "8X49s9CJy7d2oeKmsapTwnIxxJ2LNCm+TxMniHit0ZHqI5VMxQ+5ZJ2tPHM7/cT3gdae3yVR8+YM\n", + "/KU5H6oISvxSd8TybIcXMyYVHn6O+gwy4SKx3AkMYLFpKRIO1eI3ZmEPll+L/2Ahp3aDBQxulIlY\n", + "Qc1v4+BSAHSjYxY/VpZwrkFkWgmuXijX9pnceU+eCQb0BkKKYYEAAAE7QZ6nRRUsO/8AGBMrmVrk\n", + "4p6lyIABr6JcUvWGXYV0DKg9NQWqfn9mmEaqxk2L7hAoVLAefd2AT3uOnaK6MhbcdSJ0jbOgAdky\n", + "1NCtoTFYEK1L3oNAlJW78V3WE6NttmJ67HTQFhc7jbPt6n2fAdknrF4tehh2ttPPRj0ZMNDck2O/\n", + "Og/0bAxzaaL7DSYz/qGCfH6ue/8E9mejEEqzP8HffVv8Obhn2u8eQxOotWj4hO+DblITeYVYJXny\n", + "h4Mo9PoOPQCtWY4pEEbVZmokYfc6NrhoTMJC8d+WVfQUp/9dQN2FtoGBhQPHEwvVbIcYhR7B4iO2\n", + "lHuM7fr8Nz2PLRQOuR4Lhle59+tgw9IpLSJGfVu5u0NIKILKM/viNoDYYuKxIDdR/J6apnFKAoah\n", + "uk9v6if+0v3ru/qsdmBBAAAA3wGexnRDfwAivaAgbjOBvrizFgvOGL0/6vuXDLpZruFaiDwd2rdX\n", + "jHVzx9p+aFelpPZGVUpD9afD05Q+ygH73y/cGcCL/wq72iJds0hr5PUpNV/aSoB5zpjnS1krIC0g\n", + "xgvcsTNLJd1aFsq1w5umkQK05c9QgDPa1eUOrMmn+/YlpdytXE6u+4FAjIpYVgn74StUYfcT8IT8\n", + "SGX5Wru0UB/4BiwZwXDYz0r2pPySvTt1TUg57ubb0S/BqMvEVZ5rArNFw0GaRO5EmmTuHjFK31Ed\n", + "ZcrudMiOWUSCfSesj44AAADfAZ7IakN/AB++6KTdC0Gg2vR2G3QAHQcu6TnStota0MGq57eEms8e\n", + "GSZ8YTYymFLgl7YZGG1YXmh3orKEBl6b97W6tU9/+wsf9/cg00EpDLAMwmuhlqrl+tcaP161PaCT\n", + "db1JjfLZ6rQlIR/u8Lq+hDMPBrZgZ6lFmsHEDUzmL1vhrC/Eg5wjH+dLR3xJpn70Bg13IMQhP99X\n", + "7a/GD8u5DFMhlFEykeU8M0AF5LVwxauGljyJ2PG9wt/W7GNjLNgsX4aFTR897+cKWdUMsr13pC8x\n", + "KjWMpGHXcQ2lKSkGzAAAAR5BmspJqEFsmUwIb//+p4QAYn+ayCPJyJ7QOf/irXuB3I7yUvrv3Wd8\n", + "OLQaJBb/+EMR1r6SAeh0um3VtQPrwYoZU0zDlMzZlECRYSRYOAqgamI/sUVWVEYaYAVab8QpucQ/\n", + "sSTh0wVtYsFYYkt/gr7uhkEpx1NPSuJ9CqWeDhMsefol+oaGZkPTooDGiCB29X8Zubhk7s13xY5c\n", + "l2KWl6cdQs8QOBu4PKBLJa04v3ctO+FHUCNJTXN7J5YnaOHn+BLPFy7A6HoUxVmuK9kB/hB9j6ln\n", + "0nykP3r6vgXJiVxtga3Ek+Zj3edZUHSAUux6bbxkCgdvPWLgxmKM0iIQ0SZS+9McjsqW/5Kw1hL5\n", + "sobdDT0GsHJ+I+IDODn9/vmRAAAGqm1vb3YAAABsbXZoZAAAAAAAAAAAAAAAAAAAA+gAAB1MAAEA\n", + "AAEAAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAQAAAAAAAAAAAAAAAAAAQAAAAAAAAAAAAAAA\n", + 
"AAAAAAAAAAAAAAAAAAAAAAAAAAIAAAXUdHJhawAAAFx0a2hkAAAAAwAAAAAAAAAAAAAAAQAAAAAA\n", + "AB1MAAAAAAAAAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAQAAAAAAAAAAAAAAAAAAQAAAAAGw\n", + "AAABIAAAAAAAJGVkdHMAAAAcZWxzdAAAAAAAAAABAAAdTAAACAAAAQAAAAAFTG1kaWEAAAAgbWRo\n", + "ZAAAAAAAAAAAAAAAAAAAKAAAASwAVcQAAAAAAC1oZGxyAAAAAAAAAAB2aWRlAAAAAAAAAAAAAAAA\n", + "VmlkZW9IYW5kbGVyAAAABPdtaW5mAAAAFHZtaGQAAAABAAAAAAAAAAAAAAAkZGluZgAAABxkcmVm\n", + "AAAAAAAAAAEAAAAMdXJsIAAAAAEAAAS3c3RibAAAALNzdHNkAAAAAAAAAAEAAACjYXZjMQAAAAAA\n", + "AAABAAAAAAAAAAAAAAAAAAAAAAGwASAASAAAAEgAAAAAAAAAAQAAAAAAAAAAAAAAAAAAAAAAAAAA\n", + "AAAAAAAAAAAAAAAAABj//wAAADFhdmNDAWQAFf/hABhnZAAVrNlBsJaEAAADAAQAAAMAUDxYtlgB\n", + "AAZo6+PLIsAAAAAcdXVpZGtoQPJfJE/FujmlG88DI/MAAAAAAAAAGHN0dHMAAAAAAAAAAQAAAEsA\n", + "AAQAAAAAFHN0c3MAAAAAAAAAAQAAAAEAAAJgY3R0cwAAAAAAAABKAAAAAQAACAAAAAABAAAUAAAA\n", + "AAEAAAgAAAAAAQAAAAAAAAABAAAEAAAAAAEAABAAAAAAAgAABAAAAAABAAAMAAAAAAEAAAQAAAAA\n", "AQAAFAAAAAABAAAIAAAAAAEAAAAAAAAAAQAABAAAAAABAAAUAAAAAAEAAAgAAAAAAQAAAAAAAAAB\n", "AAAEAAAAAAEAABQAAAAAAQAACAAAAAABAAAAAAAAAAEAAAQAAAAAAQAAFAAAAAABAAAIAAAAAAEA\n", - "AAAAAAAAAQAABAAAAAABAAAUAAAAAAEAAAgAAAAAAQAAAAAAAAABAAAEAAAAAAEAABQAAAAAAQAA\n", - "CAAAAAABAAAAAAAAAAEAAAQAAAAAAQAAFAAAAAABAAAIAAAAAAEAAAAAAAAAAQAABAAAAAABAAAM\n", + "AAAAAAAAAQAABAAAAAABAAAUAAAAAAEAAAgAAAAAAQAAAAAAAAABAAAEAAAAAAEAAAwAAAAAAQAA\n", + "BAAAAAABAAAUAAAAAAEAAAgAAAAAAQAAAAAAAAABAAAEAAAAAAEAABQAAAAAAQAACAAAAAABAAAA\n", "AAAAAAEAAAQAAAAAAQAAFAAAAAABAAAIAAAAAAEAAAAAAAAAAQAABAAAAAABAAAUAAAAAAEAAAgA\n", "AAAAAQAAAAAAAAABAAAEAAAAAAEAABQAAAAAAQAACAAAAAABAAAAAAAAAAEAAAQAAAAAAQAAFAAA\n", "AAABAAAIAAAAAAEAAAAAAAAAAQAABAAAAAABAAAUAAAAAAEAAAgAAAAAAQAAAAAAAAABAAAEAAAA\n", - "AAEAABQAAAAAAQAACAAAAAABAAAAAAAAAAEAAAQAAAAAAQAAFAAAAAABAAAIAAAAAAEAAAAAAAAA\n", - "AQAABAAAAAABAAAMAAAAAAEAAAQAAAAAAQAAFAAAAAABAAAIAAAAAAEAAAAAAAAAAQAABAAAAAAB\n", - "AAAUAAAAAAEAAAgAAAAAAQAAAAAAAAABAAAEAAAAAAEAAAgAAAAAHHN0c2MAAAAAAAAAAQAAAAEA\n", - "AABkAAAAAQAAAaRzdHN6AAAAAAAAAAAAAABkAAAGhgAAAl8AAAFjAAAAvgAAAXYAAAHzAAABDgAA\n", - "ATYAAAFIAAAB9QAAAOIAAAD6AAABWgAAAbAAAADTAAAB8wAAAN4AAAH+AAABEAAAAOIAAAG2AAAC\n", - "DAAAAWUAAAGkAAABmgAAAckAAAEdAAABfQAAAPMAAAFxAAABIgAAAjYAAAEmAAAA5AAAAXoAAAH+\n", - "AAAA/wAAAT0AAAFnAAACAwAAARQAAAE3AAABTwAAAckAAADrAAACFwAAAP0AAAHzAAABIQAAAOAA\n", - "AAHKAAACOwAAAVQAAAHFAAABugAAAdQAAAD3AAABUgAAARIAAAFuAAABLwAAAhAAAAERAAAA9gAA\n", - "AZkAAAIqAAABIgAAAV0AAAGIAAACSgAAASgAAAFEAAABggAAAegAAAD+AAACCgAAASIAAAIdAAAB\n", - "KAAAAQcAAAHbAAACFgAAAT0AAAITAAAB2gAAAi8AAAEGAAABrQAAASoAAAF0AAABZgAAAl4AAAFU\n", - "AAAA+gAAAbYAAAHjAAABLwAAAZwAAAHBAAAB8QAAABRzdGNvAAAAAAAAAAEAAAAsAAAAYnVkdGEA\n", - "AABabWV0YQAAAAAAAAAhaGRscgAAAAAAAAAAbWRpcmFwcGwAAAAAAAAAAAAAAAAtaWxzdAAAACWp\n", - "dG9vAAAAHWRhdGEAAAABAAAAAExhdmY1Ny44My4xMDA=\n", + "AAEAAAwAAAAAAQAABAAAAAABAAAUAAAAAAEAAAgAAAAAAQAAAAAAAAABAAAEAAAAAAEAABQAAAAA\n", + "AQAACAAAAAABAAAAAAAAAAEAAAQAAAAAAQAAFAAAAAABAAAIAAAAAAEAAAAAAAAAAQAABAAAAAAB\n", + "AAAIAAAAABxzdHNjAAAAAAAAAAEAAAABAAAASwAAAAEAAAFAc3RzegAAAAAAAAAAAAAASwAABs8A\n", + "AAI/AAABMQAAAGEAAAD8AAABkwAAAMcAAAEbAAABNgAAALkAAAGdAAAA/QAAAMYAAADdAAABzAAA\n", + "AQEAAADcAAAARQAAAdsAAAE9AAAA0QAAAU4AAAIZAAABBwAAAV4AAAEDAAABXgAAAPMAAAD0AAAB\n", + "CQAAAaUAAACyAAABnQAAANsAAAB3AAAA8QAAAbQAAAFGAAAA+AAAAQ0AAAG7AAABKQAAAOMAAADp\n", + "AAABvwAAAPoAAADKAAAAUwAAAboAAAFQAAAA+wAAAS0AAAHqAAABDAAAAUYAAAELAAABdAAAAPAA\n", + "AAEGAAABCQAAAZ8AAAD1AAACAgAAAP4AAACCAAABBAAAAfgAAAE9AAAA7gAAASEAAAGaAAABPwAA\n", + "AOMAAADjAAABIgAAABRzdGNvAAAAAAAAAAEAAAAsAAAAYnVkdGEAAABabWV0YQAAAAAAAAAhaGRs\n", + 
"cgAAAAAAAAAAbWRpcmFwcGwAAAAAAAAAAAAAAAAtaWxzdAAAACWpdG9vAAAAHWRhdGEAAAABAAAA\n", + "AExhdmY1Ny44My4xMDA=\n", "\"\u003e\n", " Your browser does not support the video tag.\n", "\u003c/video\u003e" ], "text/plain": [ - "\u003cIPython.core.display.HTML at 0x7f84b2253b50\u003e" + "\u003cIPython.core.display.HTML at 0x7f1286b190b8\u003e" ] }, "metadata": { @@ -1209,15 +790,15 @@ "source": [ "import time\n", "import traceback\n", + "import sys\n", "\n", "from matplotlib import pyplot as plt\n", "from matplotlib import animation as anim\n", - "import tensorflow as tf\n", - "from tensorflow.contrib import autograph as ag\n", + "import numpy as np\n", "from IPython import display\n", "\n", "\n", - "@ag.do_not_convert(ag.RunMode.PY_FUNC)\n", + "@tf.autograph.experimental.do_not_convert\n", "def render(boards):\n", " fig = plt.figure()\n", "\n", @@ -1237,74 +818,71 @@ " except RuntimeError:\n", " print('Coult not render animation:')\n", " traceback.print_exc()\n", + " return 1\n", + " return 0\n", "\n", "\n", "def gol_episode(board):\n", - " directions = tf.constant(\n", - " ((-1, -1), (-1, 0), (-1, 1), (0, -1), (0, 1), (1, -1), (1, 0), (1, 1)))\n", + " new_board = tf.TensorArray(tf.int32, 0, dynamic_size=True)\n", "\n", - " new_board = []\n", - " ag.set_element_type(new_board, tf.int32)\n", - "\n", - " for i in range(len(board)):\n", - " for j in range(len(board[i])):\n", - " num_neighbors = 0\n", - " for d in directions:\n", - " ni = i + d[0]\n", - " nj = j + d[1]\n", - " if ni \u003e= 0 and nj \u003e= 0 and ni \u003c len(board) and nj \u003c len(board[i]):\n", - " num_neighbors += board[ni][nj]\n", + " for i in tf.range(len(board)):\n", + " for j in tf.range(len(board[i])):\n", + " num_neighbors = tf.reduce_sum(\n", + " board[tf.maximum(i-1, 0):tf.minimum(i+2, len(board)),\n", + " tf.maximum(j-1, 0):tf.minimum(j+2, len(board[i]))]\n", + " ) - board[i][j]\n", " \n", - " new_cell = 0\n", " if num_neighbors == 2:\n", " new_cell = board[i][j]\n", " elif num_neighbors == 3:\n", " new_cell = 1\n", + " else:\n", + " new_cell = 0\n", " \n", " new_board.append(new_cell)\n", - " final_board = ag.stack(new_board)\n", + " final_board = new_board.stack()\n", " final_board = tf.reshape(final_board, board.shape)\n", " return final_board\n", " \n", "\n", + "@tf.function(experimental_autograph_options=(\n", + " tf.autograph.experimental.Feature.EQUALITY_OPERATORS,\n", + " tf.autograph.experimental.Feature.BUILTIN_FUNCTIONS,\n", + " tf.autograph.experimental.Feature.LISTS,\n", + " ))\n", "def gol(initial_board):\n", " board = initial_board\n", - " boards = []\n", - " ag.set_element_type(boards, tf.int32)\n", - " # We are being explicit about tensor constants to ensure the loop\n", - " # is not unrolled in the graph. 
This may change in the future.\n", - " for i in range(tf.constant(NUM_STEPS)):\n", + " boards = tf.TensorArray(tf.int32, size=0, dynamic_size=True)\n", + "\n", + " i = 0\n", + " for i in tf.range(NUM_STEPS):\n", " board = gol_episode(board)\n", " boards.append(board)\n", - " boards = ag.stack(boards)\n", - " render(boards)\n", - " return tf.no_op()\n", + " boards = boards.stack()\n", + " tf.py_function(render, (boards,), (tf.int64,))\n", + " return i\n", " \n", "\n", - "with tf.Graph().as_default():\n", - " # Gosper glider gun\n", - " # Adapted from http://www.cplusplus.com/forum/lounge/75168/\n", - " _ = 0\n", - " initial_board = tf.constant((\n", - " ( _,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_ ),\n", - " ( _,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,1,_,_,_,_,_,_,_,_,_,_,_,_ ),\n", - " ( _,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,1,_,1,_,_,_,_,_,_,_,_,_,_,_,_ ),\n", - " ( _,_,_,_,_,_,_,_,_,_,_,_,_,1,1,_,_,_,_,_,_,1,1,_,_,_,_,_,_,_,_,_,_,_,_,1,1,_ ),\n", - " ( _,_,_,_,_,_,_,_,_,_,_,_,1,_,_,_,1,_,_,_,_,1,1,_,_,_,_,_,_,_,_,_,_,_,_,1,1,_ ),\n", - " ( _,1,1,_,_,_,_,_,_,_,_,1,_,_,_,_,_,1,_,_,_,1,1,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_ ),\n", - " ( _,1,1,_,_,_,_,_,_,_,_,1,_,_,_,1,_,1,1,_,_,_,_,1,_,1,_,_,_,_,_,_,_,_,_,_,_,_ ),\n", - " ( _,_,_,_,_,_,_,_,_,_,_,1,_,_,_,_,_,1,_,_,_,_,_,_,_,1,_,_,_,_,_,_,_,_,_,_,_,_ ),\n", - " ( _,_,_,_,_,_,_,_,_,_,_,_,1,_,_,_,1,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_ ),\n", - " ( _,_,_,_,_,_,_,_,_,_,_,_,_,1,1,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_ ),\n", - " ( _,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_ ),\n", - " ( _,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_ ),\n", - " ))\n", - " initial_board = tf.pad(initial_board, ((0, 20), (0, 10)))\n", - " \n", - " tf_gol = ag.to_graph(gol)\n", - " game_ops = tf_gol(initial_board)\n", - " with tf.Session() as sess:\n", - " sess.run(game_ops)\n" + "# Gosper glider gun\n", + "# Adapted from http://www.cplusplus.com/forum/lounge/75168/\n", + "_ = 0\n", + "initial_board = tf.constant((\n", + " ( _,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_ ),\n", + " ( _,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,1,_,_,_,_,_,_,_,_,_,_,_,_ ),\n", + " ( _,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,1,_,1,_,_,_,_,_,_,_,_,_,_,_,_ ),\n", + " ( _,_,_,_,_,_,_,_,_,_,_,_,_,1,1,_,_,_,_,_,_,1,1,_,_,_,_,_,_,_,_,_,_,_,_,1,1,_ ),\n", + " ( _,_,_,_,_,_,_,_,_,_,_,_,1,_,_,_,1,_,_,_,_,1,1,_,_,_,_,_,_,_,_,_,_,_,_,1,1,_ ),\n", + " ( _,1,1,_,_,_,_,_,_,_,_,1,_,_,_,_,_,1,_,_,_,1,1,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_ ),\n", + " ( _,1,1,_,_,_,_,_,_,_,_,1,_,_,_,1,_,1,1,_,_,_,_,1,_,1,_,_,_,_,_,_,_,_,_,_,_,_ ),\n", + " ( _,_,_,_,_,_,_,_,_,_,_,1,_,_,_,_,_,1,_,_,_,_,_,_,_,1,_,_,_,_,_,_,_,_,_,_,_,_ ),\n", + " ( _,_,_,_,_,_,_,_,_,_,_,_,1,_,_,_,1,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_ ),\n", + " ( _,_,_,_,_,_,_,_,_,_,_,_,_,1,1,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_ ),\n", + " ( _,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_ ),\n", + " ( _,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_ ),\n", + "))\n", + "initial_board = tf.pad(initial_board, ((0, 10), (0, 5)))\n", + "\n", + "_ = gol(initial_board)" ] }, { @@ -1319,179 +897,21 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 0, "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - }, - "height": 2323 - }, + "colab": {}, "colab_type": "code", - "executionInfo": { - "elapsed": 
753, - "status": "ok", - "timestamp": 1532101593840, - "user": { - "displayName": "", - "photoUrl": "", - "userId": "" - }, - "user_tz": 240 - }, - "id": "hIGYeX0Cxs3i", - "outputId": "e0b62eb1-3e12-4e53-dc54-8a3fa56d823d" + "id": "hIGYeX0Cxs3i" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "from __future__ import print_function\n", - "import tensorflow as tf\n", - "\n", - "def tf__gol_episode(board):\n", - " try:\n", - " with tf.name_scope('gol_episode'):\n", - " directions = tf.constant(((-1, -1), (-1, 0), (-1, 1), (0, -1), (0, 1),\n", - " (1, -1), (1, 0), (1, 1)))\n", - " new_board = ag__.new_list([])\n", - "\n", - " def extra_test_2(new_board_2):\n", - " with tf.name_scope('extra_test_2'):\n", - " return True\n", - "\n", - " def loop_body_2(i, new_board_2):\n", - " with tf.name_scope('loop_body_2'):\n", - "\n", - " def extra_test_1(new_board_1):\n", - " with tf.name_scope('extra_test_1'):\n", - " return True\n", - "\n", - " def loop_body_1(j, new_board_1):\n", - " with tf.name_scope('loop_body_1'):\n", - " num_neighbors = 0\n", - "\n", - " def extra_test(num_neighbors_2):\n", - " with tf.name_scope('extra_test'):\n", - " return True\n", - "\n", - " def loop_body(d, num_neighbors_2):\n", - " with tf.name_scope('loop_body'):\n", - " ni = i + ag__.get_item(d, (0), opts=ag__.GetItemOpts(\n", - " element_dtype=None))\n", - " nj = j + ag__.get_item(d, (1), opts=ag__.GetItemOpts(\n", - " element_dtype=None))\n", - "\n", - " def if_true():\n", - " with tf.name_scope('if_true'):\n", - " num_neighbors_1, = num_neighbors_2,\n", - " num_neighbors_1 += ag__.get_item(ag__.get_item(board,\n", - " (ni), opts=ag__.GetItemOpts(element_dtype=None)),\n", - " (nj), opts=ag__.GetItemOpts(element_dtype=None))\n", - " return num_neighbors_1,\n", - "\n", - " def if_false():\n", - " with tf.name_scope('if_false'):\n", - " return num_neighbors_2,\n", - " num_neighbors_2 = ag__.utils.run_cond(tf.logical_and(tf.\n", - " greater_equal(ni, 0), tf.logical_and(tf.greater_equal\n", - " (nj, 0), tf.logical_and(tf.less(ni, ag__.utils.\n", - " dynamic_builtin(len, board)), tf.less(nj, ag__.utils.\n", - " dynamic_builtin(len, ag__.get_item(board, (i), opts=\n", - " ag__.GetItemOpts(element_dtype=None))))))), if_true,\n", - " if_false)\n", - " return num_neighbors_2,\n", - " num_neighbors = ag__.for_stmt(directions, extra_test,\n", - " loop_body, (num_neighbors,))\n", - " new_cell = 0\n", - "\n", - " def if_true_2():\n", - " with tf.name_scope('if_true_2'):\n", - " new_cell_2, = new_cell,\n", - " new_cell_2 = ag__.get_item(ag__.get_item(board, (i), opts\n", - " =ag__.GetItemOpts(element_dtype=None)), (j), opts=\n", - " ag__.GetItemOpts(element_dtype=None))\n", - " return new_cell_2,\n", - "\n", - " def if_false_2():\n", - " with tf.name_scope('if_false_2'):\n", - " new_cell_3, = new_cell,\n", - "\n", - " def if_true_1():\n", - " with tf.name_scope('if_true_1'):\n", - " new_cell_1, = new_cell_3,\n", - " new_cell_1 = 1\n", - " return new_cell_1,\n", - "\n", - " def if_false_1():\n", - " with tf.name_scope('if_false_1'):\n", - " return new_cell_3,\n", - " new_cell_3 = ag__.utils.run_cond(tf.equal(num_neighbors, \n", - " 3), if_true_1, if_false_1)\n", - " return new_cell_3,\n", - " new_cell = ag__.utils.run_cond(tf.equal(num_neighbors, 2),\n", - " if_true_2, if_false_2)\n", - " new_board_1 = ag__.list_append(new_board_1, new_cell)\n", - " return new_board_1,\n", - " new_board_2 = ag__.for_stmt(ag__.utils.dynamic_builtin(range,\n", - " ag__.utils.dynamic_builtin(len, ag__.get_item(board, 
(i),\n", - " opts=ag__.GetItemOpts(element_dtype=None)))), extra_test_1,\n", - " loop_body_1, (new_board_2,))\n", - " return new_board_2,\n", - " new_board = ag__.for_stmt(ag__.utils.dynamic_builtin(range, ag__.\n", - " utils.dynamic_builtin(len, board)), extra_test_2, loop_body_2, (\n", - " new_board,))\n", - " final_board = ag__.list_stack(new_board, opts=ag__.ListStackOpts(\n", - " element_dtype=tf.int32, original_call=ag.stack))\n", - " final_board = tf.reshape(final_board, board.shape)\n", - " return final_board\n", - " except:\n", - " ag__.rewrite_graph_construction_error(ag_source_map__)\n", - "\n", - "def tf__gol(initial_board):\n", - " try:\n", - " with tf.name_scope('gol'):\n", - " board = initial_board\n", - " boards = ag__.new_list([])\n", - "\n", - " def extra_test(board_1, boards_1):\n", - " with tf.name_scope('extra_test'):\n", - " return True\n", - "\n", - " def loop_body(i, board_1, boards_1):\n", - " with tf.name_scope('loop_body'):\n", - " board_1 = tf__gol_episode(board_1)\n", - " boards_1 = ag__.list_append(boards_1, board_1)\n", - " return board_1, boards_1\n", - " board, boards = ag__.for_stmt(ag__.utils.dynamic_builtin(range, tf.\n", - " constant(NUM_STEPS)), extra_test, loop_body, (board, boards))\n", - " boards = ag__.list_stack(boards, opts=ag__.ListStackOpts(\n", - " element_dtype=tf.int32, original_call=ag.stack))\n", - " with ag__.utils.control_dependency_on_returns(render(boards)):\n", - " boards_2 = ag__.utils.alias_tensors(boards)\n", - " return tf.no_op()\n", - " except:\n", - " ag__.rewrite_graph_construction_error(ag_source_map__)\n", - "\n" - ] - } - ], + "outputs": [], "source": [ - "print(ag.to_code(gol))" + "print(tf.autograph.to_code(gol.python_function))" ] } ], "metadata": { "colab": { - "collapsed_sections": [ - "p8zZyj-tq4K3", - "Lkq3DBGOv3fA", - "r8_0ioEuAI-a", - "7NgrSPCZxs3h" - ], - "default_view": {}, + "collapsed_sections": [], "last_runtime": { "build_target": "", "kind": "local" @@ -1503,8 +923,11 @@ "timestamp": 1528465909719 } ], - "version": "0.3.2", - "views": {} + "version": "0.3.2" + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" } }, "nbformat": 4, diff --git a/tensorflow/contrib/bigtable/kernels/bigtable_kernels.cc b/tensorflow/contrib/bigtable/kernels/bigtable_kernels.cc index 51b27ea4212..1e6de7ee17e 100644 --- a/tensorflow/contrib/bigtable/kernels/bigtable_kernels.cc +++ b/tensorflow/contrib/bigtable/kernels/bigtable_kernels.cc @@ -214,8 +214,8 @@ class ToBigtableOp : public AsyncOpKernel { std::vector columns; columns.reserve(column_families_tensor->NumElements()); for (uint64 i = 0; i < column_families_tensor->NumElements(); ++i) { - column_families.push_back(column_families_tensor->flat()(i)); - columns.push_back(columns_tensor->flat()(i)); + column_families.push_back(column_families_tensor->flat()(i)); + columns.push_back(columns_tensor->flat()(i)); } DatasetBase* dataset; @@ -317,7 +317,7 @@ class ToBigtableOp : public AsyncOpKernel { "Iterator produced a set of Tensors shorter than expected"); } ::google::cloud::bigtable::SingleRowMutation mutation( - std::move(tensors[0].scalar()())); + std::move(tensors[0].scalar()())); std::chrono::milliseconds timestamp(timestamp_int); for (size_t i = 1; i < tensors.size(); ++i) { if (!TensorShapeUtils::IsScalar(tensors[i].shape())) { @@ -326,11 +326,11 @@ class ToBigtableOp : public AsyncOpKernel { if (timestamp_int == -1) { mutation.emplace_back(::google::cloud::bigtable::SetCell( column_families[i - 1], columns[i - 1], - std::move(tensors[i].scalar()()))); + 
std::move(tensors[i].scalar()()))); } else { mutation.emplace_back(::google::cloud::bigtable::SetCell( column_families[i - 1], columns[i - 1], timestamp, - std::move(tensors[i].scalar()()))); + std::move(tensors[i].scalar()()))); } } bulk_mutation->emplace_back(std::move(mutation)); diff --git a/tensorflow/contrib/bigtable/kernels/bigtable_lib.cc b/tensorflow/contrib/bigtable/kernels/bigtable_lib.cc index 01cedd8d762..13658558bc0 100644 --- a/tensorflow/contrib/bigtable/kernels/bigtable_lib.cc +++ b/tensorflow/contrib/bigtable/kernels/bigtable_lib.cc @@ -67,9 +67,9 @@ Status GcpStatusToTfStatus(const ::google::cloud::Status& status) { strings::StrCat("Error reading from Cloud Bigtable: ", status.message())); } -string RegexFromStringSet(const std::vector& strs) { +string RegexFromStringSet(const std::vector& strs) { CHECK(!strs.empty()) << "The list of strings to turn into a regex was empty."; - std::unordered_set uniq(strs.begin(), strs.end()); + std::unordered_set uniq(strs.begin(), strs.end()); if (uniq.size() == 1) { return *uniq.begin(); } diff --git a/tensorflow/contrib/bigtable/kernels/bigtable_lib.h b/tensorflow/contrib/bigtable/kernels/bigtable_lib.h index 1325560e772..ce2bea0d759 100644 --- a/tensorflow/contrib/bigtable/kernels/bigtable_lib.h +++ b/tensorflow/contrib/bigtable/kernels/bigtable_lib.h @@ -25,7 +25,7 @@ namespace tensorflow { Status GcpStatusToTfStatus(const ::google::cloud::Status& status); -string RegexFromStringSet(const std::vector& strs); +string RegexFromStringSet(const std::vector& strs); class BigtableClientResource : public ResourceBase { public: @@ -115,6 +115,15 @@ class BigtableReaderDatasetIterator : public DatasetIterator { const ::google::cloud::bigtable::Row& row, std::vector* out_tensors) = 0; + Status SaveInternal(IteratorStateWriter* writer) override { + return errors::Unimplemented("SaveInternal is currently not supported"); + } + + Status RestoreInternal(IteratorContext* ctx, + IteratorStateReader* reader) override { + return errors::Unimplemented("RestoreInternal is currently not supported"); + } + private: Status EnsureIteratorInitialized() EXCLUSIVE_LOCKS_REQUIRED(mu_) { if (reader_) { diff --git a/tensorflow/contrib/bigtable/kernels/bigtable_lookup_dataset_op.cc b/tensorflow/contrib/bigtable/kernels/bigtable_lookup_dataset_op.cc index 8039ef8cd77..a69936236be 100644 --- a/tensorflow/contrib/bigtable/kernels/bigtable_lookup_dataset_op.cc +++ b/tensorflow/contrib/bigtable/kernels/bigtable_lookup_dataset_op.cc @@ -29,11 +29,11 @@ class BigtableLookupDatasetOp : public UnaryDatasetOpKernel { core::RefCountPtr table; OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 1), &table)); - std::vector column_families; - std::vector columns; - OP_REQUIRES_OK(ctx, ParseVectorArgument(ctx, "column_families", - &column_families)); - OP_REQUIRES_OK(ctx, ParseVectorArgument(ctx, "columns", &columns)); + std::vector column_families; + std::vector columns; + OP_REQUIRES_OK(ctx, ParseVectorArgument(ctx, "column_families", + &column_families)); + OP_REQUIRES_OK(ctx, ParseVectorArgument(ctx, "columns", &columns)); OP_REQUIRES( ctx, column_families.size() == columns.size(), errors::InvalidArgument("len(columns) != len(column_families)")); @@ -58,8 +58,8 @@ class BigtableLookupDatasetOp : public UnaryDatasetOpKernel { public: explicit Dataset(OpKernelContext* ctx, const DatasetBase* input, BigtableTableResource* table, - std::vector column_families, - std::vector columns, + std::vector column_families, + std::vector columns, const DataTypeVector& 
output_types, std::vector output_shapes) : DatasetBase(DatasetContext(ctx)), @@ -97,18 +97,23 @@ class BigtableLookupDatasetOp : public UnaryDatasetOpKernel { return "BigtableLookupDatasetOp::Dataset"; } + Status CheckExternalState() const override { + return errors::FailedPrecondition(DebugString(), + " depends on external state."); + } + protected: Status AsGraphDefInternal(SerializationContext* ctx, DatasetGraphDefBuilder* b, Node** output) const override { - return errors::Unimplemented("%s does not support serialization", - DebugString()); + return errors::Unimplemented(DebugString(), + " does not support serialization"); } private: static ::google::cloud::bigtable::Filter MakeFilter( - const std::vector& column_families, - const std::vector& columns) { + const std::vector& column_families, + const std::vector& columns) { string column_family_regex = RegexFromStringSet(column_families); string column_regex = RegexFromStringSet(columns); @@ -154,13 +159,13 @@ class BigtableLookupDatasetOp : public UnaryDatasetOpKernel { ::google::cloud::StatusOr< std::pair> row = dataset()->table_->table().ReadRow( - input_tensors[0].scalar()(), dataset()->filter_); + input_tensors[0].scalar()(), dataset()->filter_); if (!row.ok()) { return GcpStatusToTfStatus(row.status()); } if (!row->first) { return errors::DataLoss("Row key '", - input_tensors[0].scalar()(), + input_tensors[0].scalar()(), "' not found."); } TF_RETURN_IF_ERROR(ParseRow(ctx, row->second, out_tensors)); @@ -172,13 +177,24 @@ class BigtableLookupDatasetOp : public UnaryDatasetOpKernel { return Status::OK(); } + protected: + Status SaveInternal(IteratorStateWriter* writer) override { + return errors::Unimplemented("SaveInternal is currently not supported"); + } + + Status RestoreInternal(IteratorContext* ctx, + IteratorStateReader* reader) override { + return errors::Unimplemented( + "RestoreInternal is currently not supported"); + } + private: Status ParseRow(IteratorContext* ctx, const ::google::cloud::bigtable::Row& row, std::vector* out_tensors) { out_tensors->reserve(dataset()->columns_.size() + 1); Tensor row_key_tensor(ctx->allocator({}), DT_STRING, {}); - row_key_tensor.scalar()() = string(row.row_key()); + row_key_tensor.scalar()() = tstring(row.row_key()); out_tensors->emplace_back(std::move(row_key_tensor)); if (row.cells().size() > 2 * dataset()->columns_.size()) { @@ -194,9 +210,9 @@ class BigtableLookupDatasetOp : public UnaryDatasetOpKernel { for (auto cell_itr = row.cells().begin(); !found_column && cell_itr != row.cells().end(); ++cell_itr) { if (cell_itr->family_name() == dataset()->column_families_[i] && - string(cell_itr->column_qualifier()) == + tstring(cell_itr->column_qualifier()) == dataset()->columns_[i]) { - col_tensor.scalar()() = string(cell_itr->value()); + col_tensor.scalar()() = tstring(cell_itr->value()); found_column = true; } } @@ -216,8 +232,8 @@ class BigtableLookupDatasetOp : public UnaryDatasetOpKernel { const DatasetBase* const input_; BigtableTableResource* table_; - const std::vector column_families_; - const std::vector columns_; + const std::vector column_families_; + const std::vector columns_; const DataTypeVector output_types_; const std::vector output_shapes_; const ::google::cloud::bigtable::Filter filter_; diff --git a/tensorflow/contrib/bigtable/kernels/bigtable_prefix_key_dataset_op.cc b/tensorflow/contrib/bigtable/kernels/bigtable_prefix_key_dataset_op.cc index e9d4a1e05ea..6af5c6d0fc2 100644 --- a/tensorflow/contrib/bigtable/kernels/bigtable_prefix_key_dataset_op.cc +++ 
b/tensorflow/contrib/bigtable/kernels/bigtable_prefix_key_dataset_op.cc @@ -26,8 +26,8 @@ class BigtablePrefixKeyDatasetOp : public DatasetOpKernel { using DatasetOpKernel::DatasetOpKernel; void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override { - string prefix; - OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "prefix", &prefix)); + tstring prefix; + OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "prefix", &prefix)); core::RefCountPtr resource; OP_REQUIRES_OK(ctx, @@ -71,12 +71,17 @@ class BigtablePrefixKeyDatasetOp : public DatasetOpKernel { BigtableTableResource* table() const { return table_; } + Status CheckExternalState() const override { + return errors::FailedPrecondition(DebugString(), + " depends on external state."); + } + protected: Status AsGraphDefInternal(SerializationContext* ctx, DatasetGraphDefBuilder* b, Node** output) const override { - return errors::Unimplemented("%s does not support serialization", - DebugString()); + return errors::Unimplemented(DebugString(), + " does not support serialization"); } private: @@ -97,7 +102,7 @@ class BigtablePrefixKeyDatasetOp : public DatasetOpKernel { const ::google::cloud::bigtable::Row& row, std::vector* out_tensors) override { Tensor output_tensor(ctx->allocator({}), DT_STRING, {}); - output_tensor.scalar()() = string(row.row_key()); + output_tensor.scalar()() = tstring(row.row_key()); out_tensors->emplace_back(std::move(output_tensor)); return Status::OK(); } diff --git a/tensorflow/contrib/bigtable/kernels/bigtable_range_key_dataset_op.cc b/tensorflow/contrib/bigtable/kernels/bigtable_range_key_dataset_op.cc index be3c7cc5f38..22f7ddfe15d 100644 --- a/tensorflow/contrib/bigtable/kernels/bigtable_range_key_dataset_op.cc +++ b/tensorflow/contrib/bigtable/kernels/bigtable_range_key_dataset_op.cc @@ -26,11 +26,11 @@ class BigtableRangeKeyDatasetOp : public DatasetOpKernel { using DatasetOpKernel::DatasetOpKernel; void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override { - string start_key; + tstring start_key; OP_REQUIRES_OK(ctx, - ParseScalarArgument(ctx, "start_key", &start_key)); - string end_key; - OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "end_key", &end_key)); + ParseScalarArgument(ctx, "start_key", &start_key)); + tstring end_key; + OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "end_key", &end_key)); core::RefCountPtr resource; OP_REQUIRES_OK(ctx, @@ -76,12 +76,17 @@ class BigtableRangeKeyDatasetOp : public DatasetOpKernel { BigtableTableResource* table() const { return table_; } + Status CheckExternalState() const override { + return errors::FailedPrecondition(DebugString(), + " depends on external state."); + } + protected: Status AsGraphDefInternal(SerializationContext* ctx, DatasetGraphDefBuilder* b, Node** output) const override { - return errors::Unimplemented("%s does not support serialization", - DebugString()); + return errors::Unimplemented(DebugString(), + " does not support serialization"); } private: @@ -103,7 +108,7 @@ class BigtableRangeKeyDatasetOp : public DatasetOpKernel { const ::google::cloud::bigtable::Row& row, std::vector* out_tensors) override { Tensor output_tensor(ctx->allocator({}), DT_STRING, {}); - output_tensor.scalar()() = string(row.row_key()); + output_tensor.scalar()() = tstring(row.row_key()); out_tensors->emplace_back(std::move(output_tensor)); return Status::OK(); } diff --git a/tensorflow/contrib/bigtable/kernels/bigtable_sample_key_pairs_dataset_op.cc b/tensorflow/contrib/bigtable/kernels/bigtable_sample_key_pairs_dataset_op.cc index 880f5e40f25..08bf35f6c23 
100644 --- a/tensorflow/contrib/bigtable/kernels/bigtable_sample_key_pairs_dataset_op.cc +++ b/tensorflow/contrib/bigtable/kernels/bigtable_sample_key_pairs_dataset_op.cc @@ -27,14 +27,14 @@ class BigtableSampleKeyPairsDatasetOp : public DatasetOpKernel { using DatasetOpKernel::DatasetOpKernel; void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override { - string prefix; - OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "prefix", &prefix)); + tstring prefix; + OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "prefix", &prefix)); - string start_key; + tstring start_key; OP_REQUIRES_OK(ctx, - ParseScalarArgument(ctx, "start_key", &start_key)); - string end_key; - OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "end_key", &end_key)); + ParseScalarArgument(ctx, "start_key", &start_key)); + tstring end_key; + OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "end_key", &end_key)); core::RefCountPtr resource; OP_REQUIRES_OK(ctx, @@ -89,12 +89,17 @@ class BigtableSampleKeyPairsDatasetOp : public DatasetOpKernel { return "BigtableSampleKeyPairsDatasetOp::Dataset"; } + Status CheckExternalState() const override { + return errors::FailedPrecondition(DebugString(), + " depends on external state."); + } + protected: Status AsGraphDefInternal(SerializationContext* ctx, DatasetGraphDefBuilder* b, Node** output) const override { - return errors::Unimplemented("%s does not support serialization", - DebugString()); + return errors::Unimplemented(DebugString(), + " does not support serialization"); } private: @@ -175,16 +180,27 @@ class BigtableSampleKeyPairsDatasetOp : public DatasetOpKernel { *end_of_sequence = false; out_tensors->emplace_back(ctx->allocator({}), DT_STRING, TensorShape({})); - out_tensors->back().scalar()() = keys_[index_]; + out_tensors->back().scalar()() = keys_[index_]; out_tensors->emplace_back(ctx->allocator({}), DT_STRING, TensorShape({})); - out_tensors->back().scalar()() = keys_[index_ + 1]; + out_tensors->back().scalar()() = keys_[index_ + 1]; ++index_; return Status::OK(); } + protected: + Status SaveInternal(IteratorStateWriter* writer) override { + return errors::Unimplemented("SaveInternal is currently not supported"); + } + + Status RestoreInternal(IteratorContext* ctx, + IteratorStateReader* reader) override { + return errors::Unimplemented( + "RestoreInternal is currently not supported"); + } + private: mutex mu_; size_t index_ GUARDED_BY(mu_) = 0; diff --git a/tensorflow/contrib/bigtable/kernels/bigtable_sample_keys_dataset_op.cc b/tensorflow/contrib/bigtable/kernels/bigtable_sample_keys_dataset_op.cc index 53be3b5a2bb..f4498305aa2 100644 --- a/tensorflow/contrib/bigtable/kernels/bigtable_sample_keys_dataset_op.cc +++ b/tensorflow/contrib/bigtable/kernels/bigtable_sample_keys_dataset_op.cc @@ -64,12 +64,17 @@ class BigtableSampleKeysDatasetOp : public DatasetOpKernel { BigtableTableResource* table() const { return table_; } + Status CheckExternalState() const override { + return errors::FailedPrecondition(DebugString(), + " depends on external state."); + } + protected: Status AsGraphDefInternal(SerializationContext* ctx, DatasetGraphDefBuilder* b, Node** output) const override { - return errors::Unimplemented("%s does not support serialization", - DebugString()); + return errors::Unimplemented(DebugString(), + " does not support serialization"); } private: @@ -97,8 +102,8 @@ class BigtableSampleKeysDatasetOp : public DatasetOpKernel { if (index_ < row_keys_.size()) { out_tensors->emplace_back(ctx->allocator({}), DT_STRING, TensorShape({})); - out_tensors->back().scalar()() = 
- string(row_keys_[index_].row_key); + out_tensors->back().scalar()() = + tstring(row_keys_[index_].row_key); *end_of_sequence = false; index_++; } else { @@ -107,6 +112,17 @@ class BigtableSampleKeysDatasetOp : public DatasetOpKernel { return Status::OK(); } + protected: + Status SaveInternal(IteratorStateWriter* writer) override { + return errors::Unimplemented("SaveInternal is currently not supported"); + } + + Status RestoreInternal(IteratorContext* ctx, + IteratorStateReader* reader) override { + return errors::Unimplemented( + "RestoreInternal is currently not supported"); + } + private: mutex mu_; size_t index_ = 0; diff --git a/tensorflow/contrib/bigtable/kernels/bigtable_scan_dataset_op.cc b/tensorflow/contrib/bigtable/kernels/bigtable_scan_dataset_op.cc index e68c83ed547..d2b6959fef5 100644 --- a/tensorflow/contrib/bigtable/kernels/bigtable_scan_dataset_op.cc +++ b/tensorflow/contrib/bigtable/kernels/bigtable_scan_dataset_op.cc @@ -26,13 +26,13 @@ class BigtableScanDatasetOp : public DatasetOpKernel { using DatasetOpKernel::DatasetOpKernel; void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override { - string prefix; - OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "prefix", &prefix)); - string start_key; + tstring prefix; + OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "prefix", &prefix)); + tstring start_key; OP_REQUIRES_OK(ctx, - ParseScalarArgument(ctx, "start_key", &start_key)); - string end_key; - OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "end_key", &end_key)); + ParseScalarArgument(ctx, "start_key", &start_key)); + tstring end_key; + OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "end_key", &end_key)); OP_REQUIRES(ctx, !(prefix.empty() && start_key.empty()), errors::InvalidArgument( @@ -46,11 +46,11 @@ class BigtableScanDatasetOp : public DatasetOpKernel { "If prefix is specified, end_key must be empty.")); } - std::vector column_families; - std::vector columns; - OP_REQUIRES_OK(ctx, ParseVectorArgument(ctx, "column_families", - &column_families)); - OP_REQUIRES_OK(ctx, ParseVectorArgument(ctx, "columns", &columns)); + std::vector column_families; + std::vector columns; + OP_REQUIRES_OK(ctx, ParseVectorArgument(ctx, "column_families", + &column_families)); + OP_REQUIRES_OK(ctx, ParseVectorArgument(ctx, "columns", &columns)); OP_REQUIRES( ctx, column_families.size() == columns.size(), errors::InvalidArgument("len(columns) != len(column_families)")); @@ -90,8 +90,8 @@ class BigtableScanDatasetOp : public DatasetOpKernel { public: explicit Dataset(OpKernelContext* ctx, BigtableTableResource* table, string prefix, string start_key, string end_key, - std::vector column_families, - std::vector columns, float probability, + std::vector column_families, + std::vector columns, float probability, const DataTypeVector& output_types, std::vector output_shapes) : DatasetBase(DatasetContext(ctx)), @@ -131,12 +131,17 @@ class BigtableScanDatasetOp : public DatasetOpKernel { BigtableTableResource* table() const { return table_; } + Status CheckExternalState() const override { + return errors::FailedPrecondition(DebugString(), + " depends on external state."); + } + protected: Status AsGraphDefInternal(SerializationContext* ctx, DatasetGraphDefBuilder* b, Node** output) const override { - return errors::Unimplemented("%s does not support serialization", - DebugString()); + return errors::Unimplemented(DebugString(), + " does not support serialization"); } private: @@ -175,7 +180,7 @@ class BigtableScanDatasetOp : public DatasetOpKernel { std::vector* out_tensors) override { 
out_tensors->reserve(dataset()->columns_.size() + 1); Tensor row_key_tensor(ctx->allocator({}), DT_STRING, {}); - row_key_tensor.scalar()() = string(row.row_key()); + row_key_tensor.scalar()() = tstring(row.row_key()); out_tensors->emplace_back(std::move(row_key_tensor)); if (row.cells().size() > 2 * dataset()->columns_.size()) { @@ -191,9 +196,9 @@ class BigtableScanDatasetOp : public DatasetOpKernel { for (auto cell_itr = row.cells().begin(); !found_column && cell_itr != row.cells().end(); ++cell_itr) { if (cell_itr->family_name() == dataset()->column_families_[i] && - string(cell_itr->column_qualifier()) == + tstring(cell_itr->column_qualifier()) == dataset()->columns_[i]) { - col_tensor.scalar()() = string(cell_itr->value()); + col_tensor.scalar()() = tstring(cell_itr->value()); found_column = true; } } @@ -212,8 +217,8 @@ class BigtableScanDatasetOp : public DatasetOpKernel { const string prefix_; const string start_key_; const string end_key_; - const std::vector column_families_; - const std::vector columns_; + const std::vector column_families_; + const std::vector columns_; const string column_family_regex_; const string column_regex_; const float probability_; diff --git a/tensorflow/contrib/bigtable/python/ops/bigtable_api.py b/tensorflow/contrib/bigtable/python/ops/bigtable_api.py index 4f1d7990ce6..e55c0dc7806 100644 --- a/tensorflow/contrib/bigtable/python/ops/bigtable_api.py +++ b/tensorflow/contrib/bigtable/python/ops/bigtable_api.py @@ -476,7 +476,7 @@ class BigtableTable(object): if tensor_type != dtypes.string: raise ValueError("Not all elements of the dataset were `tf.string`") for shape in nest.flatten(dataset_ops.get_legacy_output_shapes(dataset)): - if not shape.is_compatible_with(tensor_shape.scalar()): + if not shape.is_compatible_with(tensor_shape.TensorShape([])): raise ValueError("Not all elements of the dataset were scalars") if len(column_families) != len(columns): raise ValueError("len(column_families) != len(columns)") diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py b/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py index 5a8b2ba9caf..60f92a0ff25 100644 --- a/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py +++ b/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py @@ -20,8 +20,10 @@ from __future__ import print_function import tempfile import numpy as np +from google.protobuf import text_format from tensorflow.contrib.boosted_trees.estimator_batch import estimator from tensorflow.contrib.boosted_trees.proto import learner_pb2 +from tensorflow.contrib.boosted_trees.proto import tree_config_pb2 from tensorflow.contrib.layers.python.layers import feature_column as contrib_feature_column from tensorflow.contrib.learn.python.learn.estimators import run_config from tensorflow.python.estimator.canned import head as head_lib @@ -137,6 +139,15 @@ class BoostedTreeEstimatorTest(test_util.TensorFlowTestCase): self._export_dir_base = tempfile.mkdtemp() + "export/" gfile.MkDir(self._export_dir_base) + def _assert_checkpoint_and_return_model(self, model_dir, global_step): + reader = checkpoint_utils.load_checkpoint(model_dir) + self.assertEqual(global_step, reader.get_tensor(ops.GraphKeys.GLOBAL_STEP)) + serialized = reader.get_tensor("ensemble_model:0_config") + ensemble_proto = tree_config_pb2.DecisionTreeEnsembleConfig() + ensemble_proto.ParseFromString(serialized) + + return ensemble_proto + def _assert_checkpoint(self, model_dir, global_step): reader = 
checkpoint_utils.load_checkpoint(model_dir) self.assertEqual(global_step, reader.get_tensor(ops.GraphKeys.GLOBAL_STEP)) @@ -404,8 +415,8 @@ class BoostedTreeEstimatorTest(test_util.TensorFlowTestCase): learner_config.constraints.min_node_weight = 1 / _QUANTILE_REGRESSION_SIZE learner_config.regularization.l2 = 1.0 / _QUANTILE_REGRESSION_SIZE learner_config.regularization.l1 = 1.0 / _QUANTILE_REGRESSION_SIZE - learner_config.regularization.tree_complexity = ( - 1.0 / _QUANTILE_REGRESSION_SIZE) + learner_config.regularization.tree_complexity = (1.0 / + _QUANTILE_REGRESSION_SIZE) train_input_fn, test_input_fn, y = _quantile_regression_input_fns() @@ -437,8 +448,8 @@ class BoostedTreeEstimatorTest(test_util.TensorFlowTestCase): learner_config.constraints.min_node_weight = 1 / _QUANTILE_REGRESSION_SIZE learner_config.regularization.l2 = 1.0 / _QUANTILE_REGRESSION_SIZE learner_config.regularization.l1 = 1.0 / _QUANTILE_REGRESSION_SIZE - learner_config.regularization.tree_complexity = ( - 1.0 / _QUANTILE_REGRESSION_SIZE) + learner_config.regularization.tree_complexity = (1.0 / + _QUANTILE_REGRESSION_SIZE) train_input_fn, test_input_fn, y = _quantile_regression_input_fns( two_dimension=True) @@ -471,6 +482,329 @@ class BoostedTreeEstimatorTest(test_util.TensorFlowTestCase): self.assertTrue(frac_both_below_upper >= 0.91) self.assertTrue(frac_both_below_upper <= 0.99) + def testForcedInitialSplits(self): + learner_config = learner_pb2.LearnerConfig() + learner_config.num_classes = 2 + learner_config.constraints.max_tree_depth = 3 + + initial_subtree = """ + nodes { + dense_float_binary_split { + feature_column: 0 + threshold: -0.5 + left_id: 1 + right_id: 2 + } + node_metadata { + gain: 0 + } + } + nodes { + dense_float_binary_split { + feature_column: 0 + threshold: 0.52 + left_id: 3 + right_id: 4 + } + node_metadata { + gain: 0 + } + } + nodes { + dense_float_binary_split { + feature_column: 0 + threshold: 0.554 + left_id: 5 + right_id: 6 + } + node_metadata { + gain: 0 + } + } + nodes { + leaf { + vector { + value: 0.0 + } + } + } + nodes { + leaf { + vector { + value: 0.0 + } + } + } + nodes { + leaf { + vector { + value: 0.0 + } + } + } + nodes { + leaf { + vector { + value: 0.0 + } + } + } + """ + tree_proto = tree_config_pb2.DecisionTreeConfig() + text_format.Merge(initial_subtree, tree_proto) + + # Set initial subtree info. + learner_config.each_tree_start.CopyFrom(tree_proto) + learner_config.each_tree_start_num_layers = 2 + + model_dir = tempfile.mkdtemp() + config = run_config.RunConfig() + + classifier = estimator.GradientBoostedDecisionTreeClassifier( + learner_config=learner_config, + num_trees=2, + examples_per_layer=6, + model_dir=model_dir, + config=config, + center_bias=False, + feature_columns=[contrib_feature_column.real_valued_column("x")], + output_leaf_index=False) + + classifier.fit(input_fn=_train_input_fn, steps=100) + # When no override of global steps, 5 steps were used. + ensemble = self._assert_checkpoint_and_return_model( + classifier.model_dir, global_step=6) + + # TODO(nponomareva): find a better way to test this. 
+ expected_ensemble = """ + trees { + nodes { + dense_float_binary_split { + threshold: -0.5 + left_id: 1 + right_id: 2 + } + node_metadata { + } + } + nodes { + dense_float_binary_split { + threshold: 0.519999980927 + left_id: 3 + right_id: 4 + } + node_metadata { + } + } + nodes { + dense_float_binary_split { + threshold: 0.554000020027 + left_id: 5 + right_id: 6 + } + node_metadata { + } + } + nodes { + leaf { + vector { + value: 0.0 + } + } + } + nodes { + leaf { + vector { + value: 0.0 + } + } + } + nodes { + leaf { + vector { + value: 0.0 + } + } + } + nodes { + dense_float_binary_split { + threshold: 1.0 + left_id: 7 + right_id: 8 + } + node_metadata { + gain: 0.888888895512 + } + } + nodes { + leaf { + vector { + value: -2.0 + } + } + } + nodes { + leaf { + vector { + value: 2.00000023842 + } + } + } + } + trees { + nodes { + dense_float_binary_split { + threshold: -0.5 + left_id: 1 + right_id: 2 + } + node_metadata { + } + } + nodes { + dense_float_binary_split { + threshold: 0.519999980927 + left_id: 3 + right_id: 4 + } + node_metadata { + } + } + nodes { + dense_float_binary_split { + threshold: 0.554000020027 + left_id: 5 + right_id: 6 + } + node_metadata { + } + } + nodes { + leaf { + vector { + value: 0.0 + } + } + } + nodes { + leaf { + vector { + value: 0.0 + } + } + } + nodes { + leaf { + vector { + value: 0.0 + } + } + } + nodes { + dense_float_binary_split { + threshold: 1.0 + left_id: 7 + right_id: 8 + } + node_metadata { + gain: 0.727760672569 + } + } + nodes { + leaf { + vector { + value: -1.81873059273 + } + } + } + nodes { + leaf { + vector { + value: 1.81873047352 + } + } + } + } + trees { + nodes { + dense_float_binary_split { + threshold: -0.5 + left_id: 1 + right_id: 2 + } + node_metadata { + } + } + nodes { + dense_float_binary_split { + threshold: 0.519999980927 + left_id: 3 + right_id: 4 + } + node_metadata { + } + } + nodes { + dense_float_binary_split { + threshold: 0.554000020027 + left_id: 5 + right_id: 6 + } + node_metadata { + } + } + nodes { + leaf { + vector { + value: 0.0 + } + } + } + nodes { + leaf { + vector { + value: 0.0 + } + } + } + nodes { + leaf { + vector { + value: 0.0 + } + } + } + nodes { + leaf { + vector { + value: 0.0 + } + } + } + } + tree_weights: 0.10000000149 + tree_weights: 0.10000000149 + tree_weights: 0.10000000149 + tree_metadata { + num_tree_weight_updates: 1 + num_layers_grown: 3 + is_finalized: true + } + tree_metadata { + num_tree_weight_updates: 1 + num_layers_grown: 3 + is_finalized: true + } + tree_metadata { + num_tree_weight_updates: 1 + num_layers_grown: 2 + } + growing_metadata { + num_layers_attempted: 3 + } + """ + self.assertProtoEquals(expected_ensemble, ensemble) + class CoreGradientBoostedDecisionTreeEstimators(test_util.TensorFlowTestCase): @@ -674,8 +1008,8 @@ class CoreGradientBoostedDecisionTreeEstimators(test_util.TensorFlowTestCase): learner_config.constraints.min_node_weight = 1 / _QUANTILE_REGRESSION_SIZE learner_config.regularization.l2 = 1.0 / _QUANTILE_REGRESSION_SIZE learner_config.regularization.l1 = 1.0 / _QUANTILE_REGRESSION_SIZE - learner_config.regularization.tree_complexity = ( - 1.0 / _QUANTILE_REGRESSION_SIZE) + learner_config.regularization.tree_complexity = (1.0 / + _QUANTILE_REGRESSION_SIZE) train_input_fn, test_input_fn, y = _quantile_regression_input_fns() y = y.reshape(_QUANTILE_REGRESSION_SIZE, 1) diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/model.py b/tensorflow/contrib/boosted_trees/estimator_batch/model.py index 07fa4ca684b..477b191bcb7 100644 --- 
a/tensorflow/contrib/boosted_trees/estimator_batch/model.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/model.py
@@ -29,6 +29,9 @@ from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.training import training_util
+from google.protobuf import text_format
+from tensorflow.contrib.boosted_trees.proto import tree_config_pb2
+

 class ModelBuilderOutputType(object):
   MODEL_FN_OPS = 0
@@ -106,10 +109,30 @@ def model_builder(features,
   training_features = copy.copy(features)
   training_features.pop(weight_column_name, None)
   global_step = training_util.get_global_step()
+
+  initial_ensemble = ""
+  if learner_config.each_tree_start.nodes:
+    if learner_config.each_tree_start_num_layers <= 0:
+      raise ValueError("You must provide each_tree_start_num_layers.")
+    num_layers = learner_config.each_tree_start_num_layers
+    initial_ensemble = """
+        trees { %s }
+        tree_weights: 0.1
+        tree_metadata {
+          num_tree_weight_updates: 1
+          num_layers_grown: %d
+          is_finalized: false
+        }
+        """ % (text_format.MessageToString(
+            learner_config.each_tree_start), num_layers)
+    tree_ensemble_proto = tree_config_pb2.DecisionTreeEnsembleConfig()
+    text_format.Merge(initial_ensemble, tree_ensemble_proto)
+    initial_ensemble = tree_ensemble_proto.SerializeToString()
+
   with ops.device(global_step.device):
     ensemble_handle = model_ops.tree_ensemble_variable(
         stamp_token=0,
-        tree_ensemble_config="",  # Initialize an empty ensemble.
+        tree_ensemble_config=initial_ensemble,  # Initialize the ensemble.
         name="ensemble_model")

   # Create GBDT model.
diff --git a/tensorflow/contrib/boosted_trees/kernels/model_ops.cc b/tensorflow/contrib/boosted_trees/kernels/model_ops.cc
index 9655e49d91b..5f9976a491c 100644
--- a/tensorflow/contrib/boosted_trees/kernels/model_ops.cc
+++ b/tensorflow/contrib/boosted_trees/kernels/model_ops.cc
@@ -46,7 +46,7 @@ class CreateTreeEnsembleVariableOp : public OpKernel {
     OP_REQUIRES_OK(context, context->input("tree_ensemble_config",
                                             &tree_ensemble_config_t));
     auto* result = new DecisionTreeEnsembleResource();
-    if (!result->InitFromSerialized(tree_ensemble_config_t->scalar<string>()(),
+    if (!result->InitFromSerialized(tree_ensemble_config_t->scalar<tstring>()(),
                                     stamp_token)) {
       result->Unref();
       OP_REQUIRES(
@@ -99,7 +99,7 @@ class TreeEnsembleSerializeOp : public OpKernel {
     Tensor* output_config_t = nullptr;
     OP_REQUIRES_OK(
         context, context->allocate_output(1, TensorShape(), &output_config_t));
-    output_config_t->scalar<string>()() =
+    output_config_t->scalar<tstring>()() =
         ensemble_resource->SerializeAsString();
   }
 };
@@ -130,7 +130,7 @@ class TreeEnsembleDeserializeOp : public OpKernel {
     OP_REQUIRES(
         context,
         ensemble_resource->InitFromSerialized(
-            tree_ensemble_config_t->scalar<string>()(), stamp_token),
+            tree_ensemble_config_t->scalar<tstring>()(), stamp_token),
         errors::InvalidArgument("Unable to parse tree ensemble config."));
   }
 };
diff --git a/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc b/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc
index 431dc68836b..ee31a4b72c8 100644
--- a/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc
+++ b/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc
@@ -324,7 +324,7 @@ class QuantileAccumulatorAddSummariesOp : public OpKernel {
           context,
           ParseProtoUnlimited(
               summary_proto,
-              summary_list[resource_handle_idx].scalar<string>()()),
+              summary_list[resource_handle_idx].scalar<tstring>()()),
           errors::InvalidArgument("Unable to parse quantile summary."));
       std::vector entries;
       entries.reserve(summary_proto->entries_size());
@@ -398,7 +398,7 @@ class MakeQuantileSummariesOp : public OpKernel {
       // Output to tensor.
       Tensor* output_t = nullptr;
       OP_REQUIRES_OK(context, output_list->allocate(index, {}, &output_t));
-      summary_proto->SerializeToString(&output_t->scalar<string>()());
+      SerializeToTString(*summary_proto, &output_t->scalar<tstring>()());
     };

     // These are blocks of ranges. We are iterating over both sparse and
@@ -494,7 +494,7 @@ class QuantileAccumulatorSerializeOp : public OpKernel {
     for (const auto& summary : stream.SerializeInternalSummaries()) {
       CopySummaryToProto(summary, stream_proto->add_summaries());
     }
-    stream_proto->SerializeToString(&stream_state_t->scalar<string>()());
+    SerializeToTString(*stream_proto, &stream_state_t->scalar<tstring>()());
     Tensor* buckets_t = nullptr;
     OP_REQUIRES_OK(
         context,
@@ -543,7 +543,7 @@ class QuantileAccumulatorDeserializeOp : public OpKernel {
     ::boosted_trees::QuantileStreamState state_proto;
     OP_REQUIRES(
         context,
-        ParseProtoUnlimited(&state_proto, stream_state_t->scalar<string>()()),
+        ParseProtoUnlimited(&state_proto, stream_state_t->scalar<tstring>()()),
         errors::InvalidArgument("Unabnle to parse quantile stream state."));
     std::vector summaries;
     summaries.reserve(state_proto.summaries_size());
@@ -669,7 +669,7 @@ class QuantileAccumulatorFlushSummaryOp : public OpKernel {
     Tensor* output_t = nullptr;
     OP_REQUIRES_OK(context,
                    context->allocate_output(0, TensorShape({}), &output_t));
-    summary_proto->SerializeToString(&output_t->scalar<string>()());
+    SerializeToTString(*summary_proto, &output_t->scalar<tstring>()());
     streams_resource->Reset(next_stamp_token);
   }
 };
diff --git a/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc b/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc
index 65276242aba..0afab357414 100644
--- a/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc
+++ b/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc
@@ -213,8 +213,8 @@ class BuildDenseInequalitySplitsOp : public OpKernel {
     OP_REQUIRES_OK(context, context->allocate_output("split_infos",
                                                      TensorShape({size_output}),
                                                      &output_splits_t));
-    tensorflow::TTypes<string>::Vec output_splits =
-        output_splits_t->vec<string>();
+    tensorflow::TTypes<tstring>::Vec output_splits =
+        output_splits_t->vec<tstring>();

     if (num_elements == 0) {
       return;
@@ -248,7 +248,7 @@ class BuildDenseInequalitySplitsOp : public OpKernel {
       const Tensor* gradients_t, const Tensor* hessians_t,
       tensorflow::TTypes<int32>::Vec* output_partition_ids,
       tensorflow::TTypes<float>::Vec* gains,
-      tensorflow::TTypes<string>::Vec* output_splits) {
+      tensorflow::TTypes<tstring>::Vec* output_splits) {
     for (int root_idx = 0; root_idx < num_elements; ++root_idx) {
       float best_gain = std::numeric_limits<float>::lowest();
       int start_index = partition_boundaries[root_idx];
@@ -293,7 +293,7 @@ class BuildDenseInequalitySplitsOp : public OpKernel {
       state->FillLeaf(best_left_node_stats, left_child);
       state->FillLeaf(best_right_node_stats, right_child);
-      split_info.SerializeToString(&(*output_splits)(root_idx));
+      SerializeToTString(split_info, &(*output_splits)(root_idx));
       (*gains)(root_idx) =
           best_gain - root_stats.gain - state->tree_complexity_regularization();
       (*output_partition_ids)(root_idx) = partition_ids(start_index);
@@ -308,7 +308,7 @@ class BuildDenseInequalitySplitsOp : public OpKernel {
       const Tensor* gradients_t, const Tensor* hessians_t,
       tensorflow::TTypes<int32>::Vec* output_partition_ids,
       tensorflow::TTypes<float>::Vec* gains,
-      tensorflow::TTypes<string>::Vec* output_splits) {
+      tensorflow::TTypes<tstring>::Vec* output_splits) {
     // Holds the root stats per each node to be split.
std::vector current_layer_stats; current_layer_stats.reserve(num_elements); @@ -411,7 +411,7 @@ class BuildDenseInequalitySplitsOp : public OpKernel { (*output_partition_ids)(root_idx) = partition_ids(start_index); oblivious_split_info.add_children_parent_id(partition_ids(start_index)); } - oblivious_split_info.SerializeToString(&(*output_splits)(0)); + SerializeToTString(oblivious_split_info, &(*output_splits)(0)); } }; REGISTER_KERNEL_BUILDER(Name("BuildDenseInequalitySplits").Device(DEVICE_CPU), @@ -529,8 +529,8 @@ class BuildSparseInequalitySplitsOp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output( "split_infos", TensorShape({num_elements}), &output_splits_t)); - tensorflow::TTypes::Vec output_splits = - output_splits_t->vec(); + tensorflow::TTypes::Vec output_splits = + output_splits_t->vec(); SplitBuilderState state(context); // For each tree node that needs to be split. for (int root_idx = 0; root_idx < num_elements; ++root_idx) { @@ -674,7 +674,7 @@ class BuildSparseInequalitySplitsOp : public OpKernel { auto* right_child = split_info.mutable_right_child(); state.FillLeaf(best_left_node_stats, left_child); state.FillLeaf(best_right_node_stats, right_child); - split_info.SerializeToString(&output_splits(root_idx)); + SerializeToTString(split_info, &output_splits(root_idx)); gains(root_idx) = best_gain - root_stats.gain - state.tree_complexity_regularization(); output_partition_ids(root_idx) = partition_ids(bias_start_index); @@ -780,8 +780,8 @@ class BuildCategoricalEqualitySplitsOp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output("split_infos", TensorShape({size_output}), &output_splits_t)); - tensorflow::TTypes::Vec output_splits = - output_splits_t->vec(); + tensorflow::TTypes::Vec output_splits = + output_splits_t->vec(); if (num_elements == 0) { return; } @@ -818,7 +818,7 @@ class BuildCategoricalEqualitySplitsOp : public OpKernel { const Tensor* gradients_t, const Tensor* hessians_t, tensorflow::TTypes::Vec* output_partition_ids, tensorflow::TTypes::Vec* gains, - tensorflow::TTypes::Vec* output_splits) { + tensorflow::TTypes::Vec* output_splits) { for (int root_idx = 0; root_idx < num_elements; ++root_idx) { float best_gain = std::numeric_limits::lowest(); int start_index = partition_boundaries[non_empty_partitions[root_idx]]; @@ -873,7 +873,7 @@ class BuildCategoricalEqualitySplitsOp : public OpKernel { auto* right_child = split_info.mutable_right_child(); state->FillLeaf(best_left_node_stats, left_child); state->FillLeaf(best_right_node_stats, right_child); - split_info.SerializeToString(&(*output_splits)(root_idx)); + SerializeToTString(split_info, &(*output_splits)(root_idx)); (*gains)(root_idx) = best_gain - root_stats.gain - state->tree_complexity_regularization(); (*output_partition_ids)(root_idx) = partition_ids(start_index); @@ -891,7 +891,7 @@ class BuildCategoricalEqualitySplitsOp : public OpKernel { const Tensor* gradients_t, const Tensor* hessians_t, tensorflow::TTypes::Vec* output_partition_ids, tensorflow::TTypes::Vec* gains, - tensorflow::TTypes::Vec* output_splits) { + tensorflow::TTypes::Vec* output_splits) { // Holds the root stats per each node to be split. 
std::vector current_layer_stats; current_layer_stats.reserve(num_elements); @@ -992,7 +992,7 @@ class BuildCategoricalEqualitySplitsOp : public OpKernel { (*output_partition_ids)(root_idx) = partition_ids(start_index); oblivious_split_info.add_children_parent_id(partition_ids(start_index)); } - oblivious_split_info.SerializeToString(&(*output_splits)(0)); + SerializeToTString(oblivious_split_info, &(*output_splits)(0)); } }; diff --git a/tensorflow/contrib/boosted_trees/kernels/training_ops.cc b/tensorflow/contrib/boosted_trees/kernels/training_ops.cc index 91c017839b5..bf5f5d34457 100644 --- a/tensorflow/contrib/boosted_trees/kernels/training_ops.cc +++ b/tensorflow/contrib/boosted_trees/kernels/training_ops.cc @@ -432,6 +432,27 @@ class GrowTreeEnsembleOp : public OpKernel { if (tree_config->nodes_size() <= 0) { ensemble_resource->RemoveLastTree(); } + + if ((ensemble_resource->num_trees() == 0 || + ensemble_resource->LastTreeMetadata()->is_finalized()) && + learner_config_.has_each_tree_start() && + learner_config_.each_tree_start().nodes_size() > 0) { + DCHECK_GT(learner_config_.each_tree_start_num_layers(), 0); + // Add new dummy tree + boosted_trees::trees::DecisionTreeConfig* const tree_config = + ensemble_resource->AddNewTree(learning_rate); + VLOG(1) << "Adding a new forced tree"; + + *tree_config = learner_config_.each_tree_start(); + + boosted_trees::trees::DecisionTreeMetadata* const tree_metadata = + ensemble_resource->LastTreeMetadata(); + + tree_metadata->set_is_finalized(max_tree_depth <= 1); + tree_metadata->set_num_tree_weight_updates(1); + tree_metadata->set_num_layers_grown( + learner_config_.each_tree_start_num_layers()); + } } } @@ -447,7 +468,7 @@ class GrowTreeEnsembleOp : public OpKernel { for (int64 handler_id = 0; handler_id < num_handlers_; ++handler_id) { const auto& partition_ids = partition_ids_list[handler_id].vec(); const auto& gains = gains_list[handler_id].vec(); - const auto& splits = splits_list[handler_id].vec(); + const auto& splits = splits_list[handler_id].vec(); OP_REQUIRES(context, partition_ids.size() == gains.size(), errors::InvalidArgument( "Inconsistent partition Ids and gains tensors: ", @@ -481,7 +502,7 @@ class GrowTreeEnsembleOp : public OpKernel { // Find best split per partition going through every feature candidate. 
for (int64 handler_id = 0; handler_id < num_handlers_; ++handler_id) { const auto& gains = gains_list[handler_id].vec(); - const auto& splits = splits_list[handler_id].vec(); + const auto& splits = splits_list[handler_id].vec(); OP_REQUIRES(context, gains.size() == 1, errors::InvalidArgument( "Gains size must be one for oblivious weak learner: ", diff --git a/tensorflow/contrib/boosted_trees/lib/learner/batch/categorical_split_handler_test.py b/tensorflow/contrib/boosted_trees/lib/learner/batch/categorical_split_handler_test.py index 386dc19fc7b..04dec603667 100644 --- a/tensorflow/contrib/boosted_trees/lib/learner/batch/categorical_split_handler_test.py +++ b/tensorflow/contrib/boosted_trees/lib/learner/batch/categorical_split_handler_test.py @@ -60,8 +60,8 @@ class EqualitySplitHandlerTest(test_util.TensorFlowTestCase): indices = [[0, 0], [0, 1], [2, 0], [3, 0]] values = array_ops.constant([1, 2, 2, 1], dtype=dtypes.int64) - gradient_shape = tensor_shape.scalar() - hessian_shape = tensor_shape.scalar() + gradient_shape = tensor_shape.TensorShape([]) + hessian_shape = tensor_shape.TensorShape([]) class_id = -1 split_handler = categorical_split_handler.EqualitySplitHandler( @@ -183,8 +183,8 @@ class EqualitySplitHandlerTest(test_util.TensorFlowTestCase): indices = [[0, 0], [1, 0], [2, 0], [3, 0]] values = array_ops.constant([1, 2, 1, 2], dtype=dtypes.int64) - gradient_shape = tensor_shape.scalar() - hessian_shape = tensor_shape.scalar() + gradient_shape = tensor_shape.TensorShape([]) + hessian_shape = tensor_shape.TensorShape([]) class_id = -1 split_handler = categorical_split_handler.EqualitySplitHandler( @@ -294,8 +294,8 @@ class EqualitySplitHandlerTest(test_util.TensorFlowTestCase): indices = [[0, 0], [0, 1], [2, 0], [3, 0]] values = array_ops.constant([1, 2, 2, 1], dtype=dtypes.int64) - gradient_shape = tensor_shape.scalar() - hessian_shape = tensor_shape.scalar() + gradient_shape = tensor_shape.TensorShape([]) + hessian_shape = tensor_shape.TensorShape([]) class_id = -1 split_handler = categorical_split_handler.EqualitySplitHandler( @@ -489,8 +489,8 @@ class EqualitySplitHandlerTest(test_util.TensorFlowTestCase): indices = constant_op.constant_v1([], dtype=dtypes.int64, shape=[0, 2]) values = constant_op.constant_v1([], dtype=dtypes.int64) - gradient_shape = tensor_shape.scalar() - hessian_shape = tensor_shape.scalar() + gradient_shape = tensor_shape.TensorShape([]) + hessian_shape = tensor_shape.TensorShape([]) class_id = -1 split_handler = categorical_split_handler.EqualitySplitHandler( @@ -537,8 +537,8 @@ class EqualitySplitHandlerTest(test_util.TensorFlowTestCase): indices = [[0, 0], [0, 1], [2, 0], [3, 0]] values = array_ops.constant([1, 2, 2, 1], dtype=dtypes.int64) - gradient_shape = tensor_shape.scalar() - hessian_shape = tensor_shape.scalar() + gradient_shape = tensor_shape.TensorShape([]) + hessian_shape = tensor_shape.TensorShape([]) class_id = -1 split_handler = categorical_split_handler.EqualitySplitHandler( @@ -591,8 +591,8 @@ class EqualitySplitHandlerTest(test_util.TensorFlowTestCase): indices = [[0, 0], [0, 1], [2, 0]] values = array_ops.constant([1, 2, 2], dtype=dtypes.int64) - gradient_shape = tensor_shape.scalar() - hessian_shape = tensor_shape.scalar() + gradient_shape = tensor_shape.TensorShape([]) + hessian_shape = tensor_shape.TensorShape([]) class_id = -1 split_handler = categorical_split_handler.EqualitySplitHandler( diff --git a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py 
b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py index 0e6a9f8f3a0..75881945fde 100644 --- a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py +++ b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py @@ -75,7 +75,6 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import function from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor -from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import math_ops @@ -261,8 +260,7 @@ class DenseSplitHandler(InequalitySplitHandler): def make_splits(self, stamp_token, next_stamp_token, class_id): """Create the best split using the accumulated stats and flush the state.""" - if (self._gradient_shape == tensor_shape.scalar() and - self._hessian_shape == tensor_shape.scalar()): + if (self._gradient_shape.rank == 0 and self._hessian_shape.rank == 0): handler = make_dense_split_scalar else: handler = make_dense_split_tensor @@ -441,8 +439,7 @@ class SparseSplitHandler(InequalitySplitHandler): def make_splits(self, stamp_token, next_stamp_token, class_id): """Create the best split using the accumulated stats and flush the state.""" - if (self._gradient_shape == tensor_shape.scalar() and - self._hessian_shape == tensor_shape.scalar()): + if self._gradient_shape.rank == 0 and self._hessian_shape.rank == 0: handler = make_sparse_split_scalar else: handler = make_sparse_split_tensor diff --git a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler_test.py b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler_test.py index 4a1b528646e..d41463d002f 100644 --- a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler_test.py +++ b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler_test.py @@ -63,8 +63,8 @@ class DenseSplitHandlerTest(test_util.TensorFlowTestCase): partition_ids = array_ops.constant([0, 0, 0, 1], dtype=dtypes.int32) class_id = -1 - gradient_shape = tensor_shape.scalar() - hessian_shape = tensor_shape.scalar() + gradient_shape = tensor_shape.TensorShape([]) + hessian_shape = tensor_shape.TensorShape([]) split_handler = ordinal_split_handler.DenseSplitHandler( l1_regularization=0.1, l2_regularization=1., @@ -197,8 +197,8 @@ class DenseSplitHandlerTest(test_util.TensorFlowTestCase): partition_ids = array_ops.constant([1, 1, 1, 2], dtype=dtypes.int32) class_id = -1 - gradient_shape = tensor_shape.scalar() - hessian_shape = tensor_shape.scalar() + gradient_shape = tensor_shape.TensorShape([]) + hessian_shape = tensor_shape.TensorShape([]) split_handler = ordinal_split_handler.DenseSplitHandler( l1_regularization=0.1, l2_regularization=1., @@ -333,8 +333,8 @@ class DenseSplitHandlerTest(test_util.TensorFlowTestCase): partition_ids = array_ops.constant([0, 0, 0, 1], dtype=dtypes.int32) class_id = -1 - gradient_shape = tensor_shape.scalar() - hessian_shape = tensor_shape.scalar() + gradient_shape = tensor_shape.TensorShape([]) + hessian_shape = tensor_shape.TensorShape([]) split_handler = ordinal_split_handler.DenseSplitHandler( l1_regularization=0.2, l2_regularization=2., @@ -645,8 +645,8 @@ class DenseSplitHandlerTest(test_util.TensorFlowTestCase): hessians = array_ops.constant([0.12, 0.07, 0.2, 0.13]) partition_ids = array_ops.constant([0, 0, 0, 1], dtype=dtypes.int32) - gradient_shape = 
tensor_shape.scalar() - hessian_shape = tensor_shape.scalar() + gradient_shape = tensor_shape.TensorShape([]) + hessian_shape = tensor_shape.TensorShape([]) class_id = -1 split_handler = ordinal_split_handler.DenseSplitHandler( @@ -720,8 +720,8 @@ class DenseSplitHandlerTest(test_util.TensorFlowTestCase): hessians = array_ops.constant([0.12, 0.07, 0.2, 0.13]) partition_ids = array_ops.constant([0, 0, 0, 1], dtype=dtypes.int32) - gradient_shape = tensor_shape.scalar() - hessian_shape = tensor_shape.scalar() + gradient_shape = tensor_shape.TensorShape([]) + hessian_shape = tensor_shape.TensorShape([]) class_id = -1 split_handler = ordinal_split_handler.DenseSplitHandler( @@ -854,8 +854,8 @@ class DenseSplitHandlerTest(test_util.TensorFlowTestCase): hessians = array_ops.constant([0.12, 0.07, 0.2, 2]) partition_ids = array_ops.constant([0, 0, 0, 1], dtype=dtypes.int32) - gradient_shape = tensor_shape.scalar() - hessian_shape = tensor_shape.scalar() + gradient_shape = tensor_shape.TensorShape([]) + hessian_shape = tensor_shape.TensorShape([]) class_id = -1 split_handler = ordinal_split_handler.DenseSplitHandler( @@ -965,8 +965,8 @@ class SparseSplitHandlerTest(test_util.TensorFlowTestCase): values = array_ops.constant([0.52, 0.3, 0.52]) sparse_column = sparse_tensor.SparseTensor(indices, values, [4, 1]) - gradient_shape = tensor_shape.scalar() - hessian_shape = tensor_shape.scalar() + gradient_shape = tensor_shape.TensorShape([]) + hessian_shape = tensor_shape.TensorShape([]) class_id = -1 split_handler = ordinal_split_handler.SparseSplitHandler( @@ -1088,8 +1088,8 @@ class SparseSplitHandlerTest(test_util.TensorFlowTestCase): values = array_ops.constant([0.52, 0.3, 0.52]) sparse_column = sparse_tensor.SparseTensor(indices, values, [4, 1]) - gradient_shape = tensor_shape.scalar() - hessian_shape = tensor_shape.scalar() + gradient_shape = tensor_shape.TensorShape([]) + hessian_shape = tensor_shape.TensorShape([]) class_id = -1 split_handler = ordinal_split_handler.SparseSplitHandler( @@ -1411,8 +1411,8 @@ class SparseSplitHandlerTest(test_util.TensorFlowTestCase): values = array_ops.constant([0.52, 0.3, 0.52]) sparse_column = sparse_tensor.SparseTensor(indices, values, [4, 1]) - gradient_shape = tensor_shape.scalar() - hessian_shape = tensor_shape.scalar() + gradient_shape = tensor_shape.TensorShape([]) + hessian_shape = tensor_shape.TensorShape([]) class_id = -1 split_handler = ordinal_split_handler.SparseSplitHandler( @@ -1481,8 +1481,8 @@ class SparseSplitHandlerTest(test_util.TensorFlowTestCase): values = constant_op.constant_v1([], dtype=dtypes.float32) sparse_column = sparse_tensor.SparseTensor(indices, values, [4, 1]) - gradient_shape = tensor_shape.scalar() - hessian_shape = tensor_shape.scalar() + gradient_shape = tensor_shape.TensorShape([]) + hessian_shape = tensor_shape.TensorShape([]) class_id = -1 split_handler = ordinal_split_handler.SparseSplitHandler( @@ -1565,8 +1565,8 @@ class SparseSplitHandlerTest(test_util.TensorFlowTestCase): non_empty_indices, non_empty_values, [4, 2]) non_empty_sparse_column = non_empty_sparse_column.eval(session=sess) - gradient_shape = tensor_shape.scalar() - hessian_shape = tensor_shape.scalar() + gradient_shape = tensor_shape.TensorShape([]) + hessian_shape = tensor_shape.TensorShape([]) class_id = -1 split_handler = ordinal_split_handler.SparseSplitHandler( @@ -1650,8 +1650,8 @@ class SparseSplitHandlerTest(test_util.TensorFlowTestCase): values = array_ops.constant([0.58]) sparse_column = sparse_tensor.SparseTensor(indices, values, [1, 1]) - 
gradient_shape = tensor_shape.scalar() - hessian_shape = tensor_shape.scalar() + gradient_shape = tensor_shape.TensorShape([]) + hessian_shape = tensor_shape.TensorShape([]) class_id = -1 split_handler = ordinal_split_handler.SparseSplitHandler( diff --git a/tensorflow/contrib/boosted_trees/proto/BUILD b/tensorflow/contrib/boosted_trees/proto/BUILD index edddc59956a..ca3dd545489 100644 --- a/tensorflow/contrib/boosted_trees/proto/BUILD +++ b/tensorflow/contrib/boosted_trees/proto/BUILD @@ -1,4 +1,4 @@ -load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library") +load("//tensorflow/core/platform:default/build_config.bzl", "tf_proto_library") package( licenses = ["notice"], # Apache 2.0 @@ -12,6 +12,9 @@ tf_proto_library( "learner.proto", ], cc_api_version = 2, + protodeps = [ + ":tree_config_proto", + ], visibility = ["//visibility:public"], ) diff --git a/tensorflow/contrib/boosted_trees/proto/learner.proto b/tensorflow/contrib/boosted_trees/proto/learner.proto index c49cb48cdea..fc5f158c073 100644 --- a/tensorflow/contrib/boosted_trees/proto/learner.proto +++ b/tensorflow/contrib/boosted_trees/proto/learner.proto @@ -1,9 +1,11 @@ syntax = "proto3"; -option cc_enable_arenas = true; - package tensorflow.boosted_trees.learner; +import "tensorflow/contrib/boosted_trees/proto/tree_config.proto"; + +option cc_enable_arenas = true; + // Tree regularization config. message TreeRegularizationConfig { // Classic L1/L2. @@ -149,4 +151,11 @@ message LearnerConfig { // By default we use NORMAL_DECISION_TREE as weak learner. WeakLearnerType weak_learner_type = 12; + + // If you want to enforce some splits and allow boosting to figure out the + // rest, you can provide a tree that represents the starting splits for each + // tree in the ensemble. + // Set both each_tree_start and each_tree_start_num_layers. 
+ tensorflow.boosted_trees.trees.DecisionTreeConfig each_tree_start = 13; + int32 each_tree_start_num_layers = 14; } diff --git a/tensorflow/contrib/boosted_trees/python/kernel_tests/stats_accumulator_ops_test.py b/tensorflow/contrib/boosted_trees/python/kernel_tests/stats_accumulator_ops_test.py index ba459e8b812..d21a0f16621 100644 --- a/tensorflow/contrib/boosted_trees/python/kernel_tests/stats_accumulator_ops_test.py +++ b/tensorflow/contrib/boosted_trees/python/kernel_tests/stats_accumulator_ops_test.py @@ -32,8 +32,8 @@ class StatsAccumulatorScalarTest(test_util.TensorFlowTestCase): with self.cached_session() as sess: accumulator = stats_accumulator_ops.StatsAccumulator( stamp_token=0, - gradient_shape=tensor_shape.scalar(), - hessian_shape=tensor_shape.scalar()) + gradient_shape=tensor_shape.TensorShape([]), + hessian_shape=tensor_shape.TensorShape([])) with ops.control_dependencies([accumulator.initializer]): op1 = accumulator.add( stamp_token=0, @@ -60,8 +60,8 @@ class StatsAccumulatorScalarTest(test_util.TensorFlowTestCase): with self.cached_session() as sess: accumulator = stats_accumulator_ops.StatsAccumulator( stamp_token=0, - gradient_shape=tensor_shape.scalar(), - hessian_shape=tensor_shape.scalar()) + gradient_shape=tensor_shape.TensorShape([]), + hessian_shape=tensor_shape.TensorShape([])) with ops.control_dependencies([accumulator.initializer]): op1 = accumulator.add( stamp_token=0, @@ -89,8 +89,8 @@ class StatsAccumulatorScalarTest(test_util.TensorFlowTestCase): with self.cached_session() as sess: accumulator = stats_accumulator_ops.StatsAccumulator( stamp_token=0, - gradient_shape=tensor_shape.scalar(), - hessian_shape=tensor_shape.scalar()) + gradient_shape=tensor_shape.TensorShape([]), + hessian_shape=tensor_shape.TensorShape([])) with ops.control_dependencies([accumulator.initializer]): op1 = accumulator.add( stamp_token=0, @@ -121,8 +121,8 @@ class StatsAccumulatorScalarTest(test_util.TensorFlowTestCase): with self.cached_session() as sess: accumulator = stats_accumulator_ops.StatsAccumulator( stamp_token=0, - gradient_shape=tensor_shape.scalar(), - hessian_shape=tensor_shape.scalar()) + gradient_shape=tensor_shape.TensorShape([]), + hessian_shape=tensor_shape.TensorShape([])) with ops.control_dependencies([accumulator.initializer]): op1 = accumulator.add( stamp_token=0, @@ -162,8 +162,8 @@ class StatsAccumulatorScalarTest(test_util.TensorFlowTestCase): with self.cached_session() as sess: accumulator = stats_accumulator_ops.StatsAccumulator( stamp_token=0, - gradient_shape=tensor_shape.scalar(), - hessian_shape=tensor_shape.scalar()) + gradient_shape=tensor_shape.TensorShape([]), + hessian_shape=tensor_shape.TensorShape([])) with ops.control_dependencies([accumulator.initializer]): # These will be deleted due to deserialize call. 
op1 = accumulator.add( @@ -199,8 +199,8 @@ class StatsAccumulatorScalarTest(test_util.TensorFlowTestCase): with self.cached_session() as sess: accumulator = stats_accumulator_ops.StatsAccumulator( stamp_token=0, - gradient_shape=tensor_shape.scalar(), - hessian_shape=tensor_shape.scalar()) + gradient_shape=tensor_shape.TensorShape([]), + hessian_shape=tensor_shape.TensorShape([])) partition, feature, grads, hessians = accumulator._make_summary( partition_ids=[1, 2, 1], feature_ids=[[2, 0], [3, 1], [2, 0]], diff --git a/tensorflow/contrib/boosted_trees/python/kernel_tests/training_ops_test.py b/tensorflow/contrib/boosted_trees/python/kernel_tests/training_ops_test.py index 86fd5770a03..74a51f4e4d8 100644 --- a/tensorflow/contrib/boosted_trees/python/kernel_tests/training_ops_test.py +++ b/tensorflow/contrib/boosted_trees/python/kernel_tests/training_ops_test.py @@ -142,7 +142,8 @@ def _gen_categorical_split_info(fc, feat_id, left_weight, right_weight): def _get_bias_update(grads, hess): - return array_ops.where(hess > 0, -grads / hess, array_ops.zeros_like(grads)) + return array_ops.where_v2(hess > 0, -grads / hess, + array_ops.zeros_like(grads)) class CenterTreeEnsembleBiasOpTest(test_util.TensorFlowTestCase): diff --git a/tensorflow/contrib/boosted_trees/python/ops/batch_ops_utils.py b/tensorflow/contrib/boosted_trees/python/ops/batch_ops_utils.py index 4dc764f9571..8083d8fac85 100644 --- a/tensorflow/contrib/boosted_trees/python/ops/batch_ops_utils.py +++ b/tensorflow/contrib/boosted_trees/python/ops/batch_ops_utils.py @@ -25,7 +25,6 @@ import six from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops -from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import array_ops @@ -65,7 +64,7 @@ def _move_tensors(tensors, device): # logic. 
zero = constant_op.constant(0, dtype=dtypes.int32) with ops.device(None): - if all(tensor.shape == tensor_shape.scalar() for tensor in tensors): + if all(tensor.shape.rank == 0 for tensor in tensors): with ops.device(tensors[0].device): values = array_ops.stack(tensors) with ops.device(device): diff --git a/tensorflow/contrib/boosted_trees/python/ops/stats_accumulator_ops.py b/tensorflow/contrib/boosted_trees/python/ops/stats_accumulator_ops.py index 1f6bbbf5740..62d0d0821b2 100644 --- a/tensorflow/contrib/boosted_trees/python/ops/stats_accumulator_ops.py +++ b/tensorflow/contrib/boosted_trees/python/ops/stats_accumulator_ops.py @@ -23,7 +23,6 @@ from tensorflow.contrib.boosted_trees.python.ops import boosted_trees_ops_loader # pylint: enable=unused-import from tensorflow.contrib.boosted_trees.python.ops import gen_stats_accumulator_ops from tensorflow.python.framework import ops -from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import resources from tensorflow.python.training import saver from tensorflow.python.training.tracking import tracking @@ -134,8 +133,7 @@ class StatsAccumulator(tracking.TrackableResource): self._hessian_shape = hessian_shape self._container = container - if (gradient_shape == tensor_shape.scalar() and - hessian_shape == tensor_shape.scalar()): + if (gradient_shape.rank == 0 and hessian_shape.rank == 0): self._is_scalar = True else: self._is_scalar = False diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py index 4a13da4b5be..ffad201cbf1 100644 --- a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py +++ b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py @@ -34,6 +34,7 @@ from tensorflow.contrib.boosted_trees.python.ops import training_ops from tensorflow.contrib.layers.python.layers import feature_column as feature_column_lib from tensorflow.contrib.layers.python.layers import feature_column_ops from tensorflow.python.feature_column import feature_column as fc_core +from tensorflow.python.feature_column import feature_column_v2 as fc_v2 from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops @@ -184,16 +185,20 @@ def extract_features(features, feature_columns, use_core_columns): # Make a shallow copy of features to ensure downstream usage # is unaffected by modifications in the model function. 
features = copy.copy(features) + # pylint: disable=protected-access + state_manager = fc_v2._StateManagerImpl(layer=None, trainable=False) if feature_columns: scope = "gbdt" with variable_scope.variable_scope(scope): feature_columns = list(feature_columns) transformed_features = collections.OrderedDict() for fc in feature_columns: - # pylint: disable=protected-access if use_core_columns: - # pylint: disable=protected-access - tensor = fc_core._transform_features(features, [fc])[fc] + if isinstance(fc, fc_v2.FeatureColumn): + tensor = fc_v2._transform_features_v2( + features, [fc], state_manager)[fc] + else: + tensor = fc_core._transform_features(features, [fc])[fc] transformed_features[fc.name] = tensor elif isinstance(fc, feature_column_lib._EmbeddingColumn): # pylint: enable=protected-access @@ -368,8 +373,8 @@ class GradientBoostedDecisionTreeModel(object): if logits_dimension == 1 or learner_config.multi_class_strategy == ( learner_pb2.LearnerConfig.TREE_PER_CLASS): - self._gradient_shape = tensor_shape.scalar() - self._hessian_shape = tensor_shape.scalar() + self._gradient_shape = tensor_shape.TensorShape([]) + self._hessian_shape = tensor_shape.TensorShape([]) else: if center_bias: raise ValueError("Center bias should be False for multiclass.") @@ -838,8 +843,8 @@ class GradientBoostedDecisionTreeModel(object): # Create steps accumulator. steps_accumulator = stats_accumulator_ops.StatsAccumulator( stamp_token=0, - gradient_shape=tensor_shape.scalar(), - hessian_shape=tensor_shape.scalar(), + gradient_shape=tensor_shape.TensorShape([]), + hessian_shape=tensor_shape.TensorShape([]), name="StepsAccumulator") # Create ensemble stats summaries. summary.scalar("layer_stats/num_examples", num_layer_examples) @@ -1212,7 +1217,7 @@ class GradientBoostedDecisionTreeModel(object): def _get_weights(self, hessian_shape, hessians): """Derives weights to be used based on hessians and multiclass strategy.""" - if hessian_shape == tensor_shape.scalar(): + if hessian_shape.rank == 0: # This is tree per class. 
weights = hessians elif len(hessian_shape.dims) == 1: diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py index 728b764898a..c9f37508677 100644 --- a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py +++ b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py @@ -29,6 +29,7 @@ from tensorflow.contrib.boosted_trees.python.utils import losses from tensorflow.contrib.layers.python.layers import feature_column as feature_column_lib from tensorflow.contrib.learn.python.learn.estimators import model_fn from tensorflow.python.feature_column import feature_column_lib as core_feature_column +from tensorflow.python.feature_column import feature_column_v2 from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor @@ -176,6 +177,38 @@ class GbdtTest(test_util.TensorFlowTestCase): self.assertAllEqual(sparse_int_shapes[0].eval(), features["sparse_categorical"].dense_shape.eval()) + def testExtractFeaturesFromV2FeatureColumns(self): + """Tests feature extraction when using v2 columns.""" + with self.cached_session(): + features = {} + features["dense_float"] = array_ops.zeros([2, 1], dtypes.float32) + features["sparse_categorical"] = sparse_tensor.SparseTensor( + array_ops.zeros([2, 2], dtypes.int64), + array_ops.zeros([2], dtypes.string), array_ops.zeros([2], + dtypes.int64)) + feature_columns = set() + feature_columns.add(feature_column_v2.numeric_column("dense_float")) + feature_columns.add( + feature_column_v2.categorical_column_with_hash_bucket( + "sparse_categorical", hash_bucket_size=1000000)) + (fc_names, dense_floats, _, _, _, sparse_int_indices, sparse_int_values, + sparse_int_shapes) = ( + gbdt_batch.extract_features( + features, feature_columns, use_core_columns=True)) + self.assertEqual(len(fc_names), 2) + self.assertAllEqual(fc_names, ["dense_float", "sparse_categorical"]) + self.assertEqual(len(dense_floats), 1) + self.assertEqual(len(sparse_int_indices), 1) + self.assertEqual(len(sparse_int_values), 1) + self.assertEqual(len(sparse_int_shapes), 1) + self.assertAllEqual(dense_floats[0].eval(), + features["dense_float"].eval()) + self.assertAllEqual(sparse_int_indices[0].eval(), + features["sparse_categorical"].indices.eval()) + self.assertAllEqual(sparse_int_values[0].eval(), [397263, 397263]) + self.assertAllEqual(sparse_int_shapes[0].eval(), + features["sparse_categorical"].dense_shape.eval()) + def testExtractFeaturesFromCoreFeatureColumns(self): """Tests feature extraction when using core columns.""" with self.cached_session(): diff --git a/tensorflow/contrib/cloud/kernels/BUILD b/tensorflow/contrib/cloud/kernels/BUILD index 152d8836df5..d7bbbc10a17 100644 --- a/tensorflow/contrib/cloud/kernels/BUILD +++ b/tensorflow/contrib/cloud/kernels/BUILD @@ -10,7 +10,7 @@ load( # For platform specific build config load( - "//tensorflow/core:platform/default/build_config.bzl", + "//tensorflow/core/platform:default/build_config.bzl", "tf_proto_library", ) diff --git a/tensorflow/contrib/cloud/kernels/bigquery_reader_ops.cc b/tensorflow/contrib/cloud/kernels/bigquery_reader_ops.cc index b0f9237ea27..ae6402b391e 100644 --- a/tensorflow/contrib/cloud/kernels/bigquery_reader_ops.cc +++ b/tensorflow/contrib/cloud/kernels/bigquery_reader_ops.cc @@ -66,7 +66,7 @@ class BigQueryReader : public ReaderBase { return Status::OK(); } - Status 
ReadLocked(string* key, string* value, bool* produced, + Status ReadLocked(tstring* key, tstring* value, bool* produced, bool* at_end) override { *at_end = false; *produced = false; @@ -153,7 +153,7 @@ class GenerateBigQueryReaderPartitionsOp : public OpKernel { context->allocate_output(0, TensorShape({num_partitions_}), &output_tensor)); - auto output = output_tensor->template flat(); + auto output = output_tensor->template flat(); for (int64 i = 0; i < num_partitions_; ++i) { BigQueryTablePartition partition; partition.set_start_index(i * partition_size); diff --git a/tensorflow/contrib/cloud/kernels/gcs_config_ops.cc b/tensorflow/contrib/cloud/kernels/gcs_config_ops.cc index 648a219fb87..04571348272 100644 --- a/tensorflow/contrib/cloud/kernels/gcs_config_ops.cc +++ b/tensorflow/contrib/cloud/kernels/gcs_config_ops.cc @@ -83,8 +83,9 @@ class GcsCredentialsOpKernel : public OpKernel { RetryingGcsFileSystem* gcs = nullptr; OP_REQUIRES_OK(ctx, RetrieveGcsFs(ctx, &gcs)); - string json_string; - OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "json", &json_string)); + tstring json_string; + OP_REQUIRES_OK(ctx, + ParseScalarArgument(ctx, "json", &json_string)); Json::Value json; Json::Reader reader; @@ -179,13 +180,13 @@ class GcsBlockCacheOpKernel : public OpKernel { RetryingGcsFileSystem* gcs = nullptr; OP_REQUIRES_OK(ctx, RetrieveGcsFs(ctx, &gcs)); - size_t max_cache_size, block_size, max_staleness; - OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "max_cache_size", + uint64 max_cache_size, block_size, max_staleness; + OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "max_cache_size", &max_cache_size)); OP_REQUIRES_OK(ctx, - ParseScalarArgument(ctx, "block_size", &block_size)); + ParseScalarArgument(ctx, "block_size", &block_size)); OP_REQUIRES_OK( - ctx, ParseScalarArgument(ctx, "max_staleness", &max_staleness)); + ctx, ParseScalarArgument(ctx, "max_staleness", &max_staleness)); if (gcs->underlying()->block_size() == block_size && gcs->underlying()->max_bytes() == max_cache_size && diff --git a/tensorflow/contrib/cmake/external/nsync.cmake b/tensorflow/contrib/cmake/external/nsync.cmake index b15143bfc1c..2926889301a 100644 --- a/tensorflow/contrib/cmake/external/nsync.cmake +++ b/tensorflow/contrib/cmake/external/nsync.cmake @@ -16,7 +16,7 @@ include (ExternalProject) set(nsync_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/external/nsync/public) set(nsync_URL https://github.com/google/nsync) -set(nsync_TAG 1.20.2) +set(nsync_TAG 1.22.0) set(nsync_BUILD ${CMAKE_CURRENT_BINARY_DIR}/nsync/src/nsync) set(nsync_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/nsync/install) diff --git a/tensorflow/contrib/cmake/python_modules.txt b/tensorflow/contrib/cmake/python_modules.txt index ee0f1f02835..ae6f77238c5 100644 --- a/tensorflow/contrib/cmake/python_modules.txt +++ b/tensorflow/contrib/cmake/python_modules.txt @@ -172,16 +172,6 @@ tensorflow/contrib/fused_conv tensorflow/contrib/fused_conv/kernels tensorflow/contrib/fused_conv/python tensorflow/contrib/fused_conv/python/ops -tensorflow/contrib/gan -tensorflow/contrib/gan/python -tensorflow/contrib/gan/python/estimator -tensorflow/contrib/gan/python/estimator/python -tensorflow/contrib/gan/python/eval -tensorflow/contrib/gan/python/eval/python -tensorflow/contrib/gan/python/features -tensorflow/contrib/gan/python/features/python -tensorflow/contrib/gan/python/losses -tensorflow/contrib/gan/python/losses/python tensorflow/contrib/graph_editor tensorflow/contrib/graph_editor/examples tensorflow/contrib/grid_rnn diff --git 
a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py index be66fac66b8..5831781c2ac 100644 --- a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py +++ b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py @@ -18,7 +18,6 @@ from __future__ import division from __future__ import print_function import argparse -import collections import functools import itertools import os @@ -59,6 +58,7 @@ from tensorflow.python.training import momentum from tensorflow.python.training import rmsprop from tensorflow.python.training import saver as saver_lib from tensorflow.python.training.tracking import util as trackable_utils +from tensorflow.python.util.compat import collections_abc CUDNN_LSTM = cudnn_rnn_ops.CUDNN_LSTM @@ -1131,7 +1131,7 @@ class CudnnRNNTestTraining(test_util.TensorFlowTestCase): return numeric_grad.reshape(x_shape) def _GetShape(self, sess, inputs): - if not isinstance(inputs, collections.Iterable): + if not isinstance(inputs, collections_abc.Iterable): return sess.run(array_ops.shape(inputs)) else: return sess.run([array_ops.shape(x) for x in inputs]) diff --git a/tensorflow/contrib/data/python/kernel_tests/lmdb_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/lmdb_dataset_op_test.py index 220f9934b67..d5bcdebf81a 100644 --- a/tensorflow/contrib/data/python/kernel_tests/lmdb_dataset_op_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/lmdb_dataset_op_test.py @@ -20,6 +20,7 @@ from __future__ import print_function import os import shutil +import sys from tensorflow.contrib.data.python.ops import readers from tensorflow.python.data.kernel_tests import test_base @@ -40,7 +41,10 @@ class LMDBDatasetTest(test_base.DatasetTestBase): def setUp(self): super(LMDBDatasetTest, self).setUp() # Copy database out because we need the path to be writable to use locks. - path = os.path.join(prefix_path, "lmdb", "testdata", "data.mdb") + # The on-disk format of an LMDB database is different on big-endian + # machines, because LMDB is a memory-mapped database. + db_file = "data.mdb" if sys.byteorder == "little" else "data_bigendian.mdb" + path = os.path.join(prefix_path, "lmdb", "testdata", db_file) self.db_path = os.path.join(self.get_temp_dir(), "data.mdb") shutil.copy(path, self.db_path) diff --git a/tensorflow/contrib/data/python/ops/readers.py b/tensorflow/contrib/data/python/ops/readers.py index d51fa2e0c5c..92d4820d60a 100644 --- a/tensorflow/contrib/data/python/ops/readers.py +++ b/tensorflow/contrib/data/python/ops/readers.py @@ -45,8 +45,8 @@ def make_csv_dataset( shuffle=True, shuffle_buffer_size=10000, shuffle_seed=None, - prefetch_buffer_size=dataset_ops.AUTOTUNE, - num_parallel_reads=1, + prefetch_buffer_size=None, + num_parallel_reads=None, sloppy=False, num_rows_for_inference=100, compression_type=None, @@ -112,7 +112,7 @@ def make_csv_dataset( batches to prefetch for performance improvement. Recommended value is the number of batches consumed per training step. Defaults to auto-tune. num_parallel_reads: Number of threads used to read CSV records from files. - If >1, the results will be interleaved. + If >1, the results will be interleaved. Defaults to `1`. sloppy: If `True`, reading performance will be improved at the cost of non-deterministic ordering. 
If `False`, the order of elements produced is deterministic prior to shuffling (elements are still @@ -173,9 +173,9 @@ def make_batched_features_dataset(file_pattern, shuffle=True, shuffle_buffer_size=10000, shuffle_seed=None, - prefetch_buffer_size=dataset_ops.AUTOTUNE, - reader_num_threads=1, - parser_num_threads=2, + prefetch_buffer_size=None, + reader_num_threads=None, + parser_num_threads=None, sloppy_ordering=False, drop_final_batch=False): """Returns a `Dataset` of feature dictionaries from `Example` protos. @@ -248,9 +248,9 @@ def make_batched_features_dataset(file_pattern, improve performance. Recommended value is the number of batches consumed per training step. Defaults to auto-tune. reader_num_threads: Number of threads used to read `Example` records. If >1, - the results will be interleaved. + the results will be interleaved. Defaults to `1`. parser_num_threads: Number of threads to use for parsing `Example` tensors - into a dictionary of `Feature` tensors. + into a dictionary of `Feature` tensors. Defaults to `2`. sloppy_ordering: If `True`, reading performance will be improved at the cost of non-deterministic ordering. If `False`, the order of elements produced is deterministic prior to shuffling (elements are still diff --git a/tensorflow/contrib/decision_trees/proto/BUILD b/tensorflow/contrib/decision_trees/proto/BUILD index a0b2ca59d7b..ebbb9b3c052 100644 --- a/tensorflow/contrib/decision_trees/proto/BUILD +++ b/tensorflow/contrib/decision_trees/proto/BUILD @@ -1,5 +1,5 @@ load( - "//tensorflow/core:platform/default/build_config.bzl", + "//tensorflow/core/platform:default/build_config.bzl", "tf_proto_library", "tf_pyclif_proto_library", ) diff --git a/tensorflow/contrib/distribute/python/BUILD b/tensorflow/contrib/distribute/python/BUILD index 8730dd45f3a..926797bebf1 100644 --- a/tensorflow/contrib/distribute/python/BUILD +++ b/tensorflow/contrib/distribute/python/BUILD @@ -1,7 +1,7 @@ # Implementation of a prototype TF distributed computation library. load("//tensorflow/compiler/tests:build_defs.bzl", "tf_xla_py_test") -load("//tensorflow/core:platform/default/distribute.bzl", "distribute_py_test") +load("//tensorflow/core/platform:default/distribute.bzl", "distribute_py_test") load("//tensorflow:tensorflow.bzl", "cuda_py_test") package( @@ -206,6 +206,7 @@ cuda_py_test( ], tags = [ "multi_and_single_gpu", + "noguitar", # b/139307796 ], ) @@ -273,6 +274,7 @@ distribute_py_test( "no_windows_gpu", "notsan", ], + xla_enable_strict_auto_jit = False, # Ignoring due to in contrib. 
deps = [ ":mirrored_strategy", "//tensorflow/python/distribute:tpu_strategy", diff --git a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py index 6dda497459f..1f527340d8d 100644 --- a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py +++ b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py @@ -32,11 +32,9 @@ from tensorflow.python.distribute import cross_device_ops as cross_device_ops_li from tensorflow.python.distribute import cross_device_utils from tensorflow.python.distribute import distribute_lib from tensorflow.python.distribute import multi_worker_test_base -from tensorflow.python.distribute import multi_worker_util from tensorflow.python.distribute import reduce_util from tensorflow.python.distribute import strategy_test_lib from tensorflow.python.distribute import values -from tensorflow.python.distribute.cluster_resolver import SimpleClusterResolver from tensorflow.python.eager import context from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes @@ -54,7 +52,6 @@ from tensorflow.python.ops.losses import losses from tensorflow.python.platform import test from tensorflow.python.training import adam from tensorflow.python.training import training_util -from tensorflow.python.training.server_lib import ClusterSpec class MockCollectiveAllReduceStrategy(distribute_lib.StrategyV1): @@ -71,38 +68,22 @@ class MockCollectiveAllReduceStrategy(distribute_lib.StrategyV1): def create_test_objects(cluster_spec=None, task_type=None, task_id=None, - num_gpus=None, - use_core_strategy=False): + num_gpus=None): sess_config = config_pb2.ConfigProto() if num_gpus is None: num_gpus = context.num_gpus() - if use_core_strategy: - if cluster_spec and task_type and task_id is not None: - cluster_resolver = SimpleClusterResolver( - cluster_spec=multi_worker_util.normalize_cluster_spec(cluster_spec), - task_type=task_type, - task_id=task_id, - num_accelerators={'GPU': num_gpus}) - target = 'grpc://' + cluster_spec[task_type][task_id] - else: - cluster_resolver = SimpleClusterResolver( - ClusterSpec({}), num_accelerators={'GPU': num_gpus}) - target = '' - strategy = MockCollectiveAllReduceStrategy(cluster_resolver) - sess_config = strategy.update_config_proto(sess_config) + strategy = collective_all_reduce_strategy.CollectiveAllReduceStrategy( + num_gpus_per_worker=num_gpus) + if task_type and task_id is not None: + strategy.configure( + session_config=sess_config, + cluster_spec=cluster_spec, + task_type=task_type, + task_id=task_id) + target = 'grpc://' + cluster_spec[task_type][task_id] else: - strategy = collective_all_reduce_strategy.CollectiveAllReduceStrategy( - num_gpus_per_worker=num_gpus) - if task_type and task_id is not None: - strategy.configure( - session_config=sess_config, - cluster_spec=cluster_spec, - task_type=task_type, - task_id=task_id) - target = 'grpc://' + cluster_spec[task_type][task_id] - else: - target = '' + target = '' return strategy, target, sess_config @@ -120,17 +101,12 @@ class CollectiveAllReduceStrategyTestBase( CollectiveAllReduceStrategyTestBase.collective_key_base += 100000 super(CollectiveAllReduceStrategyTestBase, self).setUp() - def _get_test_object(self, - task_type, - task_id, - num_gpus=0, - use_core_strategy=False): + def _get_test_object(self, task_type, task_id, num_gpus=0): strategy, target, session_config = create_test_objects( cluster_spec=self._cluster_spec, 
task_type=task_type, task_id=task_id, - num_gpus=num_gpus, - use_core_strategy=use_core_strategy) + num_gpus=num_gpus) collective_keys = cross_device_utils.CollectiveKeys( group_key_start=10 + @@ -144,11 +120,7 @@ class CollectiveAllReduceStrategyTestBase( return strategy, target, session_config - def _test_minimize_loss_graph(self, - task_type, - task_id, - num_gpus, - use_core_strategy=False): + def _test_minimize_loss_graph(self, task_type, task_id, num_gpus): d, master_target, config = self._get_test_object(task_type, task_id, num_gpus) with ops.Graph().as_default(), \ @@ -215,11 +187,7 @@ class CollectiveAllReduceStrategyTestBase( # Error should go down self.assertLess(error_after, error_before) - def _test_complex_model(self, - task_type, - task_id, - num_gpus, - use_core_strategy=False): + def _test_complex_model(self, task_type, task_id, num_gpus): d, master_target, config = self._get_test_object(task_type, task_id, num_gpus) @@ -270,11 +238,7 @@ class CollectiveAllReduceStrategyTestBase( sess.run(variables.global_variables_initializer()) sess.run(train_op) - def _test_variable_initialization(self, - task_type, - task_id, - num_gpus, - use_core_strategy=False): + def _test_variable_initialization(self, task_type, task_id, num_gpus): distribution, master_target, config = self._get_test_object( task_type, task_id, num_gpus) with ops.Graph().as_default(), \ @@ -309,8 +273,7 @@ class CollectiveAllReduceStrategyTestBase( input_fn, expected_values, test_reinitialize=True, - ignore_order=False, - use_core_strategy=False): + ignore_order=False): distribution, master_target, config = self._get_test_object( task_type, task_id, num_gpus) devices = distribution.extended.worker_devices @@ -360,62 +323,41 @@ class DistributedCollectiveAllReduceStrategyTest( cls._cluster_spec = multi_worker_test_base.create_in_process_cluster( num_workers=3, num_ps=0) - @combinations.generate( - combinations.combine(mode=['graph'], use_core_strategy=[True, False])) - def test_num_replicas_in_sync(self, use_core_strategy): + @combinations.generate(combinations.combine(mode=['graph'])) + def test_num_replicas_in_sync(self): distribution, _, _ = create_test_objects( cluster_spec=self._cluster_spec, task_type='worker', task_id=0, - num_gpus=2, - use_core_strategy=use_core_strategy) + num_gpus=2) num_workers = len(self._cluster_spec.get('chief', []) + self._cluster_spec.get('worker', [])) self.assertEqual(2 * num_workers, distribution.num_replicas_in_sync) @combinations.generate( - combinations.combine( - mode=['graph'], - num_gpus=[0, 1, 2], - required_gpus=1, - use_core_strategy=[True, False])) - def testMinimizeLossGraph(self, num_gpus, use_core_strategy): - self._run_between_graph_clients( - self._test_minimize_loss_graph, - self._cluster_spec, - num_gpus, - use_core_strategy=use_core_strategy) + combinations.combine(mode=['graph'], num_gpus=[0, 1, 2], required_gpus=1)) + def testMinimizeLossGraph(self, num_gpus): + self._run_between_graph_clients(self._test_minimize_loss_graph, + self._cluster_spec, num_gpus) @combinations.generate( - combinations.combine( - mode=['graph'], - num_gpus=[0, 1, 2], - required_gpus=1, - use_core_strategy=[True, False])) - def testVariableInitialization(self, num_gpus, use_core_strategy): + combinations.combine(mode=['graph'], num_gpus=[0, 1, 2], required_gpus=1)) + def testVariableInitialization(self, num_gpus): if context.num_gpus() < num_gpus: self.skipTest('Not enough GPUs') self._run_between_graph_clients( self._test_variable_initialization, self._cluster_spec, - 
num_gpus=num_gpus, - use_core_strategy=use_core_strategy) + num_gpus=num_gpus) @combinations.generate( - combinations.combine( - mode=['graph'], - num_gpus=[0, 1, 2], - required_gpus=1, - use_core_strategy=[True, False])) - def testComplexModel(self, num_gpus, use_core_strategy): + combinations.combine(mode=['graph'], num_gpus=[0, 1, 2], required_gpus=1)) + def testComplexModel(self, num_gpus): if context.num_gpus() < num_gpus: self.skipTest('Not enough GPUs') self._run_between_graph_clients( - self._test_complex_model, - self._cluster_spec, - num_gpus=num_gpus, - use_core_strategy=use_core_strategy) + self._test_complex_model, self._cluster_spec, num_gpus=num_gpus) # TODO(yuefengz): Update how we use num_gpus and required_gpus @combinations.generate( @@ -423,9 +365,8 @@ class DistributedCollectiveAllReduceStrategyTest( mode=['graph'], num_gpus=[0, 1, 2], required_gpus=1, - use_dataset=[True, False], - use_core_strategy=[True, False])) - def testMakeInputFnIterator(self, num_gpus, use_dataset, use_core_strategy): + use_dataset=[True, False])) + def testMakeInputFnIterator(self, num_gpus, use_dataset): if context.num_gpus() < num_gpus: self.skipTest('Not enough GPUs') if use_dataset: @@ -452,17 +393,12 @@ class DistributedCollectiveAllReduceStrategyTest( input_fn, expected_values, test_reinitialize=use_dataset, - ignore_order=not use_dataset, - use_core_strategy=use_core_strategy) + ignore_order=not use_dataset) - @combinations.generate( - combinations.combine(mode=['graph'], use_core_strategy=[True, False])) - def testUpdateConfigProto(self, use_core_strategy): + @combinations.generate(combinations.combine(mode=['graph'])) + def testUpdateConfigProto(self): strategy, _, _ = self._get_test_object( - task_type='worker', - task_id=1, - num_gpus=2, - use_core_strategy=use_core_strategy) + task_type='worker', task_id=1, num_gpus=2) config_proto = config_pb2.ConfigProto(device_filters=['to_be_overridden']) rewrite_options = config_proto.graph_options.rewrite_options @@ -484,29 +420,6 @@ class DistributedCollectiveAllReduceStrategyTest( self.assertEqual(['CollectiveReduce'], new_rewrite_options.scoped_allocator_opts.enable_op) - @combinations.generate(combinations.combine(mode=['eager'])) - def testEnableCollectiveOps(self): - mock_called = [False] - - # pylint: disable=dangerous-default-value - def mock_enable_collective_ops(server_def, mock_called=mock_called): - self.assertEqual('worker', server_def.job_name) - self.assertEqual(1, server_def.task_index) - self.assertEqual('grpc', server_def.protocol) - mock_called[0] = True - - def mock_configure_collective_ops(*args, **kwargs): - del args, kwargs - - with test.mock.patch.object(context.context(), 'enable_collective_ops', - mock_enable_collective_ops), \ - test.mock.patch.object(context.context(), 'configure_collective_ops', - mock_configure_collective_ops): - strategy, _, _ = self._get_test_object( - task_type='worker', task_id=1, num_gpus=2, use_core_strategy=True) - self.assertTrue(strategy.extended._std_server_started) - self.assertTrue(mock_called[0]) - class DistributedCollectiveAllReduceStrategyTestWithChief( CollectiveAllReduceStrategyTestBase, parameterized.TestCase): @@ -550,41 +463,28 @@ class LocalCollectiveAllReduceStrategy( @combinations.generate( combinations.combine( - mode=['graph', 'eager'], - num_gpus=[2, 4], - required_gpus=2, - use_core_strategy=[True, False])) - def testMinimizeLoss(self, num_gpus, use_core_strategy): + mode=['graph', 'eager'], num_gpus=[2, 4], required_gpus=2)) + def testMinimizeLoss(self, num_gpus): # 
Collective ops doesn't support strategy with one device. if context.num_gpus() < num_gpus: self.skipTest('Not enough GPUs') if context.executing_eagerly(): - strategy, _, _ = self._get_test_object( - None, None, num_gpus, use_core_strategy=use_core_strategy) + strategy, _, _ = self._get_test_object(None, None, num_gpus) self._test_minimize_loss_eager(strategy) else: - self._test_minimize_loss_graph( - None, None, num_gpus, use_core_strategy=use_core_strategy) + self._test_minimize_loss_graph(None, None, num_gpus) @combinations.generate( - combinations.combine( - mode=['graph'], - num_gpus=[2, 4], - required_gpus=2, - use_core_strategy=[True, False])) - def testComplexModel(self, num_gpus, use_core_strategy): + combinations.combine(mode=['graph'], num_gpus=[2, 4], required_gpus=2)) + def testComplexModel(self, num_gpus): if context.num_gpus() < num_gpus: self.skipTest('Not enough GPUs') - self._test_complex_model( - None, None, num_gpus, use_core_strategy=use_core_strategy) + self._test_complex_model(None, None, num_gpus) @combinations.generate( combinations.combine( - mode=['graph', 'eager'], - required_gpus=2, - use_dataset=[True, False], - use_core_strategy=[True, False])) - def testMakeInputFnIterator(self, use_dataset, use_core_strategy): + mode=['graph', 'eager'], required_gpus=2, use_dataset=[True, False])) + def testMakeInputFnIterator(self, use_dataset): num_gpus = 2 if use_dataset: fn = lambda: dataset_ops.Dataset.range(5 * num_gpus) @@ -607,71 +507,56 @@ class LocalCollectiveAllReduceStrategy( input_fn, expected_values, test_reinitialize=use_dataset, - ignore_order=not use_dataset, - use_core_strategy=use_core_strategy) + ignore_order=not use_dataset) - @combinations.generate( - combinations.combine(mode=['graph'], use_core_strategy=[True, False])) - def testAllReduceSum(self, use_core_strategy): + @combinations.generate(combinations.combine(mode=['graph'])) + def testAllReduceSum(self): if context.num_gpus() < 2: self.skipTest('Not enough GPUs') - distribution, target, config = self._get_test_object( - None, None, num_gpus=2, use_core_strategy=use_core_strategy) + distribution, target, config = self._get_test_object(None, None, num_gpus=2) with self.cached_session(config=config, target=target): self._test_all_reduce_sum(distribution) - @combinations.generate( - combinations.combine(mode=['graph'], use_core_strategy=[True, False])) - def testAllReduceSumGradients(self, use_core_strategy): + @combinations.generate(combinations.combine(mode=['graph'])) + def testAllReduceSumGradients(self): if context.num_gpus() < 2: self.skipTest('Not enough GPUs') - distribution, target, config = self._get_test_object( - None, None, num_gpus=2, use_core_strategy=use_core_strategy) + distribution, target, config = self._get_test_object(None, None, num_gpus=2) with self.cached_session(config=config, target=target): self._test_all_reduce_sum_gradients(distribution) - @combinations.generate( - combinations.combine(mode=['graph'], use_core_strategy=[True, False])) - def testAllReduceSumGradientTape(self, use_core_strategy): + @combinations.generate(combinations.combine(mode=['graph'])) + def testAllReduceSumGradientTape(self): if context.num_gpus() < 2: self.skipTest('Not enough GPUs') - distribution, target, config = self._get_test_object( - None, None, num_gpus=2, use_core_strategy=use_core_strategy) + distribution, target, config = self._get_test_object(None, None, num_gpus=2) with self.cached_session(config=config, target=target): self._test_all_reduce_sum_gradient_tape(distribution) - 
@combinations.generate( - combinations.combine(mode=['graph'], use_core_strategy=[True, False])) - def testAllReduceMean(self, use_core_strategy): + @combinations.generate(combinations.combine(mode=['graph'])) + def testAllReduceMean(self): if context.num_gpus() < 2: self.skipTest('Not enough GPUs') - distribution, target, config = self._get_test_object( - None, None, num_gpus=2, use_core_strategy=use_core_strategy) + distribution, target, config = self._get_test_object(None, None, num_gpus=2) with self.cached_session(config=config, target=target): self._test_all_reduce_mean(distribution) - @combinations.generate( - combinations.combine(mode=['graph'], use_core_strategy=[True, False])) - def testAllReduceMeanGradients(self, use_core_strategy): + @combinations.generate(combinations.combine(mode=['graph'])) + def testAllReduceMeanGradients(self): if context.num_gpus() < 2: self.skipTest('Not enough GPUs') - distribution, target, config = self._get_test_object( - None, None, num_gpus=2, use_core_strategy=use_core_strategy) + distribution, target, config = self._get_test_object(None, None, num_gpus=2) with self.cached_session(config=config, target=target): self._test_all_reduce_mean_gradients(distribution) - @combinations.generate( - combinations.combine(mode=['graph'], use_core_strategy=[True, False])) - def testAllReduceMeanGradientTape(self, use_core_strategy): + @combinations.generate(combinations.combine(mode=['graph'])) + def testAllReduceMeanGradientTape(self): if context.num_gpus() < 2: self.skipTest('Not enough GPUs') - distribution, target, config = self._get_test_object( - None, None, num_gpus=2, use_core_strategy=use_core_strategy) + distribution, target, config = self._get_test_object(None, None, num_gpus=2) with self.cached_session(config=config, target=target): self._test_all_reduce_mean_gradient_tape(distribution) - @combinations.generate( - combinations.combine(mode=['graph'], use_core_strategy=[True, False])) - def testNumpyIterator(self, use_core_strategy): + @combinations.generate(combinations.combine(mode=['graph'])) + def testNumpyIterator(self): num_gpus = 2 if context.num_gpus() < num_gpus: self.skipTest('Not enough GPUs') - strategy, _, _ = self._get_test_object( - None, None, num_gpus=num_gpus, use_core_strategy=use_core_strategy) + strategy, _, _ = self._get_test_object(None, None, num_gpus=num_gpus) self._test_numpy_iterator(strategy) diff --git a/tensorflow/contrib/distribute/python/keras_backward_compat_test.py b/tensorflow/contrib/distribute/python/keras_backward_compat_test.py index c97f93371bf..98195cca3c3 100644 --- a/tensorflow/contrib/distribute/python/keras_backward_compat_test.py +++ b/tensorflow/contrib/distribute/python/keras_backward_compat_test.py @@ -369,7 +369,12 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase, optimizer = gradient_descent.GradientDescentOptimizer(0.001) loss = 'mse' metrics = ['mae'] - model.compile(optimizer, loss, metrics=metrics, distribute=distribution) + model.compile( + optimizer, + loss, + metrics=metrics, + distribute=distribution, + experimental_run_tf_function=False) inputs = np.zeros((64, 3), dtype=np.float32) targets = np.zeros((64, 4), dtype=np.float32) @@ -399,7 +404,11 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase, optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.001) loss = 'mse' - model.compile(optimizer, loss, distribute=distribution) + model.compile( + optimizer, + loss, + distribute=distribution, + experimental_run_tf_function=False) input_a_np = 
np.asarray(np.random.random((64, 3)), dtype=np.float32) input_b_np = np.asarray(np.random.random((64, 5)), dtype=np.float32) @@ -432,7 +441,11 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase, model = get_model() optimizer = rmsprop.RMSPropOptimizer(learning_rate=0.001) loss = 'mse' - model.compile(optimizer, loss, distribute=distribution) + model.compile( + optimizer, + loss, + distribute=distribution, + experimental_run_tf_function=False) inputs = np.zeros((20, 3), np.float32) targets = np.zeros((20, 4), np.float32) @@ -448,7 +461,11 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase, optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.001) loss = 'mse' - model.compile(optimizer, loss, distribute=distribution) + model.compile( + optimizer, + loss, + distribute=distribution, + experimental_run_tf_function=False) # We take 6 input samples with each input having a dimension of 3 or 5. input_a_np = np.asarray(np.random.random((6, 3)), dtype=np.float32) @@ -478,7 +495,12 @@ class TestDistributionStrategyWithDatasets(test.TestCase, optimizer = gradient_descent.GradientDescentOptimizer(0.001) loss = 'mse' metrics = ['mae', keras.metrics.CategoricalAccuracy()] - model.compile(optimizer, loss, metrics=metrics, distribute=distribution) + model.compile( + optimizer, + loss, + metrics=metrics, + distribute=distribution, + experimental_run_tf_function=False) dataset = get_dataset(distribution) @@ -497,7 +519,8 @@ class TestDistributionStrategyWithDatasets(test.TestCase, gradient_descent.GradientDescentOptimizer(0.001), loss='mse', metrics=['mae', keras.metrics.CategoricalAccuracy()], - distribute=distribution) + distribute=distribution, + experimental_run_tf_function=False) interleaved_model = get_model() interleaved_model.set_weights(user_controlled_model.get_weights()) @@ -505,7 +528,8 @@ class TestDistributionStrategyWithDatasets(test.TestCase, gradient_descent.GradientDescentOptimizer(0.001), loss='mse', metrics=['mae', keras.metrics.CategoricalAccuracy()], - distribute=distribution) + distribute=distribution, + experimental_run_tf_function=False) dataset = get_dataset(distribution) @@ -546,7 +570,12 @@ class TestDistributionStrategyWithDatasets(test.TestCase, optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.001) loss = 'mse' metrics = ['mae', keras.metrics.CategoricalAccuracy()] - model.compile(optimizer, loss, metrics=metrics, distribute=distribution) + model.compile( + optimizer, + loss, + metrics=metrics, + distribute=distribution, + experimental_run_tf_function=False) input_a_np = np.random.random((10, 3)) input_b_np = np.random.random((10, 5)) @@ -578,7 +607,12 @@ class TestDistributionStrategyWithDatasets(test.TestCase, optimizer = gradient_descent.GradientDescentOptimizer(0.001) loss = 'mse' metrics = ['mae', keras.metrics.CategoricalAccuracy()] - model.compile(optimizer, loss, metrics=metrics, distribute=distribution) + model.compile( + optimizer, + loss, + metrics=metrics, + distribute=distribution, + experimental_run_tf_function=False) dataset = get_dataset(distribution) @@ -592,7 +626,11 @@ class TestDistributionStrategyWithDatasets(test.TestCase, model = get_model() loss = 'mse' - model.compile(optimizer(), loss, distribute=distribution) + model.compile( + optimizer(), + loss, + distribute=distribution, + experimental_run_tf_function=False) dataset = get_dataset(distribution) @@ -605,7 +643,11 @@ class TestDistributionStrategyWithDatasets(test.TestCase, model = get_model() optimizer = 
rmsprop.RMSPropOptimizer(learning_rate=0.001) loss = 'mse' - model.compile(optimizer, loss, distribute=distribution) + model.compile( + optimizer, + loss, + distribute=distribution, + experimental_run_tf_function=False) inputs = np.zeros((10, 3), np.float32) targets = np.zeros((10, 4), np.float32) @@ -633,7 +675,11 @@ class TestDistributionStrategyWithDatasets(test.TestCase, optimizer = rmsprop.RMSPropOptimizer(learning_rate=0.001) loss = 'mse' - model.compile(optimizer, loss, distribute=distribution) + model.compile( + optimizer, + loss, + distribute=distribution, + experimental_run_tf_function=False) # Wrong input shape inputs = np.zeros((10, 5), dtype=np.float32) @@ -660,7 +706,11 @@ class TestDistributionStrategyWithDatasets(test.TestCase, optimizer = rmsprop.RMSPropOptimizer(learning_rate=0.001) loss = 'mse' - model.compile(optimizer, loss, distribute=distribution) + model.compile( + optimizer, + loss, + distribute=distribution, + experimental_run_tf_function=False) # User forgets to batch the dataset inputs = np.zeros((10, 3), dtype=np.float32) @@ -692,7 +742,12 @@ class TestDistributionStrategyWithDatasets(test.TestCase, optimizer = gradient_descent.GradientDescentOptimizer(0.005) loss = 'mse' metrics = ['acc'] - model.compile(optimizer, loss, metrics=metrics, distribute=distribution) + model.compile( + optimizer, + loss, + metrics=metrics, + distribute=distribution, + experimental_run_tf_function=False) batch_size = 8 if isinstance(distribution, mirrored_strategy.CoreMirroredStrategy): @@ -727,7 +782,11 @@ class TestDistributionStrategyWithDatasets(test.TestCase, optimizer = gradient_descent_keras.SGD(0.01) loss = 'mse' - model.compile(optimizer, loss, distribute=distribution) + model.compile( + optimizer, + loss, + distribute=distribution, + experimental_run_tf_function=False) dataset = get_dataset(distribution) @@ -761,7 +820,12 @@ class TestDistributionStrategyErrorCases(test.TestCase, parameterized.TestCase): optimizer = gradient_descent.GradientDescentOptimizer(0.001) loss = 'mse' metrics = ['mae'] - model.compile(optimizer, loss, metrics=metrics, distribute=distribution) + model.compile( + optimizer, + loss, + metrics=metrics, + distribute=distribution, + experimental_run_tf_function=False) dataset = get_dataset(distribution) @@ -816,7 +880,12 @@ class TestDistributionStrategyErrorCases(test.TestCase, parameterized.TestCase): optimizer = gradient_descent.GradientDescentOptimizer(0.001) loss = 'mse' metrics = ['mae'] - model.compile(optimizer, loss, metrics=metrics, distribute=distribution) + model.compile( + optimizer, + loss, + metrics=metrics, + distribute=distribution, + experimental_run_tf_function=False) dataset = get_dataset(distribution) @@ -856,9 +925,11 @@ class TestDistributionStrategyWithLossMasking(test.TestCase, model.add( keras.layers.TimeDistributed( keras.layers.Dense(1, kernel_initializer='one'))) - model.compile(loss='mse', - optimizer=gradient_descent.GradientDescentOptimizer(0.01), - distribute=distribution) + model.compile( + loss='mse', + optimizer=gradient_descent.GradientDescentOptimizer(0.01), + distribute=distribution, + experimental_run_tf_function=False) y = np.array([[[1], [1]], [[1], [1]]]) dataset = dataset_ops.Dataset.from_tensor_slices((x, y)) dataset = dataset.repeat(100) @@ -877,9 +948,11 @@ class TestDistributionStrategyWithNormalizationLayer( model = keras.models.Sequential() norm = keras.layers.BatchNormalization(input_shape=(10,), momentum=0.8) model.add(norm) - model.compile(loss='mse', - 
optimizer=gradient_descent.GradientDescentOptimizer(0.01), - distribute=distribution) + model.compile( + loss='mse', + optimizer=gradient_descent.GradientDescentOptimizer(0.01), + distribute=distribution, + experimental_run_tf_function=False) # centered on 5.0, variance 10.0 x = np.random.normal(loc=5.0, scale=10.0, size=(1000, 10)) @@ -924,7 +997,8 @@ class TestDistributionStrategyCorrectness(test.TestCase, loss=keras.losses.mean_squared_error, optimizer=gradient_descent.GradientDescentOptimizer(0.5), metrics=[keras.metrics.BinaryAccuracy()], - distribute=distribution) + distribute=distribution, + experimental_run_tf_function=False) batch_size = 64 if not distributed_training_utils.global_batch_size_supported( @@ -950,7 +1024,8 @@ class TestDistributionStrategyCorrectness(test.TestCase, loss='mae', metrics=['accuracy', keras.metrics.BinaryAccuracy()], optimizer=gradient_descent.GradientDescentOptimizer(0.001), - distribute=distribution) + distribute=distribution, + experimental_run_tf_function=False) # verify correctness of stateful and stateless metrics. x = np.ones((100, 4)).astype('float32') @@ -1026,7 +1101,8 @@ class TestDistributionStrategyCorrectness(test.TestCase, loss=keras.losses.mean_squared_error, optimizer=gradient_descent_keras.SGD(0.5), metrics=['mse'], - distribute=with_distribution) + distribute=with_distribution, + experimental_run_tf_function=False) training_inputs, eval_inputs, predict_inputs = ( get_correctness_test_inputs(use_numpy, use_validation_data, diff --git a/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py b/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py index 12926cfa164..a4d5f0cf5a1 100644 --- a/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py +++ b/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py @@ -24,17 +24,14 @@ from absl.testing import parameterized from tensorflow.contrib.distribute.python import parameter_server_strategy from tensorflow.core.protobuf import config_pb2 from tensorflow.python.data.ops import dataset_ops -from tensorflow.python.distribute import central_storage_strategy from tensorflow.python.distribute import combinations from tensorflow.python.distribute import device_util from tensorflow.python.distribute import distribution_strategy_context as ds_context from tensorflow.python.distribute import multi_worker_test_base from tensorflow.python.distribute import multi_worker_util -from tensorflow.python.distribute import parameter_server_strategy as core_parameter_server_strategy from tensorflow.python.distribute import reduce_util from tensorflow.python.distribute import strategy_test_lib from tensorflow.python.distribute import values -from tensorflow.python.distribute.cluster_resolver import SimpleClusterResolver from tensorflow.python.eager import backprop from tensorflow.python.eager import context from tensorflow.python.estimator import run_config @@ -69,42 +66,24 @@ def create_test_objects(cluster_spec=None, task_type=None, task_id=None, num_gpus=None, - sess_config=None, - use_core_strategy=False): + sess_config=None): sess_config = sess_config or config_pb2.ConfigProto() if num_gpus is None: num_gpus = context.num_gpus() - if use_core_strategy: - if cluster_spec and task_type and task_id is not None: - cluster_resolver = SimpleClusterResolver( - cluster_spec=multi_worker_util.normalize_cluster_spec(cluster_spec), - task_type=task_type, - task_id=task_id, - num_accelerators={'GPU': num_gpus}) - distribution = 
core_parameter_server_strategy.ParameterServerStrategy( - cluster_resolver) - target = 'grpc://' + cluster_spec[WORKER][task_id] - else: - distribution = ( - central_storage_strategy.CentralStorageStrategy._from_num_gpus( - num_gpus)) - target = '' + distribution = parameter_server_strategy.ParameterServerStrategy( + num_gpus_per_worker=num_gpus) + + if task_type: sess_config = copy.deepcopy(sess_config) - sess_config = distribution.update_config_proto(sess_config) + distribution.configure( + session_config=sess_config, + cluster_spec=cluster_spec, + task_type=task_type, + task_id=task_id) + target = 'grpc://' + cluster_spec[WORKER][task_id] else: - distribution = parameter_server_strategy.ParameterServerStrategy( - num_gpus_per_worker=num_gpus) - if task_type: - sess_config = copy.deepcopy(sess_config) - distribution.configure( - session_config=sess_config, - cluster_spec=cluster_spec, - task_type=task_type, - task_id=task_id) - target = 'grpc://' + cluster_spec[WORKER][task_id] - else: - target = '' + target = '' return distribution, target, sess_config @@ -122,27 +101,17 @@ class ParameterServerStrategyTestBase( self._sess_config = config_pb2.ConfigProto(allow_soft_placement=True) super(ParameterServerStrategyTestBase, self).setUp() - def _get_test_objects(self, - task_type, - task_id, - num_gpus, - use_core_strategy=False): + def _get_test_objects(self, task_type, task_id, num_gpus): return create_test_objects( cluster_spec=self._cluster_spec, task_type=task_type, task_id=task_id, num_gpus=num_gpus, - sess_config=self._sess_config, - use_core_strategy=use_core_strategy) + sess_config=self._sess_config) - def _test_device_assignment_distributed(self, - task_type, - task_id, - num_gpus, - use_core_strategy=False): + def _test_device_assignment_distributed(self, task_type, task_id, num_gpus): worker_device = '/job:%s/replica:0/task:%d' % (task_type, task_id) - d, _, sess_config = self._get_test_objects( - task_type, task_id, num_gpus, use_core_strategy=use_core_strategy) + d, _, sess_config = self._get_test_objects(task_type, task_id, num_gpus) with ops.Graph().as_default(), \ self.cached_session(target=self._default_target, config=sess_config) as sess, \ @@ -240,9 +209,8 @@ class ParameterServerStrategyTestBase( self.assertEqual(f_val, 46.0) def _test_device_assignment_distributed_enable_partitioner( - self, task_type, task_id, num_gpus, use_core_strategy=False): - d, _, sess_config = self._get_test_objects( - task_type, task_id, num_gpus, use_core_strategy=use_core_strategy) + self, task_type, task_id, num_gpus): + d, _, sess_config = self._get_test_objects(task_type, task_id, num_gpus) num_shards = len(d.extended.parameter_devices) partitioner = partitioned_variables.fixed_size_partitioner(num_shards) with ops.Graph().as_default(), \ @@ -390,13 +358,9 @@ class ParameterServerStrategyTestBase( self.assertEqual(z_val, 43.0) self.assertEqual(f_val, 46.0) - def _test_simple_increment(self, - task_type, - task_id, - num_gpus, - use_core_strategy=False): + def _test_simple_increment(self, task_type, task_id, num_gpus): d, master_target, sess_config = self._get_test_objects( - task_type, task_id, num_gpus, use_core_strategy=use_core_strategy) + task_type, task_id, num_gpus) if d.extended._cluster_spec: num_workers = len(d.extended._cluster_spec.as_dict().get(WORKER)) if 'chief' in d.extended._cluster_spec.as_dict(): @@ -462,13 +426,9 @@ class ParameterServerStrategyTestBase( self.assertEqual(y_val, 20.0 + 1.0 * num_workers * d.num_replicas_in_sync) self.assertEqual(z_val, 30.0 + 1.0 * 
num_workers) - def _test_minimize_loss_graph(self, - task_type, - task_id, - num_gpus, - use_core_strategy=False): + def _test_minimize_loss_graph(self, task_type, task_id, num_gpus): d, master_target, sess_config = self._get_test_objects( - task_type, task_id, num_gpus, use_core_strategy=use_core_strategy) + task_type, task_id, num_gpus) if task_type: # Multi-worker assert hasattr(d.extended, '_cluster_spec') and d.extended._cluster_spec @@ -561,10 +521,9 @@ class ParameterServerStrategyTestBase( input_fn, expected_values, test_reinitialize=True, - ignore_order=False, - use_core_strategy=False): + ignore_order=False): distribution, master_target, config = self._get_test_objects( - task_type, task_id, num_gpus, use_core_strategy=use_core_strategy) + task_type, task_id, num_gpus) devices = distribution.extended.worker_devices with ops.Graph().as_default(), \ @@ -613,84 +572,62 @@ class ParameterServerStrategyTest( num_workers=3, num_ps=2) cls._default_target = 'grpc://' + cls._cluster_spec[WORKER][0] - @combinations.generate( - combinations.combine(mode=['graph'], use_core_strategy=[True, False])) - def test_num_replicas_in_sync(self, use_core_strategy): - strategy, _, _ = create_test_objects( - num_gpus=2, use_core_strategy=use_core_strategy) + @combinations.generate(combinations.combine(mode=['graph'])) + def test_num_replicas_in_sync(self): + strategy, _, _ = create_test_objects(num_gpus=2) # All the devices on a given worker are in sync which in this case is the # number of gpus on each worker. self.assertEqual(2, strategy.num_replicas_in_sync) - @combinations.generate( - combinations.combine(mode=['graph'], use_core_strategy=[True, False])) - def testDeviceAssignmentLocalCPU(self, use_core_strategy): - strategy, _, _ = create_test_objects( - num_gpus=0, use_core_strategy=use_core_strategy) + @combinations.generate(combinations.combine(mode=['graph'])) + def testDeviceAssignmentLocalCPU(self): + strategy, _, _ = create_test_objects(num_gpus=0) self._test_device_assignment_local( strategy, compute_device='CPU', variable_device='CPU', num_gpus=0) - @combinations.generate( - combinations.combine(mode=['graph'], use_core_strategy=[True, False])) - def testDeviceAssignmentLocalOneGPU(self, use_core_strategy): - strategy, _, _ = create_test_objects( - num_gpus=1, use_core_strategy=use_core_strategy) + @combinations.generate(combinations.combine(mode=['graph'])) + def testDeviceAssignmentLocalOneGPU(self): + strategy, _, _ = create_test_objects(num_gpus=1) self._test_device_assignment_local( strategy, compute_device='GPU', variable_device='GPU', num_gpus=1) - @combinations.generate( - combinations.combine(mode=['graph'], use_core_strategy=[True, False])) - def testDeviceAssignmentLocalTwoGPUs(self, use_core_strategy): - strategy, _, _ = create_test_objects( - num_gpus=2, use_core_strategy=use_core_strategy) + @combinations.generate(combinations.combine(mode=['graph'])) + def testDeviceAssignmentLocalTwoGPUs(self): + strategy, _, _ = create_test_objects(num_gpus=2) self._test_device_assignment_local( strategy, compute_device='GPU', variable_device='CPU', num_gpus=2) @combinations.generate( - combinations.combine( - mode=['graph'], num_gpus=[0, 1, 2], use_core_strategy=[True, False])) - def testDeviceAssignmentDistributed(self, num_gpus, use_core_strategy): - self._test_device_assignment_distributed( - 'worker', 1, num_gpus, use_core_strategy=use_core_strategy) + combinations.combine(mode=['graph'], num_gpus=[0, 1, 2])) + def testDeviceAssignmentDistributed(self, num_gpus): + 
self._test_device_assignment_distributed('worker', 1, num_gpus) @combinations.generate( - combinations.combine( - mode=['graph'], num_gpus=[0, 1, 2], use_core_strategy=[True, False])) - def testDeviceAssignmentDistributedEnablePartitioner(self, num_gpus, - use_core_strategy): + combinations.combine(mode=['graph'], num_gpus=[0, 1, 2])) + def testDeviceAssignmentDistributedEnablePartitioner(self, num_gpus): self._test_device_assignment_distributed_enable_partitioner( - 'worker', 1, num_gpus, use_core_strategy=use_core_strategy) + 'worker', 1, num_gpus) + + @combinations.generate(combinations.combine(mode=['graph'])) + def testSimpleBetweenGraph(self): + self._run_between_graph_clients(self._test_simple_increment, + self._cluster_spec, context.num_gpus()) @combinations.generate( - combinations.combine(mode=['graph'], use_core_strategy=[True, False])) - def testSimpleBetweenGraph(self, use_core_strategy): - self._run_between_graph_clients( - self._test_simple_increment, - self._cluster_spec, - context.num_gpus(), - use_core_strategy=use_core_strategy) + combinations.combine(mode=['graph'], num_gpus=[0, 1, 2])) + def testLocalSimpleIncrement(self, num_gpus): + self._test_simple_increment(None, 0, num_gpus) @combinations.generate( - combinations.combine( - mode=['graph'], num_gpus=[0, 1, 2], use_core_strategy=[True, False])) - def testLocalSimpleIncrement(self, num_gpus, use_core_strategy): - self._test_simple_increment(None, 0, num_gpus, use_core_strategy) + combinations.combine(mode=['graph'], num_gpus=[0, 1, 2])) + def testMinimizeLossGraphDistributed(self, num_gpus): + self._run_between_graph_clients(self._test_minimize_loss_graph, + self._cluster_spec, num_gpus) @combinations.generate( - combinations.combine( - mode=['graph'], num_gpus=[0, 1, 2], use_core_strategy=[True, False])) - def testMinimizeLossGraphDistributed(self, num_gpus, use_core_strategy): - self._run_between_graph_clients( - self._test_minimize_loss_graph, - self._cluster_spec, - num_gpus, - use_core_strategy=use_core_strategy) - - @combinations.generate( - combinations.combine( - mode=['graph'], num_gpus=[0, 1, 2], use_core_strategy=[True, False])) - def testMinimizeLossGraphLocal(self, num_gpus, use_core_strategy): - self._test_minimize_loss_graph(None, None, num_gpus, use_core_strategy) + combinations.combine(mode=['graph'], num_gpus=[0, 1, 2])) + def testMinimizeLossGraphLocal(self, num_gpus): + self._test_minimize_loss_graph(None, None, num_gpus) # TODO(priyag): Refactor this and other multi worker tests. 
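Editor's note: with the `use_core_strategy` branch removed, `create_test_objects` in this file is left with a single construction path: build the contrib `ParameterServerStrategy` and, for multi-worker cases, call `configure` with the cluster information. The condensed sketch below restates that flow in readable form; the function name and the `'worker'` cluster key are placeholders standing in for the test helper and its `WORKER` constant.

```python
# Condensed, illustrative restatement of the single remaining construction
# path in create_test_objects; names and cluster values are placeholders.
import copy

from tensorflow.contrib.distribute.python import parameter_server_strategy
from tensorflow.core.protobuf import config_pb2


def make_strategy(cluster_spec=None, task_type=None, task_id=None, num_gpus=0,
                  sess_config=None):
  sess_config = sess_config or config_pb2.ConfigProto()
  distribution = parameter_server_strategy.ParameterServerStrategy(
      num_gpus_per_worker=num_gpus)
  if task_type:
    # Multi-worker case: configure against the cluster and point the session
    # target at this task's worker.
    sess_config = copy.deepcopy(sess_config)
    distribution.configure(
        session_config=sess_config,
        cluster_spec=cluster_spec,
        task_type=task_type,
        task_id=task_id)
    target = 'grpc://' + cluster_spec['worker'][task_id]
  else:
    # Local case: no cluster, in-process target.
    target = ''
  return distribution, target, sess_config
```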
@combinations.generate( @@ -698,10 +635,8 @@ class ParameterServerStrategyTest( mode=['graph'], num_gpus=[1, 2], required_gpus=1, - use_core_strategy=[True, False], use_dataset=[True, False])) - def testMakeInputFnIteratorDistributed( - self, num_gpus, use_core_strategy, use_dataset): + def testMakeInputFnIteratorDistributed(self, num_gpus, use_dataset): if context.num_gpus() < num_gpus: self.skipTest('Not enough GPUs') if use_dataset: @@ -726,18 +661,15 @@ class ParameterServerStrategyTest( input_fn, expected_values, test_reinitialize=use_dataset, - ignore_order=not use_dataset, - use_core_strategy=use_core_strategy) + ignore_order=not use_dataset) @combinations.generate( combinations.combine( mode=['graph'], num_gpus=[1, 2], required_gpus=1, - use_core_strategy=[True, False], use_dataset=[True, False])) - def testMakeInputFnIteratorLocal(self, num_gpus, use_core_strategy, - use_dataset): + def testMakeInputFnIteratorLocal(self, num_gpus, use_dataset): if context.num_gpus() < num_gpus: self.skipTest('Not enough GPUs') if use_dataset: @@ -762,24 +694,20 @@ class ParameterServerStrategyTest( input_fn, expected_values, test_reinitialize=use_dataset, - ignore_order=not use_dataset, - use_core_strategy=use_core_strategy) + ignore_order=not use_dataset) - @combinations.generate( - combinations.combine(mode=['graph'], use_core_strategy=[True, False])) - def testGlobalStepUpdate(self, use_core_strategy): - strategy, _, _ = create_test_objects(use_core_strategy=use_core_strategy) + @combinations.generate(combinations.combine(mode=['graph'])) + def testGlobalStepUpdate(self): + strategy, _, _ = create_test_objects() self._test_global_step_update(strategy) - @combinations.generate( - combinations.combine(mode=['graph'], use_core_strategy=[True, False])) - def testUpdateConfigProtoMultiWorker(self, use_core_strategy): + @combinations.generate(combinations.combine(mode=['graph'])) + def testUpdateConfigProtoMultiWorker(self): strategy, _, _ = create_test_objects( cluster_spec=self._cluster_spec, task_type='worker', task_id=1, - num_gpus=2, - use_core_strategy=use_core_strategy) + num_gpus=2) config_proto = config_pb2.ConfigProto(device_filters=['to_be_overridden']) @@ -792,11 +720,9 @@ class ParameterServerStrategyTest( # Verify isolate_session_state self.assertFalse(new_config.isolate_session_state) - @combinations.generate( - combinations.combine(mode=['graph'], use_core_strategy=[True, False])) - def testUpdateConfigProtoLocal(self, use_core_strategy): - strategy, _, _ = create_test_objects( - num_gpus=2, use_core_strategy=use_core_strategy) + @combinations.generate(combinations.combine(mode=['graph'])) + def testUpdateConfigProtoLocal(self): + strategy, _, _ = create_test_objects(num_gpus=2) config_proto = config_pb2.ConfigProto() new_config = strategy.update_config_proto(config_proto) @@ -854,30 +780,20 @@ class ParameterServerStrategyWithChiefTest(ParameterServerStrategyTestBase, num_workers=3, num_ps=2, has_chief=True) cls._default_target = 'grpc://' + cls._cluster_spec[CHIEF][0] - @combinations.generate( - combinations.combine(mode=['graph'], use_core_strategy=[True, False])) - def testSimpleBetweenGraph(self, use_core_strategy): - self._run_between_graph_clients( - self._test_simple_increment, - self._cluster_spec, - context.num_gpus(), - use_core_strategy=use_core_strategy) + @combinations.generate(combinations.combine(mode=['graph'])) + def testSimpleBetweenGraph(self): + self._run_between_graph_clients(self._test_simple_increment, + self._cluster_spec, context.num_gpus()) 
@combinations.generate( - combinations.combine( - mode=['graph'], num_gpus=[0, 1, 2], use_core_strategy=[True, False])) - def testMinimizeLossGraph(self, num_gpus, use_core_strategy): - self._run_between_graph_clients( - self._test_minimize_loss_graph, - self._cluster_spec, - num_gpus, - use_core_strategy=use_core_strategy) + combinations.combine(mode=['graph'], num_gpus=[0, 1, 2])) + def testMinimizeLossGraph(self, num_gpus): + self._run_between_graph_clients(self._test_minimize_loss_graph, + self._cluster_spec, num_gpus) - @combinations.generate( - combinations.combine(mode=['graph'], use_core_strategy=[True, False])) - def testGlobalStepIsWrappedOnTwoGPUs(self, use_core_strategy): - strategy, _, _ = create_test_objects( - num_gpus=2, use_core_strategy=use_core_strategy) + @combinations.generate(combinations.combine(mode=['graph'])) + def testGlobalStepIsWrappedOnTwoGPUs(self): + strategy, _, _ = create_test_objects(num_gpus=2) with ops.Graph().as_default(), strategy.scope(): created_step = training_util.create_global_step() get_step = training_util.get_global_step() @@ -889,11 +805,9 @@ class ParameterServerStrategyWithChiefTest(ParameterServerStrategyTestBase, self.assertIs(values.AggregatingVariable, type(get_step)) self.assertIs(strategy, created_step.distribute_strategy) - @combinations.generate( - combinations.combine(mode=['graph'], use_core_strategy=[True, False])) - def testGlobalStepIsNotWrappedOnOneGPU(self, use_core_strategy): - strategy, _, _ = create_test_objects( - num_gpus=1, use_core_strategy=use_core_strategy) + @combinations.generate(combinations.combine(mode=['graph'])) + def testGlobalStepIsNotWrappedOnOneGPU(self): + strategy, _, _ = create_test_objects(num_gpus=1) with ops.Graph().as_default(), strategy.scope(): created_step = training_util.create_global_step() get_step = training_util.get_global_step() @@ -908,11 +822,9 @@ class ParameterServerStrategyWithChiefTest(ParameterServerStrategyTestBase, self.assertFalse(hasattr(strategy, 'distribute_strategy')) self.assertIs(strategy, created_step._distribute_strategy) - @combinations.generate( - combinations.combine(mode=['graph'], use_core_strategy=[True, False])) - def testValueContainer(self, use_core_strategy): - strategy, _, _ = create_test_objects( - num_gpus=2, use_core_strategy=use_core_strategy) + @combinations.generate(combinations.combine(mode=['graph'])) + def testValueContainer(self): + strategy, _, _ = create_test_objects(num_gpus=2) with ops.Graph().as_default(), strategy.scope(): def f(): @@ -930,11 +842,9 @@ class CentralStorageStrategyTest(strategy_test_lib.DistributionTestBase, parameterized.TestCase): @combinations.generate(combinations.combine(mode=['graph', 'eager'], - use_core_strategy=[True, False], required_gpus=2)) - def testNumpyDataset(self, use_core_strategy): - strategy, _, _ = create_test_objects( - num_gpus=2, use_core_strategy=use_core_strategy) + def testNumpyDataset(self): + strategy, _, _ = create_test_objects(num_gpus=2) self._test_numpy_dataset(strategy) diff --git a/tensorflow/contrib/distributions/BUILD b/tensorflow/contrib/distributions/BUILD index f502a0b8279..87c920efa2b 100644 --- a/tensorflow/contrib/distributions/BUILD +++ b/tensorflow/contrib/distributions/BUILD @@ -513,6 +513,7 @@ cuda_py_test( "//tensorflow/python:platform_test", ], tags = ["nomsan"], # disable to avoid false positives from scipy. 
+ xla_enable_strict_auto_jit = False, ) cuda_py_test( diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_test.py index dc18eb3df69..8b61d4be63c 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_test.py @@ -24,6 +24,7 @@ import numpy as np from tensorflow.contrib.distributions.python.ops.bijectors.affine import Affine from tensorflow.python.framework import dtypes +from tensorflow.python.framework import errors from tensorflow.python.ops import array_ops from tensorflow.python.platform import test @@ -461,13 +462,14 @@ class AffineBijectorTest(test.TestCase): def testNoBatchMultivariateRaisesWhenSingular(self): with self.cached_session(): mu = [1., -1] - bijector = Affine( - shift=mu, - # Has zero on the diagonal. - scale_diag=[0., 1], - validate_args=True) - with self.assertRaisesOpError("diagonal part must be non-zero"): - bijector.forward([1., 1.]).eval() + with self.assertRaisesRegexp(errors.InvalidArgumentError, + "diagonal part must be non-zero"): + _ = Affine( + shift=mu, + # Has zero on the diagonal. + scale_diag=[0., 1], + validate_args=True) + # Error detected statically; don't need to run the op. def _makeScale(self, x, diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/reshape_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/reshape_test.py index 79eadf524b5..f3d63da373a 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/reshape_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/reshape_test.py @@ -22,6 +22,7 @@ import numpy as np from tensorflow.contrib.distributions.python.ops.bijectors.reshape import Reshape from tensorflow.python.framework import dtypes +from tensorflow.python.framework import errors from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import array_ops from tensorflow.python.ops.distributions.bijector_test_util import assert_bijective_and_finite @@ -150,6 +151,27 @@ class _ReshapeBijectorTest(object): with self.assertRaisesError(expected_error_message): sess.run(bijector.forward_event_shape_tensor(shape_in), feed_dict=feed_dict) + + def _testInvalidDimensionsStatic(self, expected_error_message): + """Version of _testInvalidDimensionsOpError for errors detected statically. + + Statically means at graph construction time. + + Args: + expected_error_message: String that should be present in the error + message that `Reshape` raises for invalid shapes. 
+ """ + shape_in, shape_out, _ = self.build_shapes([2, 3], [ + 1, + 2, + -2, + ]) + with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError, + expected_error_message): + _ = Reshape( + event_shape_out=shape_out, + event_shape_in=shape_in, + validate_args=True) # pylint: enable=invalid-name def testValidButNonMatchingInputOpError(self): @@ -300,9 +322,9 @@ class ReshapeBijectorTestStatic(test.TestCase, _ReshapeBijectorTest): assert_bijective_and_finite( bijector, x, y, event_ndims=2, rtol=1e-6, atol=0) - def testInvalidDimensionsOpError(self): - self._testInvalidDimensionsOpError( - "Invalid value in tensor used for shape: -2") + def testInvalidDimensionsStatic(self): + self._testInvalidDimensionsStatic( + "elements must be either positive integers or `-1`") def testInputOutputMismatchOpError(self): self._testInputOutputMismatchOpError("Cannot reshape a tensor with") diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softplus_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softplus_test.py index e805619041d..2e7ab3ecfd2 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softplus_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softplus_test.py @@ -21,6 +21,7 @@ from __future__ import print_function import numpy as np from tensorflow.contrib.distributions.python.ops.bijectors.softplus import Softplus +from tensorflow.python.framework import errors from tensorflow.python.ops.distributions.bijector_test_util import assert_bijective_and_finite from tensorflow.python.ops.distributions.bijector_test_util import assert_scalar_congruency from tensorflow.python.platform import test @@ -43,9 +44,10 @@ class SoftplusBijectorTest(test.TestCase): def testHingeSoftnessZeroRaises(self): with self.cached_session(): - bijector = Softplus(hinge_softness=0., validate_args=True) - with self.assertRaisesOpError("must be non-zero"): - bijector.forward([1., 1.]).eval() + with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError, + "must be non-zero"): + _ = Softplus(hinge_softness=0., validate_args=True) + # Error detected statically; don't need to run op. 
def testBijectorForwardInverseEventDimsZero(self): with self.cached_session(): diff --git a/tensorflow/contrib/distributions/python/kernel_tests/cauchy_test.py b/tensorflow/contrib/distributions/python/kernel_tests/cauchy_test.py index 4411d6f4611..f5d6944d166 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/cauchy_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/cauchy_test.py @@ -24,6 +24,7 @@ import numpy as np from tensorflow.contrib.distributions.python.ops import cauchy as cauchy_lib from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes +from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import array_ops @@ -400,9 +401,10 @@ class CauchyTest(test.TestCase): def testCauchyNegativeLocFails(self): with self.cached_session(): - cauchy = cauchy_lib.Cauchy(loc=[1.], scale=[-5.], validate_args=True) - with self.assertRaisesOpError("Condition x > 0 did not hold"): - cauchy.mode().eval() + with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError, + "Condition x > 0 did not hold"): + _ = cauchy_lib.Cauchy(loc=[1.], scale=[-5.], validate_args=True) + # Error detected statically; no need for _.mode().eval() def testCauchyShape(self): with self.cached_session(): diff --git a/tensorflow/contrib/distributions/python/kernel_tests/deterministic_test.py b/tensorflow/contrib/distributions/python/kernel_tests/deterministic_test.py index 36fc7a70c8a..bdcf6f39445 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/deterministic_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/deterministic_test.py @@ -19,6 +19,7 @@ from __future__ import print_function import numpy as np from tensorflow.contrib.distributions.python.ops import deterministic as deterministic_lib +from tensorflow.python.framework import errors from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import array_ops from tensorflow.python.platform import test @@ -40,11 +41,10 @@ class DeterministicTest(test.TestCase): def testInvalidTolRaises(self): loc = rng.rand(2, 3, 4).astype(np.float32) - deterministic = deterministic_lib.Deterministic( - loc, atol=-1, validate_args=True) - with self.cached_session(): - with self.assertRaisesOpError("Condition x >= 0"): - deterministic.prob(0.).eval() + with self.assertRaisesRegexp(errors.InvalidArgumentError, + "Condition x >= 0"): + _ = deterministic_lib.Deterministic(loc, atol=-1, validate_args=True) + # Error detected statically; no need for _.prob(0.).eval() def testProbWithNoBatchDimsIntegerType(self): deterministic = deterministic_lib.Deterministic(0) @@ -195,16 +195,16 @@ class VectorDeterministicTest(test.TestCase): def testInvalidTolRaises(self): loc = rng.rand(2, 3, 4).astype(np.float32) - deterministic = deterministic_lib.VectorDeterministic( - loc, atol=-1, validate_args=True) - with self.cached_session(): - with self.assertRaisesOpError("Condition x >= 0"): - deterministic.prob(loc).eval() + with self.assertRaisesRegexp(errors.InvalidArgumentError, + "Condition x >= 0"): + _ = deterministic_lib.VectorDeterministic( + loc, atol=-1, validate_args=True) + # Error detected statically; no need for _.prob(loc).eval() def testInvalidXRaises(self): loc = rng.rand(2, 3, 4).astype(np.float32) deterministic = deterministic_lib.VectorDeterministic( - loc, atol=-1, validate_args=True) + loc, atol=None, validate_args=True) with 
self.cached_session(): with self.assertRaisesRegexp(ValueError, "must have rank at least 1"): deterministic.prob(0.).eval() diff --git a/tensorflow/contrib/distributions/python/kernel_tests/half_normal_test.py b/tensorflow/contrib/distributions/python/kernel_tests/half_normal_test.py index 686de9d2465..3ed96e6fdb8 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/half_normal_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/half_normal_test.py @@ -24,6 +24,7 @@ import numpy as np from tensorflow.contrib.distributions.python.ops import half_normal as hn_lib from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes +from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import array_ops @@ -41,6 +42,7 @@ def try_import(name): # pylint: disable=invalid-name tf_logging.warning("Could not import %s: %s" % (name, str(e))) return module + stats = try_import("scipy.stats") @@ -288,9 +290,10 @@ class HalfNormalTest(test.TestCase): def testNegativeSigmaFails(self): with self.cached_session(): - halfnorm = hn_lib.HalfNormal(scale=[-5.], validate_args=True, name="G") - with self.assertRaisesOpError("Condition x > 0 did not hold"): - halfnorm.mean().eval() + with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError, + "Condition x > 0 did not hold"): + _ = hn_lib.HalfNormal(scale=[-5.], validate_args=True, name="G") + # Error detected statically; no need for _.mean().eval() def testHalfNormalShape(self): with self.cached_session(): diff --git a/tensorflow/contrib/distributions/python/kernel_tests/inverse_gamma_test.py b/tensorflow/contrib/distributions/python/kernel_tests/inverse_gamma_test.py index 70551d89d9c..7c46674cc04 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/inverse_gamma_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/inverse_gamma_test.py @@ -22,6 +22,7 @@ from scipy import stats from tensorflow.contrib.distributions.python.ops import inverse_gamma from tensorflow.python.client import session from tensorflow.python.framework import constant_op +from tensorflow.python.framework import errors from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import nn_ops from tensorflow.python.platform import test @@ -249,7 +250,8 @@ class InverseGammaTest(test.TestCase): fails += 0 if self._kstest(a, b, s) else 1 self.assertLess(fails, trials * 0.03) - def _kstest(self, alpha, beta, samples): + @staticmethod + def _kstest(alpha, beta, samples): # Uses the Kolmogorov-Smirnov test for goodness of fit. ks, _ = stats.kstest(samples, stats.invgamma(alpha, scale=beta).cdf) # Return True when the test passes. 
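Editor's note: for context on the `_kstest` helper that the hunk above turns into a static method, it checks sampled values against the analytic inverse-gamma CDF using SciPy's Kolmogorov-Smirnov test. The standalone sketch below uses made-up parameters, sample size, and acceptance threshold; it is not the test's exact code.

```python
# Standalone sketch of a Kolmogorov-Smirnov goodness-of-fit check; parameters,
# sample size, and threshold are illustrative only.
from scipy import stats

alpha, beta = 2.0, 3.0
samples = stats.invgamma(alpha, scale=beta).rvs(size=10000, random_state=0)

# Compare the empirical samples against the analytic CDF; a small KS statistic
# means the samples are consistent with the target distribution.
ks_statistic, p_value = stats.kstest(samples, stats.invgamma(alpha, scale=beta).cdf)
fit_ok = ks_statistic < 0.02
```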
@@ -295,16 +297,18 @@ class InverseGammaTest(test.TestCase): with self.cached_session(): alpha_v = constant_op.constant(0.0, name="alpha") beta_v = constant_op.constant(1.0, name="beta") - inv_gamma = inverse_gamma.InverseGamma( - concentration=alpha_v, rate=beta_v, validate_args=True) - with self.assertRaisesOpError("alpha"): - inv_gamma.mean().eval() + with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError, + "alpha"): + _ = inverse_gamma.InverseGamma( + concentration=alpha_v, rate=beta_v, validate_args=True) + # Error detected statically; no need for _.mean().eval() alpha_v = constant_op.constant(1.0, name="alpha") beta_v = constant_op.constant(0.0, name="beta") - inv_gamma = inverse_gamma.InverseGamma( - concentration=alpha_v, rate=beta_v, validate_args=True) - with self.assertRaisesOpError("beta"): - inv_gamma.mean().eval() + with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError, + "beta"): + _ = inverse_gamma.InverseGamma( + concentration=alpha_v, rate=beta_v, validate_args=True) + # Error detected statically; no need for _.mean().eval() def testInverseGammaWithSoftplusConcentrationRate(self): with self.cached_session(): diff --git a/tensorflow/contrib/distributions/python/kernel_tests/quantized_distribution_test.py b/tensorflow/contrib/distributions/python/kernel_tests/quantized_distribution_test.py index 07528cafaf1..82257e136ba 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/quantized_distribution_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/quantized_distribution_test.py @@ -21,6 +21,7 @@ import numpy as np from scipy import stats from tensorflow.contrib import distributions as distributions_lib from tensorflow.python.framework import dtypes +from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import gradients_impl @@ -361,15 +362,14 @@ class QuantizedDistributionTest(test.TestCase): def testLowerCutoffMustBeBelowUpperCutoffOrWeRaise(self): with self.cached_session(): - qdist = distributions.QuantizedDistribution( - distribution=distributions.Normal(loc=0., scale=1.), - low=1., # not strictly less than high. - high=1., - validate_args=True) - - self.assertTrue(qdist.validate_args) # Default is True. - with self.assertRaisesOpError("must be strictly less"): - qdist.sample().eval() + with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError, + "must be strictly less"): + _ = distributions.QuantizedDistribution( + distribution=distributions.Normal(loc=0., scale=1.), + low=1., # not strictly less than high. 
+ high=1., + validate_args=True) + # Error detected statically; no need for _.sample().eval() def testCutoffsMustBeIntegerValuedIfValidateArgsTrue(self): with self.cached_session(): diff --git a/tensorflow/contrib/distributions/python/kernel_tests/relaxed_bernoulli_test.py b/tensorflow/contrib/distributions/python/kernel_tests/relaxed_bernoulli_test.py index fec23749286..aa90dae88bb 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/relaxed_bernoulli_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/relaxed_bernoulli_test.py @@ -94,12 +94,11 @@ class RelaxedBernoulliTest(test.TestCase): """If validate_args, raises InvalidArgumentError when temperature is 0.""" temperature = constant_op.constant(0.0) p = constant_op.constant([0.1, 0.4]) - dist = relaxed_bernoulli.RelaxedBernoulli(temperature, probs=p, - validate_args=True) - with self.cached_session(): - sample = dist.sample() - with self.assertRaises(errors_impl.InvalidArgumentError): - sample.eval() + with self.assertRaisesWithPredicateMatch(errors_impl.InvalidArgumentError, + "x > 0 did not hold"): + _ = relaxed_bernoulli.RelaxedBernoulli( + temperature, probs=p, validate_args=True) + # Error detected statically; no need to run the op. def testDtype(self): temperature = constant_op.constant(1.0, dtype=dtypes.float32) diff --git a/tensorflow/contrib/distributions/python/kernel_tests/wishart_test.py b/tensorflow/contrib/distributions/python/kernel_tests/wishart_test.py index cdee30bbc42..c924a22c290 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/wishart_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/wishart_test.py @@ -382,7 +382,7 @@ class WishartCholeskyTest(test.TestCase): with self.assertRaisesRegexp(ValueError, "cannot be less than"): distributions.WishartCholesky( df=2, scale=chol_scale, validate_args=False) - with self.assertRaisesRegexp(TypeError, "Argument tril must have dtype"): + with self.assertRaisesRegexp(TypeError, "."): distributions.WishartCholesky( df=4., scale=np.asarray( diff --git a/tensorflow/contrib/distributions/python/ops/batch_reshape.py b/tensorflow/contrib/distributions/python/ops/batch_reshape.py index d4503790888..e174596defd 100644 --- a/tensorflow/contrib/distributions/python/ops/batch_reshape.py +++ b/tensorflow/contrib/distributions/python/ops/batch_reshape.py @@ -191,10 +191,8 @@ class BatchReshape(distribution_lib.Distribution): self.distribution.survival_function, x) def _entropy(self): - return self._call_and_reshape_output( - self.distribution.entropy, - [], - [tensor_shape.scalar()]) + return self._call_and_reshape_output(self.distribution.entropy, [], + [tensor_shape.TensorShape([])]) def _mean(self): return self._call_and_reshape_output(self.distribution.mean) @@ -381,7 +379,7 @@ def calculate_reshape(original_shape, new_shape, validate=False, name=None): size_implicit_dim = ( original_size // math_ops.maximum(1, -math_ops.reduce_prod(new_shape))) new_ndims = array_ops.shape(new_shape) - expanded_new_shape = array_ops.where( # Assumes exactly one `-1`. + expanded_new_shape = array_ops.where_v2( # Assumes exactly one `-1`. 
implicit_dim, array_ops.fill(new_ndims, size_implicit_dim), new_shape) validations = [] if not validate else [ check_ops.assert_rank( diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/affine.py b/tensorflow/contrib/distributions/python/ops/bijectors/affine.py index fcc8898f6eb..2e0fd592c6c 100644 --- a/tensorflow/contrib/distributions/python/ops/bijectors/affine.py +++ b/tensorflow/contrib/distributions/python/ops/bijectors/affine.py @@ -22,7 +22,6 @@ from tensorflow.contrib.distributions.python.ops import distribution_util from tensorflow.contrib.distributions.python.ops.shape import _DistributionShape from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops -from tensorflow.python.framework import tensor_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import check_ops from tensorflow.python.ops import control_flow_ops @@ -254,8 +253,6 @@ class Affine(bijector.Bijector): super(Affine, self).__init__( forward_min_event_ndims=1, graph_parents=( - [self._scale] if tensor_util.is_tensor(self._scale) - else self._scale.graph_parents + [self._shift] if self._shift is not None else []), is_constant_jacobian=True, dtype=dtype, diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/affine_linear_operator.py b/tensorflow/contrib/distributions/python/ops/bijectors/affine_linear_operator.py index 91301f15ad8..722d843f7f4 100644 --- a/tensorflow/contrib/distributions/python/ops/bijectors/affine_linear_operator.py +++ b/tensorflow/contrib/distributions/python/ops/bijectors/affine_linear_operator.py @@ -141,7 +141,6 @@ class AffineLinearOperator(bijector.Bijector): raise TypeError("scale is not an instance of tf.LinearOperator") if validate_args and not scale.is_non_singular: raise ValueError("Scale matrix must be non-singular.") - graph_parents += scale.graph_parents if scale.tensor_rank is not None: batch_ndims = scale.tensor_rank - 2 else: diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/sinh_arcsinh.py b/tensorflow/contrib/distributions/python/ops/bijectors/sinh_arcsinh.py index 241fba2cb7e..aee3a603d2b 100644 --- a/tensorflow/contrib/distributions/python/ops/bijectors/sinh_arcsinh.py +++ b/tensorflow/contrib/distributions/python/ops/bijectors/sinh_arcsinh.py @@ -43,7 +43,7 @@ __all__ = [ warn_once=True) def _sqrtx2p1(x): """Implementation of `sqrt(1 + x**2)` which is stable despite large `x`.""" - return array_ops.where( + return array_ops.where_v2( math_ops.abs(x) * np.sqrt(np.finfo(x.dtype.as_numpy_dtype).eps) <= 1., math_ops.sqrt(x**2. + 1.), # For large x, calculating x**2 can overflow. 
This can be alleviated by diff --git a/tensorflow/contrib/distributions/python/ops/binomial.py b/tensorflow/contrib/distributions/python/ops/binomial.py index b349e5966dd..38505c172f6 100644 --- a/tensorflow/contrib/distributions/python/ops/binomial.py +++ b/tensorflow/contrib/distributions/python/ops/binomial.py @@ -68,9 +68,9 @@ def _bdtr(k, n, p): # where(unsafe, safe_output, betainc(where(unsafe, safe_input, input))) ones = array_ops.ones_like(n - k) k_eq_n = math_ops.equal(k, n) - safe_dn = array_ops.where(k_eq_n, ones, n - k) + safe_dn = array_ops.where_v2(k_eq_n, ones, n - k) dk = math_ops.betainc(a=safe_dn, b=k + 1, x=1 - p) - return array_ops.where(k_eq_n, ones, dk) + return array_ops.where_v2(k_eq_n, ones, dk) class Binomial(distribution.Distribution): @@ -230,7 +230,7 @@ class Binomial(distribution.Distribution): return constant_op.constant([], dtype=dtypes.int32) def _event_shape(self): - return tensor_shape.scalar() + return tensor_shape.TensorShape([]) @distribution_util.AppendDocstring(_binomial_sample_note) def _log_prob(self, counts): diff --git a/tensorflow/contrib/distributions/python/ops/cauchy.py b/tensorflow/contrib/distributions/python/ops/cauchy.py index c461833b9ae..6b1a022a312 100644 --- a/tensorflow/contrib/distributions/python/ops/cauchy.py +++ b/tensorflow/contrib/distributions/python/ops/cauchy.py @@ -173,7 +173,7 @@ class Cauchy(distribution.Distribution): return constant_op.constant([], dtype=dtypes.int32) def _event_shape(self): - return tensor_shape.scalar() + return tensor_shape.TensorShape([]) def _sample_n(self, n, seed=None): shape = array_ops.concat([[n], self.batch_shape_tensor()], 0) diff --git a/tensorflow/contrib/distributions/python/ops/deterministic.py b/tensorflow/contrib/distributions/python/ops/deterministic.py index 507c5d36794..0d57a2ddc60 100644 --- a/tensorflow/contrib/distributions/python/ops/deterministic.py +++ b/tensorflow/contrib/distributions/python/ops/deterministic.py @@ -281,7 +281,7 @@ class Deterministic(_BaseDeterministic): return constant_op.constant([], dtype=dtypes.int32) def _event_shape(self): - return tensor_shape.scalar() + return tensor_shape.TensorShape([]) def _prob(self, x): return math_ops.cast( diff --git a/tensorflow/contrib/distributions/python/ops/distribution_util.py b/tensorflow/contrib/distributions/python/ops/distribution_util.py index 85692d271b6..e6acae57a40 100644 --- a/tensorflow/contrib/distributions/python/ops/distribution_util.py +++ b/tensorflow/contrib/distributions/python/ops/distribution_util.py @@ -305,7 +305,7 @@ def shapes_from_loc_and_scale(loc, scale, name="shapes_from_loc_and_scale"): ValueError: If the last dimension of `loc` is determined statically to be different than the range of `scale`. """ - with ops.name_scope(name, values=[loc] + scale.graph_parents): + with ops.name_scope(name, values=[loc]): # Get event shape. 
event_size = scale.range_dimension_tensor() event_size_const = tensor_util.constant_value(event_size) @@ -475,10 +475,9 @@ def pad_mixture_dimensions(x, mixture_distribution, categorical_distribution, return array_ops.shape(d.batch_shape_tensor())[0] dist_batch_ndims = _get_ndims(mixture_distribution) cat_batch_ndims = _get_ndims(categorical_distribution) - pad_ndims = array_ops.where( - categorical_distribution.is_scalar_batch(), - dist_batch_ndims, - dist_batch_ndims - cat_batch_ndims) + pad_ndims = array_ops.where_v2(categorical_distribution.is_scalar_batch(), + dist_batch_ndims, + dist_batch_ndims - cat_batch_ndims) s = array_ops.shape(x) x = array_ops.reshape(x, shape=array_ops.concat([ s[:-1], diff --git a/tensorflow/contrib/distributions/python/ops/geometric.py b/tensorflow/contrib/distributions/python/ops/geometric.py index d62f024aa2a..0b5c47056f3 100644 --- a/tensorflow/contrib/distributions/python/ops/geometric.py +++ b/tensorflow/contrib/distributions/python/ops/geometric.py @@ -132,7 +132,7 @@ class Geometric(distribution.Distribution): return array_ops.constant([], dtype=dtypes.int32) def _event_shape(self): - return tensor_shape.scalar() + return tensor_shape.TensorShape([]) def _sample_n(self, n, seed=None): # Uniform variates must be sampled from the open-interval `(0, 1)` rather diff --git a/tensorflow/contrib/distributions/python/ops/gumbel.py b/tensorflow/contrib/distributions/python/ops/gumbel.py index 4b50df5b481..341d63f573b 100644 --- a/tensorflow/contrib/distributions/python/ops/gumbel.py +++ b/tensorflow/contrib/distributions/python/ops/gumbel.py @@ -178,7 +178,7 @@ class _Gumbel(distribution.Distribution): return constant_op.constant([], dtype=dtypes.int32) def _event_shape(self): - return tensor_shape.scalar() + return tensor_shape.TensorShape([]) def _sample_n(self, n, seed=None): # Uniform variates must be sampled from the open-interval `(0, 1)` rather diff --git a/tensorflow/contrib/distributions/python/ops/half_normal.py b/tensorflow/contrib/distributions/python/ops/half_normal.py index f1216370869..1f04090b3ac 100644 --- a/tensorflow/contrib/distributions/python/ops/half_normal.py +++ b/tensorflow/contrib/distributions/python/ops/half_normal.py @@ -150,7 +150,7 @@ class HalfNormal(distribution.Distribution): return constant_op.constant([], dtype=dtypes.int32) def _event_shape(self): - return tensor_shape.scalar() + return tensor_shape.TensorShape([]) def _sample_n(self, n, seed=None): shape = array_ops.concat([[n], self.batch_shape_tensor()], 0) diff --git a/tensorflow/contrib/distributions/python/ops/inverse_gamma.py b/tensorflow/contrib/distributions/python/ops/inverse_gamma.py index 9f1e9d5cd1b..e55b4a1457a 100644 --- a/tensorflow/contrib/distributions/python/ops/inverse_gamma.py +++ b/tensorflow/contrib/distributions/python/ops/inverse_gamma.py @@ -187,7 +187,7 @@ class InverseGamma(distribution.Distribution): return constant_op.constant([], dtype=dtypes.int32) def _event_shape(self): - return tensor_shape.scalar() + return tensor_shape.TensorShape([]) @distribution_util.AppendDocstring( """Note: See `tf.random.gamma` docstring for sampling details and @@ -236,7 +236,7 @@ class InverseGamma(distribution.Distribution): self.batch_shape_tensor(), np.array(np.nan, dtype=self.dtype.as_numpy_dtype()), name="nan") - return array_ops.where(self.concentration > 1., mean, nan) + return array_ops.where_v2(self.concentration > 1., mean, nan) else: return control_flow_ops.with_dependencies([ check_ops.assert_less( @@ -257,7 +257,7 @@ class 
InverseGamma(distribution.Distribution): self.batch_shape_tensor(), np.array(np.nan, dtype=self.dtype.as_numpy_dtype()), name="nan") - return array_ops.where(self.concentration > 2., var, nan) + return array_ops.where_v2(self.concentration > 2., var, nan) else: return control_flow_ops.with_dependencies([ check_ops.assert_less( diff --git a/tensorflow/contrib/distributions/python/ops/kumaraswamy.py b/tensorflow/contrib/distributions/python/ops/kumaraswamy.py index e3712dd84e3..56f35c28b1b 100644 --- a/tensorflow/contrib/distributions/python/ops/kumaraswamy.py +++ b/tensorflow/contrib/distributions/python/ops/kumaraswamy.py @@ -235,7 +235,7 @@ class Kumaraswamy(transformed_distribution.TransformedDistribution): np.array(np.nan, dtype=self.dtype.as_numpy_dtype), name="nan") is_defined = (self.concentration1 > 1.) & (self.concentration0 > 1.) - return array_ops.where(is_defined, mode, nan) + return array_ops.where_v2(is_defined, mode, nan) return control_flow_ops.with_dependencies([ check_ops.assert_less( diff --git a/tensorflow/contrib/distributions/python/ops/logistic.py b/tensorflow/contrib/distributions/python/ops/logistic.py index 21c9b5a3544..03c5ba2997a 100644 --- a/tensorflow/contrib/distributions/python/ops/logistic.py +++ b/tensorflow/contrib/distributions/python/ops/logistic.py @@ -173,7 +173,7 @@ class Logistic(distribution.Distribution): return constant_op.constant([], dtype=dtypes.int32) def _event_shape(self): - return tensor_shape.scalar() + return tensor_shape.TensorShape([]) def _sample_n(self, n, seed=None): # Uniform variates must be sampled from the open-interval `(0, 1)` rather diff --git a/tensorflow/contrib/distributions/python/ops/mvn_linear_operator.py b/tensorflow/contrib/distributions/python/ops/mvn_linear_operator.py index 8fdc99824b6..f9b51cc5a62 100644 --- a/tensorflow/contrib/distributions/python/ops/mvn_linear_operator.py +++ b/tensorflow/contrib/distributions/python/ops/mvn_linear_operator.py @@ -186,7 +186,7 @@ class MultivariateNormalLinearOperator( if not scale.dtype.is_floating: raise TypeError("`scale` parameter must have floating-point dtype.") - with ops.name_scope(name, values=[loc] + scale.graph_parents) as name: + with ops.name_scope(name, values=[loc]) as name: # Since expand_dims doesn't preserve constant-ness, we obtain the # non-dynamic value if possible. 
loc = ops.convert_to_tensor(loc, name="loc") if loc is not None else loc @@ -329,8 +329,7 @@ def _kl_brute_force(a, b, name=None): isinstance(x, linalg.LinearOperatorScaledIdentity) or isinstance(x, linalg.LinearOperatorDiag)) - with ops.name_scope(name, "kl_mvn", values=[a.loc, b.loc] + - a.scale.graph_parents + b.scale.graph_parents): + with ops.name_scope(name, "kl_mvn", values=[a.loc, b.loc]): # Calculation is based on: # http://stats.stackexchange.com/questions/60680/kl-divergence-between-two-multivariate-gaussians # and, diff --git a/tensorflow/contrib/distributions/python/ops/negative_binomial.py b/tensorflow/contrib/distributions/python/ops/negative_binomial.py index 6acfc5746a0..faf9827c8bf 100644 --- a/tensorflow/contrib/distributions/python/ops/negative_binomial.py +++ b/tensorflow/contrib/distributions/python/ops/negative_binomial.py @@ -145,7 +145,7 @@ class NegativeBinomial(distribution.Distribution): return array_ops.constant([], dtype=dtypes.int32) def _event_shape(self): - return tensor_shape.scalar() + return tensor_shape.TensorShape([]) def _sample_n(self, n, seed=None): # Here we use the fact that if: @@ -190,10 +190,9 @@ class NegativeBinomial(distribution.Distribution): return self.total_count * math_ops.exp(self.logits) def _mode(self): - adjusted_count = array_ops.where( - 1. < self.total_count, - self.total_count - 1., - array_ops.zeros_like(self.total_count)) + adjusted_count = array_ops.where_v2(1. < self.total_count, + self.total_count - 1., + array_ops.zeros_like(self.total_count)) return math_ops.floor(adjusted_count * math_ops.exp(self.logits)) def _variance(self): diff --git a/tensorflow/contrib/distributions/python/ops/poisson.py b/tensorflow/contrib/distributions/python/ops/poisson.py index 3d055085cc7..64c41c57d79 100644 --- a/tensorflow/contrib/distributions/python/ops/poisson.py +++ b/tensorflow/contrib/distributions/python/ops/poisson.py @@ -151,7 +151,7 @@ class Poisson(distribution.Distribution): return constant_op.constant([], dtype=dtypes.int32) def _event_shape(self): - return tensor_shape.scalar() + return tensor_shape.TensorShape([]) @distribution_util.AppendDocstring(_poisson_sample_note) def _log_prob(self, x): diff --git a/tensorflow/contrib/distributions/python/ops/poisson_lognormal.py b/tensorflow/contrib/distributions/python/ops/poisson_lognormal.py index 85683e3233d..b23a3231d27 100644 --- a/tensorflow/contrib/distributions/python/ops/poisson_lognormal.py +++ b/tensorflow/contrib/distributions/python/ops/poisson_lognormal.py @@ -355,7 +355,7 @@ class PoissonLogNormalQuadratureCompound(distribution_lib.Distribution): self.mixture_distribution.logits.shape)[:-1] def _event_shape(self): - return tensor_shape.scalar() + return tensor_shape.TensorShape([]) def _sample_n(self, n, seed=None): # Get ids as a [n, batch_size]-shaped matrix, unless batch_shape=[] then get diff --git a/tensorflow/contrib/distributions/python/ops/shape.py b/tensorflow/contrib/distributions/python/ops/shape.py index 19d88d5ab5d..1be2dd1c719 100644 --- a/tensorflow/contrib/distributions/python/ops/shape.py +++ b/tensorflow/contrib/distributions/python/ops/shape.py @@ -457,9 +457,9 @@ class _DistributionShape(object): batch_shape = s[1:1+self.batch_ndims] # Since sample_dims=1 and is left-most, we add 1 to the number of # batch_ndims to get the event start dim. 
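The hunk that follows, like several other hunks in this patch, replaces `array_ops.where` with `array_ops.where_v2`. At these call sites the operands already share a shape (often all scalars), so behaviour is unchanged and the swap reads as a forward-compatibility cleanup; the substantive difference is that `where_v2` broadcasts `condition`, `x` and `y` against each other NumPy-style, whereas the v1 op requires `condition` to match `x` exactly or to be a vector whose length equals `x`'s first dimension. (The parallel `tensor_shape.scalar()` to `tensor_shape.TensorShape([])` edits are a like-for-like rename, since `scalar()` simply returned `TensorShape([])`.) A minimal sketch of the `where` difference, not part of the patch, using the same internal modules the surrounding code already imports:

```python
# Sketch only: contrast v1/v2 `where` shape handling.
from tensorflow.python.framework import constant_op
from tensorflow.python.ops import array_ops

cond = constant_op.constant([True, False, True])   # shape [3]
x = constant_op.constant([[1, 2, 3], [4, 5, 6]])   # shape [2, 3]
y = array_ops.zeros_like(x)

# where_v2 broadcasts `cond` against the trailing axis of `x`/`y`,
# yielding [[1, 0, 3], [4, 0, 6]].
selected = array_ops.where_v2(cond, x, y)

# array_ops.where would reject the same call: a rank-1 `condition` must have
# the same length as the *first* dimension of `x` (2 here, not 3).
```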
- event_start = array_ops.where( - math_ops.logical_and(expand_batch_dim, self._batch_ndims_is_0), - 2, 1 + self.batch_ndims) + event_start = array_ops.where_v2( + math_ops.logical_and(expand_batch_dim, self._batch_ndims_is_0), 2, + 1 + self.batch_ndims) event_shape = s[event_start:event_start+self.event_ndims] new_shape = array_ops.concat([sample_shape, batch_shape, event_shape], 0) x = array_ops.reshape(x, shape=new_shape) diff --git a/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py b/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py index f9748466c2e..f17ac136406 100644 --- a/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py +++ b/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py @@ -524,8 +524,7 @@ class VectorDiffeomixture(distribution_lib.Distribution): parameters=parameters, graph_parents=( distribution._graph_parents # pylint: disable=protected-access - + [loc_ for loc_ in loc if loc_ is not None] - + [p for scale_ in scale for p in scale_.graph_parents]), + + [loc_ for loc_ in loc if loc_ is not None]), name=name) @property @@ -1060,5 +1059,5 @@ def softmax(x, axis, name=None): if axis_ is not None: axis = np.int(ndims + axis_ if axis_ < 0 else axis_) else: - axis = array_ops.where(axis < 0, ndims + axis, axis) + axis = array_ops.where_v2(axis < 0, ndims + axis, axis) return nn_ops.softmax(x, axis=axis) diff --git a/tensorflow/contrib/distributions/python/ops/vector_exponential_linear_operator.py b/tensorflow/contrib/distributions/python/ops/vector_exponential_linear_operator.py index fd5bf9ecc72..9dcd60dab5a 100644 --- a/tensorflow/contrib/distributions/python/ops/vector_exponential_linear_operator.py +++ b/tensorflow/contrib/distributions/python/ops/vector_exponential_linear_operator.py @@ -191,7 +191,7 @@ class VectorExponentialLinearOperator( if not scale.dtype.is_floating: raise TypeError("`scale` parameter must have floating-point dtype.") - with ops.name_scope(name, values=[loc] + scale.graph_parents) as name: + with ops.name_scope(name, values=[loc]) as name: # Since expand_dims doesn't preserve constant-ness, we obtain the # non-dynamic value if possible. loc = ops.convert_to_tensor(loc, name="loc") if loc is not None else loc diff --git a/tensorflow/contrib/distributions/python/ops/vector_laplace_linear_operator.py b/tensorflow/contrib/distributions/python/ops/vector_laplace_linear_operator.py index 67d2ccd28d6..313046db9ba 100644 --- a/tensorflow/contrib/distributions/python/ops/vector_laplace_linear_operator.py +++ b/tensorflow/contrib/distributions/python/ops/vector_laplace_linear_operator.py @@ -207,7 +207,7 @@ class VectorLaplaceLinearOperator( if not scale.dtype.is_floating: raise TypeError("`scale` parameter must have floating-point dtype.") - with ops.name_scope(name, values=[loc] + scale.graph_parents): + with ops.name_scope(name, values=[loc]): # Since expand_dims doesn't preserve constant-ness, we obtain the # non-dynamic value if possible. 
loc = ops.convert_to_tensor(loc, name="loc") if loc is not None else loc diff --git a/tensorflow/contrib/distributions/python/ops/wishart.py b/tensorflow/contrib/distributions/python/ops/wishart.py index a5bb880bed9..8b819053f92 100644 --- a/tensorflow/contrib/distributions/python/ops/wishart.py +++ b/tensorflow/contrib/distributions/python/ops/wishart.py @@ -170,8 +170,7 @@ class _WishartLinearOperator(distribution.Distribution): allow_nan_stats=allow_nan_stats, reparameterization_type=distribution.FULLY_REPARAMETERIZED, parameters=parameters, - graph_parents=([self._df, self._dimension] + - self._scale_operator.graph_parents), + graph_parents=[self._df, self._dimension], name=name) @property @@ -400,10 +399,9 @@ class _WishartLinearOperator(distribution.Distribution): def _mode(self): s = self.df - self.dimension - 1. - s = array_ops.where( + s = array_ops.where_v2( math_ops.less(s, 0.), - constant_op.constant(float("NaN"), dtype=self.dtype, name="nan"), - s) + constant_op.constant(float("NaN"), dtype=self.dtype, name="nan"), s) if self.cholesky_input_output_matrices: return math_ops.sqrt(s) * self.scale_operator.to_dense() return s * self._square_scale_operator() diff --git a/tensorflow/contrib/eager/python/datasets_test.py b/tensorflow/contrib/eager/python/datasets_test.py index 48925b1bfac..0bbece7d6c3 100644 --- a/tensorflow/contrib/eager/python/datasets_test.py +++ b/tensorflow/contrib/eager/python/datasets_test.py @@ -25,9 +25,9 @@ import numpy as np from tensorflow.contrib import lookup from tensorflow.contrib.eager.python import datasets -from tensorflow.python.data import Dataset from tensorflow.python.data.experimental.ops import threadpool from tensorflow.python.data.experimental.ops import unique +from tensorflow.python.data.ops import dataset_ops from tensorflow.python.eager import test from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes @@ -44,24 +44,24 @@ class IteratorTest(test.TestCase): def testBasic(self): got = [] - for t in datasets.Iterator(Dataset.range(4)): + for t in datasets.Iterator(dataset_ops.Dataset.range(4)): got.append(t.numpy()) self.assertAllEqual([0, 1, 2, 3], got) def testBasicOneShotIterator(self): got = [] - for t in Dataset.range(4).make_one_shot_iterator(): + for t in dataset_ops.Dataset.range(4).make_one_shot_iterator(): got.append(t.numpy()) self.assertAllEqual([0, 1, 2, 3], got) def testBasicImplicitIterator(self): got = [] - for t in Dataset.range(4): + for t in dataset_ops.Dataset.range(4): got.append(t.numpy()) self.assertAllEqual([0, 1, 2, 3], got) def testGetNext(self): - iterator = datasets.Iterator(Dataset.range(4)) + iterator = datasets.Iterator(dataset_ops.Dataset.range(4)) self.assertEqual(0, iterator.get_next().numpy()) self.assertEqual(1, iterator.get_next().numpy()) self.assertEqual(2, iterator.get_next().numpy()) @@ -70,7 +70,7 @@ class IteratorTest(test.TestCase): iterator.get_next() def testGetNextOneShotIterator(self): - iterator = Dataset.range(4).make_one_shot_iterator() + iterator = dataset_ops.Dataset.range(4).make_one_shot_iterator() self.assertEqual(0, iterator.get_next().numpy()) self.assertEqual(1, iterator.get_next().numpy()) self.assertEqual(2, iterator.get_next().numpy()) @@ -79,7 +79,7 @@ class IteratorTest(test.TestCase): iterator.get_next() def testMultipleIteratorsOnTheSameDataset(self): - ds = Dataset.range(4) + ds = dataset_ops.Dataset.range(4) it1 = datasets.Iterator(ds) it2 = datasets.Iterator(ds) got = [x.numpy() for x in it1] @@ -89,8 +89,10 @@ class 
IteratorTest(test.TestCase): self.assertAllEqual([0, 1, 2, 3], got) def testNestedOutputs(self): - ds = Dataset.zip((Dataset.range(4), Dataset.zip((Dataset.range(4), - Dataset.range(4))))) + ds = dataset_ops.Dataset.zip( + (dataset_ops.Dataset.range(4), + dataset_ops.Dataset.zip( + (dataset_ops.Dataset.range(4), dataset_ops.Dataset.range(4))))) total = 0 # The Iterator will return a nested structure of Tensor objects. # Some funkiness to compare against simple integers. @@ -102,10 +104,12 @@ class IteratorTest(test.TestCase): self.assertEqual(4, total) def testMapAndFilter(self): + def even(x): return math_ops.equal(math_ops.mod(x, 2), 0) - it = datasets.Iterator(Dataset.range(8).map(math_ops.square).filter(even)) + it = datasets.Iterator( + dataset_ops.Dataset.range(8).map(math_ops.square).filter(even)) got = [x.numpy() for x in it] self.assertAllEqual([0, 4, 16, 36], got) @@ -115,14 +119,16 @@ class IteratorTest(test.TestCase): values = constant_op.constant([0, 1, 2], dtypes.int64) table = lookup.HashTable( lookup.KeyValueTensorInitializer(keys, values), default_val) - dataset = Dataset.from_tensor_slices(['brain', 'salad', 'surgery']) + dataset = dataset_ops.Dataset.from_tensor_slices( + ['brain', 'salad', 'surgery']) dataset = dataset.map(table.lookup) it = datasets.Iterator(dataset) got = [x.numpy() for x in it] self.assertAllEqual([0, 1, 2], got) def testMultipleIteratorsOnADatasetThatUsesFunctions(self): - ds = Dataset.from_tensor_slices([1, 2, 3, 4, 5, 6]).map(math_ops.square) + ds = dataset_ops.Dataset.from_tensor_slices([1, 2, 3, 4, 5, + 6]).map(math_ops.square) got1 = [x.numpy() for x in datasets.Iterator(ds)] self.assertAllEqual([1, 4, 9, 16, 25, 36], got1) @@ -172,7 +178,7 @@ class IteratorTest(test.TestCase): ] for i, result in enumerate( - datasets.Iterator(Dataset.from_tensor_slices(components))): + datasets.Iterator(dataset_ops.Dataset.from_tensor_slices(components))): self.assertSparseValuesEqual(expected[i][0], result[0]) self.assertSparseValuesEqual(expected[i][1], result[1]) @@ -181,20 +187,20 @@ class IteratorTest(test.TestCase): def my_map(inp): return [[x + 1 for x in inp]] - ds = Dataset.range(4).map( + ds = dataset_ops.Dataset.range(4).map( lambda x: script_ops.py_func(my_map, [[x]], dtypes.int64)) got = [x.numpy() for x in datasets.Iterator(ds)] self.assertAllEqual([[1], [2], [3], [4]], got) def testTensorsPlacedOnDevice(self): - ds = Dataset.from_tensors([0., 1.]) + ds = dataset_ops.Dataset.from_tensors([0., 1.]) with ops.device(test.gpu_device_name()): x = datasets.Iterator(ds).next() x = math_ops.add(x, x) self.assertAllEqual([0., 2.], x.numpy()) def testGpuTensor(self): - ds = Dataset.from_tensors([0., 1.]) + ds = dataset_ops.Dataset.from_tensors([0., 1.]) with ops.device(test.gpu_device_name()): for x in ds: y = math_ops.add(x, x) @@ -213,7 +219,7 @@ class IteratorTest(test.TestCase): for num_threads in [1, 2, 4, 8, 16]: dataset = ( - Dataset.range(1000).map( + dataset_ops.Dataset.range(1000).map( lambda x: script_ops.py_func(get_thread_id, [x], dtypes.int64), num_parallel_calls=32).apply(unique.unique())) @@ -235,8 +241,13 @@ class IteratorTest(test.TestCase): def testSaveRestore(self): checkpoint_directory = self.get_temp_dir() checkpoint_prefix = os.path.join(checkpoint_directory, 'ckpt') - dataset = Dataset.from_tensor_slices([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]) + dataset = dataset_ops.Dataset.from_tensor_slices( + [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]) dataset = dataset.map(math_ops.square).batch(2) + # TODO(b/138399725): Re-enable default optimizations. 
+ options = dataset_ops.Options() + options.experimental_optimization.apply_default_optimizations = False + dataset = dataset.with_options(options) iterator = datasets.Iterator(dataset) checkpoint = trackable_utils.Checkpoint(iterator=iterator) self.assertAllEqual([1, 4], iterator.get_next().numpy()) @@ -250,11 +261,16 @@ class IteratorTest(test.TestCase): def testSaveRestoreMultipleIterator(self): checkpoint_directory = self.get_temp_dir() checkpoint_prefix = os.path.join(checkpoint_directory, 'ckpt') - dataset = Dataset.from_tensor_slices([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]) + dataset = dataset_ops.Dataset.from_tensor_slices( + [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]) dataset = dataset.map(math_ops.square).batch(2) + # TODO(b/138399725): Re-enable default optimizations. + options = dataset_ops.Options() + options.experimental_optimization.apply_default_optimizations = False + dataset = dataset.with_options(options) iterator_1 = datasets.Iterator(dataset) iterator_2 = datasets.Iterator(dataset) - dataset_2 = Dataset.range(10) + dataset_2 = dataset_ops.Dataset.range(10) iterator_3 = datasets.Iterator(dataset_2) checkpoint = trackable_utils.Checkpoint( @@ -276,7 +292,7 @@ class IteratorTest(test.TestCase): def testRestoreExhaustedIterator(self): checkpoint_directory = self.get_temp_dir() checkpoint_prefix = os.path.join(checkpoint_directory, 'ckpt') - dataset = Dataset.range(3) + dataset = dataset_ops.Dataset.range(3) iterator = datasets.Iterator(dataset) checkpoint = trackable_utils.Checkpoint(iterator=iterator) @@ -290,12 +306,12 @@ class IteratorTest(test.TestCase): def testRestoreInReconstructedIterator(self): checkpoint_directory = self.get_temp_dir() checkpoint_prefix = os.path.join(checkpoint_directory, 'ckpt') - dataset = Dataset.range(10) + dataset = dataset_ops.Dataset.range(10) for i in range(5): iterator = datasets.Iterator(dataset) checkpoint = trackable_utils.Checkpoint(iterator=iterator) - checkpoint.restore(checkpoint_management.latest_checkpoint( - checkpoint_directory)) + checkpoint.restore( + checkpoint_management.latest_checkpoint(checkpoint_directory)) for j in range(2): self.assertEqual(i * 2 + j, iterator.get_next().numpy()) checkpoint.save(file_prefix=checkpoint_prefix) @@ -311,8 +327,8 @@ class DatasetConstructorBenchmark(test.Benchmark): input_data = np.random.randn(input_size) dataset = ( - Dataset.from_tensor_slices(input_data).repeat(num_epochs) - .batch(batch_size)) + dataset_ops.Dataset.from_tensor_slices(input_data).repeat( + num_epochs).batch(batch_size)) iterator = datasets.Iterator(dataset) ends = [time.time()] @@ -321,10 +337,8 @@ class DatasetConstructorBenchmark(test.Benchmark): deltas = np.ediff1d(ends) median_wall_time = np.median(deltas) - print( - 'Slice/repeat/batch eager input size: %d batch size: %d Median wall ' - 'time per element: %f' - % (input_size, batch_size, median_wall_time)) + print('Slice/repeat/batch eager input size: %d batch size: %d Median wall ' + 'time per element: %f' % (input_size, batch_size, median_wall_time)) self.report_benchmark( iters=len(deltas), wall_time=median_wall_time, @@ -339,8 +353,8 @@ class DatasetConstructorBenchmark(test.Benchmark): input_data = np.random.randn(input_size) dataset = ( - Dataset.from_tensor_slices(input_data).batch(batch_size).cache() - .repeat(num_epochs)) + dataset_ops.Dataset.from_tensor_slices(input_data).batch( + batch_size).cache().repeat(num_epochs)) iterator = datasets.Iterator(dataset) ends = [time.time()] @@ -349,10 +363,9 @@ class DatasetConstructorBenchmark(test.Benchmark): deltas = 
np.ediff1d(ends) median_wall_time = np.median(deltas) - print( - 'Slice/batch/cache/repeat eager input size: %d batch size: %d Median ' - 'wall time per element: %f' - % (input_size, batch_size, median_wall_time)) + print('Slice/batch/cache/repeat eager input size: %d batch size: %d Median ' + 'wall time per element: %f' % + (input_size, batch_size, median_wall_time)) self.report_benchmark( iters=len(deltas), wall_time=median_wall_time, diff --git a/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb b/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb index 512605a17eb..cabc71c98e1 100644 --- a/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb +++ b/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb @@ -117,7 +117,7 @@ "source": [ "# Download the file\n", "path_to_zip = tf.keras.utils.get_file(\n", - " 'spa-eng.zip', origin='http://download.tensorflow.org/data/spa-eng.zip', \n", + " 'spa-eng.zip', origin='https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip', \n", " extract=True)\n", "\n", "path_to_file = os.path.dirname(path_to_zip)+\"/spa-eng/spa.txt\"" diff --git a/tensorflow/contrib/eager/python/examples/revnet/blocks.py b/tensorflow/contrib/eager/python/examples/revnet/blocks.py index f61354bc38a..221b0766225 100644 --- a/tensorflow/contrib/eager/python/examples/revnet/blocks.py +++ b/tensorflow/contrib/eager/python/examples/revnet/blocks.py @@ -61,7 +61,7 @@ class RevBlock(tf.keras.Model): fused: use fused batch normalization if True dtype: float16, float32, or float64 """ - super(RevBlock, self).__init__() + super(RevBlock, self).__init__(dtype=dtype) self.blocks = tf.contrib.checkpoint.List() for i in range(n_res): curr_batch_norm_first = batch_norm_first and i == 0 @@ -135,7 +135,7 @@ class _Residual(tf.keras.Model): fused: use fused batch normalization if True dtype: float16, float32, or float64 """ - super(_Residual, self).__init__() + super(_Residual, self).__init__(dtype=dtype) self.filters = filters self.strides = strides @@ -283,7 +283,7 @@ class _BottleneckResidualInner(tf.keras.Model): fused: use fused batch normalization if True dtype: float16, float32, or float64 """ - super(_BottleneckResidualInner, self).__init__() + super(_BottleneckResidualInner, self).__init__(dtype=dtype) axis = 1 if data_format == "channels_first" else 3 if batch_norm_first: self.batch_norm_0 = tf.keras.layers.BatchNormalization( @@ -365,7 +365,7 @@ class _ResidualInner(tf.keras.Model): fused: use fused batch normalization if True dtype: float16, float32, or float64 """ - super(_ResidualInner, self).__init__() + super(_ResidualInner, self).__init__(dtype=dtype) axis = 1 if data_format == "channels_first" else 3 if batch_norm_first: self.batch_norm_0 = tf.keras.layers.BatchNormalization( @@ -416,7 +416,7 @@ class InitBlock(tf.keras.Model): Args: config: tf.contrib.training.HParams object; specifies hyperparameters """ - super(InitBlock, self).__init__() + super(InitBlock, self).__init__(config.dtype) self.config = config self.axis = 1 if self.config.data_format == "channels_first" else 3 self.conv2d = tf.keras.layers.Conv2D( @@ -430,7 +430,8 @@ class InitBlock(tf.keras.Model): dtype=self.config.dtype) self.batch_norm = tf.keras.layers.BatchNormalization( axis=self.axis, fused=self.config.fused, dtype=self.config.dtype) - self.activation = tf.keras.layers.Activation("relu") + self.activation = tf.keras.layers.Activation("relu", + 
dtype=self.config.dtype) if self.config.init_max_pool: self.max_pool = tf.keras.layers.MaxPooling2D( @@ -464,7 +465,7 @@ class FinalBlock(tf.keras.Model): Raises: ValueError: Unsupported data format """ - super(FinalBlock, self).__init__() + super(FinalBlock, self).__init__(dtype=config.dtype) self.config = config self.axis = 1 if self.config.data_format == "channels_first" else 3 @@ -488,7 +489,8 @@ class FinalBlock(tf.keras.Model): input_shape=input_shape, fused=self.config.fused, dtype=self.config.dtype) - self.activation = tf.keras.layers.Activation("relu") + self.activation = tf.keras.layers.Activation("relu", + dtype=self.config.dtype) self.global_avg_pool = tf.keras.layers.GlobalAveragePooling2D( data_format=self.config.data_format, dtype=self.config.dtype) self.dense = tf.keras.layers.Dense( diff --git a/tensorflow/contrib/eager/python/examples/revnet/revnet.py b/tensorflow/contrib/eager/python/examples/revnet/revnet.py index 7406787ba43..08f2d8d6f17 100644 --- a/tensorflow/contrib/eager/python/examples/revnet/revnet.py +++ b/tensorflow/contrib/eager/python/examples/revnet/revnet.py @@ -37,7 +37,7 @@ class RevNet(tf.keras.Model): Args: config: tf.contrib.training.HParams object; specifies hyperparameters """ - super(RevNet, self).__init__() + super(RevNet, self).__init__(dtype=config.dtype) self.axis = 1 if config.data_format == "channels_first" else 3 self.config = config diff --git a/tensorflow/contrib/factorization/python/ops/factorization_ops.py b/tensorflow/contrib/factorization/python/ops/factorization_ops.py index 5c55f7f597b..e04de0579b1 100644 --- a/tensorflow/contrib/factorization/python/ops/factorization_ops.py +++ b/tensorflow/contrib/factorization/python/ops/factorization_ops.py @@ -18,7 +18,6 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import collections import numbers from six.moves import xrange # pylint: disable=redefined-builtin @@ -42,6 +41,7 @@ from tensorflow.python.ops import state_ops from tensorflow.python.ops import variable_scope from tensorflow.python.ops import variables from tensorflow.python.platform import resource_loader +from tensorflow.python.util.compat import collections_abc _factorization_ops = loader.load_op_library( resource_loader.get_path_to_datafile("_factorization_ops.so")) @@ -388,7 +388,7 @@ class WALSModel(object): return None init_mode = "list" - if isinstance(wt_init, collections.Iterable): + if isinstance(wt_init, collections_abc.Iterable): if num_shards == 1 and len(wt_init) == num_wts: wt_init = [wt_init] assert len(wt_init) == num_shards @@ -641,9 +641,9 @@ class WALSModel(object): extras = size % num_shards assignments = math_ops.maximum(ids // (ids_per_shard + 1), (ids - extras) // ids_per_shard) - new_ids = array_ops.where(assignments < extras, - ids % (ids_per_shard + 1), - (ids - extras) % ids_per_shard) + new_ids = array_ops.where_v2(assignments < extras, + ids % (ids_per_shard + 1), + (ids - extras) % ids_per_shard) return assignments, new_ids return func diff --git a/tensorflow/contrib/ffmpeg/decode_audio_op.cc b/tensorflow/contrib/ffmpeg/decode_audio_op.cc index ca65ad45326..32e62a6725f 100644 --- a/tensorflow/contrib/ffmpeg/decode_audio_op.cc +++ b/tensorflow/contrib/ffmpeg/decode_audio_op.cc @@ -135,9 +135,10 @@ class DecodeAudioOpV2 : public OpKernel { "channel_count must be a rank-0 tensor but got shape ", channel_count_tensor.shape().DebugString())); - const tensorflow::StringPiece contents = contents_tensor.scalar()(); + const 
tensorflow::StringPiece contents = + contents_tensor.scalar()(); const string file_format = - absl::AsciiStrToLower(file_format_tensor.scalar()()); + absl::AsciiStrToLower(file_format_tensor.scalar()()); const int32 samples_per_second = samples_per_second_tensor.scalar()(); const int32 channel_count = channel_count_tensor.scalar()(); @@ -243,7 +244,7 @@ class DecodeAudioOp : public OpKernel { errors::InvalidArgument("contents must be scalar but got shape ", contents.shape().DebugString())); - const tensorflow::StringPiece file_contents = contents.scalar()(); + const tensorflow::StringPiece file_contents = contents.scalar()(); Decode(context, file_contents, file_format_, samples_per_second_, channel_count_, ""); } diff --git a/tensorflow/contrib/ffmpeg/decode_video_op.cc b/tensorflow/contrib/ffmpeg/decode_video_op.cc index 6f8ad486d10..0bfdc2781aa 100644 --- a/tensorflow/contrib/ffmpeg/decode_video_op.cc +++ b/tensorflow/contrib/ffmpeg/decode_video_op.cc @@ -45,7 +45,8 @@ class DecodeVideoOp : public OpKernel { errors::InvalidArgument( "contents must be a rank-0 tensor but got shape ", contents_tensor.shape().DebugString())); - const tensorflow::StringPiece contents = contents_tensor.scalar()(); + const tensorflow::StringPiece contents = + contents_tensor.scalar()(); // Write the input data to a temp file. string extension; diff --git a/tensorflow/contrib/ffmpeg/encode_audio_op.cc b/tensorflow/contrib/ffmpeg/encode_audio_op.cc index 7de09e062ec..ee418fb9020 100644 --- a/tensorflow/contrib/ffmpeg/encode_audio_op.cc +++ b/tensorflow/contrib/ffmpeg/encode_audio_op.cc @@ -45,7 +45,7 @@ void Encode(OpKernelContext* context, const Tensor& contents, // Copy the encoded audio file to the output tensor. Tensor* output = nullptr; OP_REQUIRES_OK(context, context->allocate_output(0, TensorShape(), &output)); - output->scalar()() = encoded_audio; + output->scalar()() = encoded_audio; } } // namespace @@ -95,7 +95,7 @@ class EncodeAudioOpV2 : public OpKernel { bits_per_second_tensor.shape().DebugString())); const string file_format = - absl::AsciiStrToLower(file_format_tensor.scalar()()); + absl::AsciiStrToLower(file_format_tensor.scalar()()); const int32 samples_per_second = samples_per_second_tensor.scalar()(); const int32 bits_per_second = bits_per_second_tensor.scalar()(); diff --git a/tensorflow/contrib/framework/python/framework/checkpoint_utils.py b/tensorflow/contrib/framework/python/framework/checkpoint_utils.py index 6dd887edf59..811df7a55ae 100644 --- a/tensorflow/contrib/framework/python/framework/checkpoint_utils.py +++ b/tensorflow/contrib/framework/python/framework/checkpoint_utils.py @@ -21,6 +21,7 @@ from __future__ import print_function import six +from tensorflow.python.framework import ops from tensorflow.python.ops import io_ops from tensorflow.python.ops import state_ops from tensorflow.python.ops import variable_scope as vs @@ -116,9 +117,10 @@ def _set_checkpoint_initializer(variable, file_pattern, tensor_name, slice_spec, name: Name of the operation. 
""" base_type = variable.dtype.base_dtype - restore_op = io_ops.restore_v2( - file_pattern, [tensor_name], [slice_spec], [base_type], name=name)[0] - variable._initializer_op = state_ops.assign(variable, restore_op) + with ops.device(variable.device), ops.device("/cpu:0"): + restore_op = io_ops.restore_v2( + file_pattern, [tensor_name], [slice_spec], [base_type], name=name)[0] + variable._initializer_op = state_ops.assign(variable, restore_op) def _set_variable_or_list_initializer(variable_or_list, file_pattern, diff --git a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc index c26fdb1f0a2..8ef11109da9 100644 --- a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc +++ b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc @@ -229,13 +229,17 @@ class LaunchFusedConv2DBiasActivationOp { // (1) Scale and add bias. // NOTE(ezhulenev): We do not use Eigen expressions for this loop, // because it seems that packet FMA produces slightly different results, - // and we are targeting bit-by-bit equality with Nvidia implementation. + // and we are targeting close equality with Nvidia implementation. + // We could use std::fmaf, but it can be ~50x slower, on machines + // without fma instruction. for (int idx = 0; idx < num_rows; ++idx) { - conv_output_ptr[idx] = - std::fmaf(conv_output_ptr[idx], conv_input_scale, bias_ptr[idx]); + conv_output_ptr[idx] = static_cast(conv_output_ptr[idx]) * + static_cast(conv_input_scale) + + static_cast(bias_ptr[idx]); if (side_input_scale != 0.0f) { - conv_output_ptr[idx] = std::fmaf( - side_input_ptr[idx], side_input_scale, conv_output_ptr[idx]); + conv_output_ptr[idx] = static_cast(side_input_ptr[idx]) * + static_cast(side_input_scale) + + static_cast(conv_output_ptr[idx]); } } @@ -561,6 +565,14 @@ void LogFusedConvForwardAutotuneResults( *log.mutable_cudnn_version() = GetCudnnVersion(stream_exec); *log.mutable_compute_capability() = GetComputeCapability(stream_exec); log.set_device_pci_bus_id(stream_exec->GetDeviceDescription().pci_bus_id()); + { + string blas_version; + if (auto* blas = stream_exec->AsBlas()) { + if (blas->GetVersion(&blas_version).ok()) { + log.set_blas_version(blas_version); + } + } + } for (const auto& result : results) { *log.add_results() = result; } diff --git a/tensorflow/contrib/gan/BUILD b/tensorflow/contrib/gan/BUILD deleted file mode 100644 index ddd04947e9b..00000000000 --- a/tensorflow/contrib/gan/BUILD +++ /dev/null @@ -1,778 +0,0 @@ -# Files for using TF-GAN framework. 
- -load("//tensorflow:tensorflow.bzl", "py_test") - -package( - default_visibility = [ - "//tensorflow:__subpackages__", - ], - licenses = ["notice"], # Apache 2.0 -) - -exports_files(["LICENSE"]) - -py_library( - name = "gan", - srcs = [ - "__init__.py", - ], - srcs_version = "PY2AND3", - deps = [ - ":estimator", - ":eval", - ":features", - ":losses", - ":namedtuples", - ":train", - "//tensorflow/python:util", - ], -) - -py_library( - name = "namedtuples", - srcs = ["python/namedtuples.py"], - srcs_version = "PY2AND3", -) - -py_library( - name = "train", - srcs = ["python/train.py"], - srcs_version = "PY2AND3", - deps = [ - ":losses", - ":namedtuples", - "//tensorflow/contrib/framework:framework_py", - "//tensorflow/contrib/slim:learning", - "//tensorflow/contrib/training:training_py", - "//tensorflow/python:array_ops", - "//tensorflow/python:check_ops", - "//tensorflow/python:dtypes", - "//tensorflow/python:framework_ops", - "//tensorflow/python:init_ops", - "//tensorflow/python:random_ops", - "//tensorflow/python:training", - "//tensorflow/python:training_util", - "//tensorflow/python:variable_scope", - "//tensorflow/python/ops/losses", - ], -) - -py_test( - name = "train_test", - srcs = ["python/train_test.py"], - python_version = "PY2", - shard_count = 50, - srcs_version = "PY2AND3", - tags = ["notsan"], - deps = [ - ":namedtuples", - ":random_tensor_pool", - ":train", - "//tensorflow/contrib/framework:framework_py", - "//tensorflow/contrib/layers:layers_py", - "//tensorflow/contrib/slim:learning", - "//tensorflow/python:array_ops", - "//tensorflow/python:client_testlib", - "//tensorflow/python:constant_op", - "//tensorflow/python:dtypes", - "//tensorflow/python:framework_ops", - "//tensorflow/python:math_ops", - "//tensorflow/python:random_ops", - "//tensorflow/python:random_seed", - "//tensorflow/python:training", - "//tensorflow/python:training_util", - "//tensorflow/python:variable_scope", - "//tensorflow/python:variables", - "//tensorflow/python/ops/distributions", - "//third_party/py/numpy", - "@absl_py//absl/testing:parameterized", - ], -) - -py_library( - name = "eval", - srcs = ["python/eval/__init__.py"], - srcs_version = "PY2AND3", - deps = [ - ":classifier_metrics", - ":eval_utils", - ":sliced_wasserstein", - ":summaries", - "//tensorflow/python:util", - ], -) - -py_library( - name = "estimator", - srcs = ["python/estimator/__init__.py"], - srcs_version = "PY2AND3", - deps = [ - ":gan_estimator", - ":head", - ":latent_gan_estimator", - ":stargan_estimator", - ":tpu_gan_estimator", - "//tensorflow/python:util", - ], -) - -py_library( - name = "losses", - srcs = ["python/losses/__init__.py"], - srcs_version = "PY2AND3", - deps = [ - ":losses_impl", - ":tuple_losses", - "//tensorflow/python:util", - ], -) - -py_library( - name = "features", - srcs = ["python/features/__init__.py"], - srcs_version = "PY2AND3", - deps = [ - ":clip_weights", - ":conditioning_utils", - ":random_tensor_pool", - ":spectral_normalization", - ":virtual_batchnorm", - "//tensorflow/python:util", - ], -) - -py_library( - name = "losses_impl", - srcs = ["python/losses/python/losses_impl.py"], - srcs_version = "PY2AND3", - deps = [ - "//tensorflow/contrib/framework:framework_py", - "//tensorflow/python:array_ops", - "//tensorflow/python:clip_ops", - "//tensorflow/python:dtypes", - "//tensorflow/python:framework_ops", - "//tensorflow/python:gradients_impl", - "//tensorflow/python:math_ops", - "//tensorflow/python:random_ops", - "//tensorflow/python:summary", - "//tensorflow/python:tensor_util", - 
"//tensorflow/python:variable_scope", - "//tensorflow/python/ops/losses", - ], -) - -py_test( - name = "losses_impl_test", - srcs = ["python/losses/python/losses_impl_test.py"], - python_version = "PY2", - srcs_version = "PY2AND3", - deps = [ - ":losses_impl", - "//tensorflow/python:array_ops", - "//tensorflow/python:client_testlib", - "//tensorflow/python:clip_ops", - "//tensorflow/python:constant_op", - "//tensorflow/python:dtypes", - "//tensorflow/python:framework_ops", - "//tensorflow/python:math_ops", - "//tensorflow/python:random_ops", - "//tensorflow/python:random_seed", - "//tensorflow/python:variable_scope", - "//tensorflow/python:variables", - "//tensorflow/python/ops/distributions", - "//tensorflow/python/ops/losses", - ], -) - -py_library( - name = "tuple_losses", - srcs = [ - "python/losses/python/losses_wargs.py", - "python/losses/python/tuple_losses.py", - "python/losses/python/tuple_losses_impl.py", - ], - srcs_version = "PY2AND3", - deps = [ - ":losses_impl", - ":namedtuples", - "//tensorflow/python:util", - ], -) - -py_test( - name = "tuple_losses_test", - srcs = ["python/losses/python/tuple_losses_test.py"], - python_version = "PY2", - srcs_version = "PY2AND3", - deps = [ - ":losses_impl", - ":namedtuples", - ":tuple_losses", - "//tensorflow/contrib/layers:layers_py", - "//tensorflow/python:array_ops", - "//tensorflow/python:client_testlib", - "//tensorflow/python:constant_op", - "//tensorflow/python:dtypes", - "//tensorflow/python:math_ops", - "//tensorflow/python:variable_scope", - "//tensorflow/python:variables", - "//third_party/py/numpy", - ], -) - -py_library( - name = "conditioning_utils", - srcs = [ - "python/features/python/conditioning_utils.py", - "python/features/python/conditioning_utils_impl.py", - ], - srcs_version = "PY2AND3", - deps = [ - "//tensorflow/contrib/layers:layers_py", - "//tensorflow/python:array_ops", - "//tensorflow/python:embedding_ops", - "//tensorflow/python:math_ops", - "//tensorflow/python:tensor_util", - "//tensorflow/python:util", - "//tensorflow/python:variable_scope", - ], -) - -py_test( - name = "conditioning_utils_test", - srcs = ["python/features/python/conditioning_utils_test.py"], - python_version = "PY2", - srcs_version = "PY2AND3", - deps = [ - ":conditioning_utils", - "//tensorflow/python:array_ops", - "//tensorflow/python:client_testlib", - "//tensorflow/python:dtypes", - ], -) - -py_library( - name = "random_tensor_pool", - srcs = [ - "python/features/python/random_tensor_pool.py", - "python/features/python/random_tensor_pool_impl.py", - ], - srcs_version = "PY2AND3", - deps = [ - "//tensorflow/python:array_ops", - "//tensorflow/python:control_flow_ops", - "//tensorflow/python:data_flow_ops", - "//tensorflow/python:dtypes", - "//tensorflow/python:framework_ops", - "//tensorflow/python:random_ops", - "//tensorflow/python:util", - ], -) - -py_test( - name = "random_tensor_pool_test", - srcs = ["python/features/python/random_tensor_pool_test.py"], - python_version = "PY2", - shard_count = 6, - srcs_version = "PY2AND3", - deps = [ - ":random_tensor_pool", - "//tensorflow/python:array_ops", - "//tensorflow/python:client_testlib", - "//tensorflow/python:constant_op", - "//tensorflow/python:dtypes", - "//tensorflow/python:framework_ops", - "//third_party/py/numpy", - ], -) - -py_library( - name = "virtual_batchnorm", - srcs = [ - "python/features/python/virtual_batchnorm.py", - "python/features/python/virtual_batchnorm_impl.py", - ], - srcs_version = "PY2AND3", - deps = [ - "//tensorflow/python:array_ops", - 
"//tensorflow/python:dtypes", - "//tensorflow/python:framework_ops", - "//tensorflow/python:init_ops", - "//tensorflow/python:math_ops", - "//tensorflow/python:nn", - "//tensorflow/python:tensor_shape", - "//tensorflow/python:tensor_util", - "//tensorflow/python:util", - "//tensorflow/python:variable_scope", - ], -) - -py_test( - name = "virtual_batchnorm_test", - srcs = ["python/features/python/virtual_batchnorm_test.py"], - python_version = "PY2", - srcs_version = "PY2AND3", - deps = [ - ":virtual_batchnorm", - "//tensorflow/contrib/framework:framework_py", - "//tensorflow/python:array_ops", - "//tensorflow/python:client_testlib", - "//tensorflow/python:constant_op", - "//tensorflow/python:dtypes", - "//tensorflow/python:layers", - "//tensorflow/python:math_ops", - "//tensorflow/python:nn", - "//tensorflow/python:random_ops", - "//tensorflow/python:random_seed", - "//tensorflow/python:variable_scope", - "//tensorflow/python:variables", - "//third_party/py/numpy", - ], -) - -py_library( - name = "clip_weights", - srcs = [ - "python/features/python/clip_weights.py", - "python/features/python/clip_weights_impl.py", - ], - srcs_version = "PY2AND3", - deps = [ - "//tensorflow/contrib/opt:opt_py", - "//tensorflow/python:util", - ], -) - -py_test( - name = "clip_weights_test", - srcs = ["python/features/python/clip_weights_test.py"], - python_version = "PY2", - srcs_version = "PY2AND3", - deps = [ - ":clip_weights", - "//tensorflow/python:client_testlib", - "//tensorflow/python:training", - "//tensorflow/python:variables", - ], -) - -py_library( - name = "classifier_metrics", - srcs = [ - "python/eval/python/classifier_metrics.py", - "python/eval/python/classifier_metrics_impl.py", - ], - srcs_version = "PY2AND3", - deps = [ - "//tensorflow/contrib/layers:layers_py", - "//tensorflow/core:protos_all_py", - "//tensorflow/python:array_ops", - "//tensorflow/python:dtypes", - "//tensorflow/python:framework", - "//tensorflow/python:framework_ops", - "//tensorflow/python:functional_ops", - "//tensorflow/python:image_ops", - "//tensorflow/python:linalg_ops", - "//tensorflow/python:math_ops", - "//tensorflow/python:nn", - "//tensorflow/python:nn_ops", - "//tensorflow/python:platform", - "//tensorflow/python:util", - "@six_archive//:six", - ], -) - -py_test( - name = "classifier_metrics_test", - srcs = ["python/eval/python/classifier_metrics_test.py"], - python_version = "PY2", - srcs_version = "PY2AND3", - tags = [ - "no_pip", - "no_windows", - ], - deps = [ - ":classifier_metrics", - "//tensorflow/core:protos_all_py", - "//tensorflow/python:array_ops", - "//tensorflow/python:client_testlib", - "//tensorflow/python:dtypes", - "//tensorflow/python:framework_ops", - "//tensorflow/python:variables", - "//third_party/py/numpy", - "@absl_py//absl/testing:parameterized", - ], -) - -py_library( - name = "eval_utils", - srcs = [ - "python/eval/python/eval_utils.py", - "python/eval/python/eval_utils_impl.py", - ], - srcs_version = "PY2AND3", - deps = [ - "//tensorflow/python:array_ops", - "//tensorflow/python:framework_ops", - "//tensorflow/python:util", - ], -) - -py_test( - name = "eval_utils_test", - srcs = ["python/eval/python/eval_utils_test.py"], - python_version = "PY2", - srcs_version = "PY2AND3", - deps = [ - ":eval_utils", - "//tensorflow/python:array_ops", - "//tensorflow/python:client_testlib", - ], -) - -py_library( - name = "summaries", - srcs = [ - "python/eval/python/summaries.py", - "python/eval/python/summaries_impl.py", - ], - srcs_version = "PY2AND3", - deps = [ - ":eval_utils", - 
":namedtuples", - "//tensorflow/python:array_ops", - "//tensorflow/python:framework_ops", - "//tensorflow/python:functional_ops", - "//tensorflow/python:math_ops", - "//tensorflow/python:summary", - "//tensorflow/python:util", - "//tensorflow/python:variable_scope", - "//tensorflow/python/ops/losses", - ], -) - -py_test( - name = "summaries_test", - srcs = ["python/eval/python/summaries_test.py"], - python_version = "PY2", - srcs_version = "PY2AND3", - deps = [ - ":namedtuples", - ":summaries", - "//tensorflow/python:array_ops", - "//tensorflow/python:client_testlib", - "//tensorflow/python:framework_ops", - "//tensorflow/python:summary", - "//tensorflow/python:variable_scope", - "//tensorflow/python:variables", - ], -) - -py_library( - name = "head", - srcs = [ - "python/estimator/python/head.py", - "python/estimator/python/head_impl.py", - ], - srcs_version = "PY2AND3", - deps = [ - ":namedtuples", - ":train", - "//tensorflow/python:framework_ops", - "//tensorflow/python:util", - "//tensorflow/python/estimator:estimator_py", - ], -) - -py_test( - name = "head_test", - srcs = ["python/estimator/python/head_test.py"], - python_version = "PY2", - shard_count = 1, - srcs_version = "PY2AND3", - deps = [ - ":head", - ":namedtuples", - "//tensorflow/python:array_ops", - "//tensorflow/python:client_testlib", - "//tensorflow/python:math_ops", - "//tensorflow/python:training", - "//tensorflow/python:variable_scope", - "//tensorflow/python/estimator:estimator_py", - ], -) - -py_library( - name = "gan_estimator", - srcs = [ - "python/estimator/python/gan_estimator.py", - "python/estimator/python/gan_estimator_impl.py", - ], - srcs_version = "PY2AND3", - deps = [ - ":namedtuples", - ":summaries", - ":train", - "//tensorflow/contrib/framework:framework_py", - "//tensorflow/python:framework_ops", - "//tensorflow/python:metrics", - "//tensorflow/python:util", - "//tensorflow/python:variable_scope", - "//tensorflow/python/estimator:estimator_py", - ], -) - -py_test( - name = "gan_estimator_test", - srcs = ["python/estimator/python/gan_estimator_test.py"], - python_version = "PY2", - shard_count = 1, - srcs_version = "PY2AND3", - tags = ["notsan"], - deps = [ - ":gan_estimator", - ":namedtuples", - ":tuple_losses", - "//tensorflow/contrib/layers:layers_py", - "//tensorflow/contrib/learn", - "//tensorflow/core:protos_all_py", - "//tensorflow/python:array_ops", - "//tensorflow/python:client_testlib", - "//tensorflow/python:dtypes", - "//tensorflow/python:errors", - "//tensorflow/python:framework_ops", - "//tensorflow/python:math_ops", - "//tensorflow/python:metrics", - "//tensorflow/python:parsing_ops", - "//tensorflow/python:summary", - "//tensorflow/python:tensor_shape", - "//tensorflow/python:training", - "//tensorflow/python:training_util", - "//tensorflow/python:variable_scope", - "//tensorflow/python/estimator", - "//tensorflow/python/estimator:model_fn", - "//tensorflow/python/estimator:numpy_io", - "//third_party/py/numpy", - "@absl_py//absl/testing:parameterized", - "@six_archive//:six", - ], -) - -py_library( - name = "stargan_estimator", - srcs = [ - "python/estimator/python/stargan_estimator.py", - "python/estimator/python/stargan_estimator_impl.py", - ], - srcs_version = "PY2AND3", - deps = [ - ":namedtuples", - ":summaries", - ":train", - "//tensorflow/contrib/framework:framework_py", - "//tensorflow/python:framework_ops", - "//tensorflow/python:metrics", - "//tensorflow/python:util", - "//tensorflow/python:variable_scope", - "//tensorflow/python/estimator:estimator_py", - ], -) - -py_test( - 
name = "stargan_estimator_test", - srcs = ["python/estimator/python/stargan_estimator_test.py"], - python_version = "PY2", - shard_count = 1, - srcs_version = "PY2AND3", - tags = ["notsan"], - deps = [ - ":namedtuples", - ":stargan_estimator", - "//tensorflow/contrib/layers:layers_py", - "//tensorflow/python:array_ops", - "//tensorflow/python:client_testlib", - "//tensorflow/python:framework_ops", - "//tensorflow/python:math_ops", - "//tensorflow/python:metrics", - "//tensorflow/python:summary", - "//tensorflow/python:training", - "//tensorflow/python:training_util", - "//tensorflow/python:variable_scope", - "//tensorflow/python/estimator:model_fn", - "//tensorflow/python/estimator:numpy_io", - "//third_party/py/numpy", - "@absl_py//absl/testing:parameterized", - "@six_archive//:six", - ], -) - -py_library( - name = "tpu_gan_estimator", - srcs = [ - "python/estimator/python/tpu_gan_estimator.py", - "python/estimator/python/tpu_gan_estimator_impl.py", - ], - srcs_version = "PY2AND3", - deps = [ - ":gan_estimator", - ":namedtuples", - ":train", - "//tensorflow/contrib/tpu:tpu_estimator", - "//tensorflow/contrib/tpu:tpu_lib", - "//tensorflow/contrib/training:training_py", - "//tensorflow/python:control_flow_ops", - "//tensorflow/python:framework_ops", - "//tensorflow/python:metrics", - "//tensorflow/python:util", - "//tensorflow/python/estimator:model_fn", - "//tensorflow/python/ops/losses", - ], -) - -py_test( - name = "tpu_gan_estimator_test", - srcs = ["python/estimator/python/tpu_gan_estimator_test.py"], - python_version = "PY2", - shard_count = 11, - srcs_version = "PY2AND3", - tags = ["notsan"], - deps = [ - ":namedtuples", - ":tpu_gan_estimator", - ":tuple_losses", - "//tensorflow/contrib/layers:layers_py", - "//tensorflow/contrib/tpu:tpu_estimator", - "//tensorflow/contrib/tpu:tpu_lib", - "//tensorflow/python:array_ops", - "//tensorflow/python:client_testlib", - "//tensorflow/python:errors", - "//tensorflow/python:framework_ops", - "//tensorflow/python:metrics", - "//tensorflow/python:summary", - "//tensorflow/python:tensor_shape", - "//tensorflow/python:training", - "//tensorflow/python:training_util", - "//tensorflow/python:variable_scope", - "//tensorflow/python/data/ops:dataset_ops", - "//tensorflow/python/estimator", - "//tensorflow/python/estimator:model_fn", - "//third_party/py/numpy", - "@absl_py//absl/testing:parameterized", - "@six_archive//:six", - ], -) - -py_library( - name = "latent_gan_estimator", - srcs = [ - "python/estimator/python/latent_gan_estimator.py", - "python/estimator/python/latent_gan_estimator_impl.py", - ], - srcs_version = "PY2AND3", - deps = [ - ":train", - "//tensorflow/python:clip_ops", - "//tensorflow/python:gradients", - "//tensorflow/python:random_ops", - "//tensorflow/python:summary", - "//tensorflow/python:training_util", - "//tensorflow/python:variable_scope", - "//tensorflow/python/estimator:estimator_py", - ], -) - -py_test( - name = "latent_gan_estimator_test", - srcs = [ - "python/estimator/python/latent_gan_estimator_test.py", - ], - python_version = "PY2", - srcs_version = "PY2AND3", - deps = [ - ":latent_gan_estimator", - "//tensorflow/python:array_ops", - "//tensorflow/python:training", - "//tensorflow/python:variable_scope", - "//tensorflow/python/estimator:run_config", - "//tensorflow/python/ops/losses", - ], -) - -py_library( - name = "sliced_wasserstein", - srcs = [ - "python/eval/python/sliced_wasserstein.py", - "python/eval/python/sliced_wasserstein_impl.py", - ], - srcs_version = "PY2AND3", - deps = [ - 
"//tensorflow/python:array_ops", - "//tensorflow/python:constant_op", - "//tensorflow/python:linalg_ops", - "//tensorflow/python:math_ops", - "//tensorflow/python:nn", - "//tensorflow/python:nn_ops", - "//tensorflow/python:random_ops", - "//tensorflow/python:script_ops", - "//tensorflow/python:util", - "//third_party/py/numpy", - ], -) - -py_test( - name = "sliced_wasserstein_test", - srcs = ["python/eval/python/sliced_wasserstein_test.py"], - python_version = "PY2", - srcs_version = "PY2AND3", - deps = [ - ":sliced_wasserstein", - "//tensorflow/python:array_ops", - "//tensorflow/python:client_testlib", - "//tensorflow/python:dtypes", - "//tensorflow/python:random_ops", - "//third_party/py/numpy", - ], -) - -py_library( - name = "spectral_normalization", - srcs = [ - "python/features/python/spectral_normalization.py", - "python/features/python/spectral_normalization_impl.py", - ], - srcs_version = "PY2AND3", - deps = [ - "//tensorflow/python:array_ops", - "//tensorflow/python:dtypes", - "//tensorflow/python:framework_ops", - "//tensorflow/python:init_ops", - "//tensorflow/python:math_ops", - "//tensorflow/python:standard_ops", - "//tensorflow/python:variable_scope", - "//tensorflow/python/keras:engine", - ], -) - -py_test( - name = "spectral_normalization_test", - srcs = ["python/features/python/spectral_normalization_test.py"], - python_version = "PY2", - srcs_version = "PY2AND3", - deps = [ - ":spectral_normalization", - "//tensorflow/contrib/layers:layers_py", - "//tensorflow/contrib/slim", - "//tensorflow/python:array_ops", - "//tensorflow/python:client_testlib", - "//tensorflow/python:dtypes", - "//tensorflow/python:framework_ops", - "//tensorflow/python:init_ops", - "//tensorflow/python:layers", - "//tensorflow/python:linalg_ops", - "//tensorflow/python:math_ops", - "//tensorflow/python:variable_scope", - "//tensorflow/python:variables", - "//tensorflow/python/keras:layers", - "//third_party/py/numpy", - ], -) diff --git a/tensorflow/contrib/gan/README.md b/tensorflow/contrib/gan/README.md deleted file mode 100644 index 3c1d814e70f..00000000000 --- a/tensorflow/contrib/gan/README.md +++ /dev/null @@ -1,281 +0,0 @@ - - -# TensorFlow-GAN (TF-GAN) - -TF-GAN is a lightweight library for training and evaluating Generative -Adversarial Networks (GANs). This technique allows you to train a network -(called the 'generator') to sample from a distribution, without having to -explicitly model the distribution and without writing an explicit loss. For -example, the generator could learn to draw samples from the distribution of -natural images. For more details on this technique, see -['Generative Adversarial Networks'](https://arxiv.org/abs/1406.2661) by -Goodfellow et al. See -[tensorflow/models](https://github.com/tensorflow/models/tree/master/research/gan/) -for examples, and [this tutorial](https://github.com/tensorflow/models/tree/master/research/gan/tutorial.ipynb) for an introduction. - -#### Usage -```python -import tensorflow as tf -tfgan = tf.contrib.gan -``` - -## Why TF-GAN? - -* Easily train generator and discriminator networks with well-tested, flexible [library calls](https://www.tensorflow.org/code/tensorflow/contrib/gan/python/train.py). 
You can -mix TF-GAN, native TF, and other custom frameworks -* Use already implemented [GAN losses and penalties](https://www.tensorflow.org/code/tensorflow/contrib/gan/python/losses/python/losses_impl.py) (ex Wasserstein loss, gradient penalty, mutual information penalty, etc) -* [Monitor and visualize](https://www.tensorflow.org/code/tensorflow/contrib/gan/python/eval/python/summaries_impl.py) GAN progress during training, and [evaluate](https://www.tensorflow.org/code/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py) them -* Use already-implemented [tricks](https://www.tensorflow.org/code/tensorflow/contrib/gan/python/features/python/) to stabilize and improve training -* Develop based on examples of [common GAN setups](https://github.com/tensorflow/models/tree/master/research/gan/) -* Use the TF-GAN-backed [GANEstimator](https://www.tensorflow.org/code/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py) to easily train a GAN model -* Improvements in TF-GAN infrastructure will automatically benefit your TF-GAN project -* Stay up-to-date with research as we add more algorithms - -## What are the TF-GAN components? - -TF-GAN is composed of several parts which were design to exist independently. -These include the following main pieces (explained in detail below). - -* [core](https://www.tensorflow.org/code/tensorflow/contrib/gan/python/train.py): - provides the main infrastructure needed to train a GAN. Training occurs in - four phases, and each phase can be completed by custom-code or by using a - TF-GAN library call. - -* [features](https://www.tensorflow.org/code/tensorflow/contrib/gan/python/features/python/): - Many common GAN operations and normalization techniques are implemented for - you to use, such as instance normalization and conditioning. - -* [losses](https://www.tensorflow.org/code/tensorflow/contrib/gan/python/losses/python/): - Easily experiment with already-implemented and well-tested losses and - penalties, such as the Wasserstein loss, gradient penalty, mutual - information penalty, etc - -* [evaluation](https://www.tensorflow.org/code/tensorflow/contrib/gan/python/eval/python/): - Use `Inception Score`, `Frechet Distance`, or `Kernel Distance` with a - pretrained Inception network to evaluate your unconditional generative - model. You can also use your own pretrained classifier for more specific - performance numbers, or use other methods for evaluating conditional - generative models. - -* [examples](https://github.com/tensorflow/models/tree/master/research/gan/) - and [tutorial](https://github.com/tensorflow/models/tree/master/research/gan/tutorial.ipynb): See examples of how to use TF-GAN to make - GAN training easier, or use the more complicated examples to jump-start your - own project. These include unconditional and conditional GANs, InfoGANs, - adversarial losses on existing networks, and image-to-image translation. - -## Training a GAN model - -Training in TF-GAN typically consists of the following steps: - -1. Specify the input to your networks. -1. Set up your generator and discriminator using a `GANModel`. -1. Specify your loss using a `GANLoss`. -1. Create your train ops using a `GANTrainOps`. -1. Run your train ops. - -At each stage, you can either use TF-GAN's convenience functions, or you can -perform the step manually for fine-grained control. We provide examples below. - -There are various types of GAN setups. 
For instance, you can train a generator -to sample unconditionally from a learned distribution, or you can condition on -extra information such as a class label. TF-GAN is compatible with many setups, -and we demonstrate a few below: - -### Examples - -#### Unconditional MNIST generation - -This example trains a generator to produce handwritten MNIST digits. The generator maps -random draws from a multivariate normal distribution to MNIST digit images. See -['Generative Adversarial Networks'](https://arxiv.org/abs/1406.2661) by -Goodfellow et al. - -```python -# Set up the input. -images = mnist_data_provider.provide_data(FLAGS.batch_size) -noise = tf.random_normal([FLAGS.batch_size, FLAGS.noise_dims]) - -# Build the generator and discriminator. -gan_model = tfgan.gan_model( - generator_fn=mnist.unconditional_generator, # you define - discriminator_fn=mnist.unconditional_discriminator, # you define - real_data=images, - generator_inputs=noise) - -# Build the GAN loss. -gan_loss = tfgan.gan_loss( - gan_model, - generator_loss_fn=tfgan.losses.wasserstein_generator_loss, - discriminator_loss_fn=tfgan.losses.wasserstein_discriminator_loss) - -# Create the train ops, which calculate gradients and apply updates to weights. -train_ops = tfgan.gan_train_ops( - gan_model, - gan_loss, - generator_optimizer=tf.train.AdamOptimizer(gen_lr, 0.5), - discriminator_optimizer=tf.train.AdamOptimizer(dis_lr, 0.5)) - -# Run the train ops in the alternating training scheme. -tfgan.gan_train( - train_ops, - hooks=[tf.train.StopAtStepHook(num_steps=FLAGS.max_number_of_steps)], - logdir=FLAGS.train_log_dir) -``` - -#### Conditional MNIST generation -This example trains a generator to generate MNIST images *of a given class*. -The generator maps random draws from a multivariate normal distribution and a -one-hot label of the desired digit class to an MNIST digit image. See -['Conditional Generative Adversarial Nets'](https://arxiv.org/abs/1411.1784) by -Mirza and Osindero. - -```python -# Set up the input. -images, one_hot_labels = mnist_data_provider.provide_data(FLAGS.batch_size) -noise = tf.random_normal([FLAGS.batch_size, FLAGS.noise_dims]) - -# Build the generator and discriminator. -gan_model = tfgan.gan_model( - generator_fn=mnist.conditional_generator, # you define - discriminator_fn=mnist.conditional_discriminator, # you define - real_data=images, - generator_inputs=(noise, one_hot_labels)) - -# The rest is the same as in the unconditional case. -... -``` -#### Adversarial loss -This example combines an L1 pixel loss and an adversarial loss to learn to -autoencode images. The bottleneck layer can be used to transmit compressed -representations of the image. Neutral networks with pixel-wise loss only tend to -produce blurry results, so the GAN can be used to make the reconstructions more -plausible. See ['Full Resolution Image Compression with Recurrent Neural Networks'](https://arxiv.org/abs/1608.05148) by Toderici et al -for an example of neural networks used for image compression, and ['Photo-Realistic Single Image Super-Resolution Using a Generative Adversarial Network'](https://arxiv.org/abs/1609.04802) by Ledig et al for a more detailed description of -how GANs can sharpen image output. - -```python -# Set up the input pipeline. -images = image_provider.provide_data(FLAGS.batch_size) - -# Build the generator and discriminator. 
-#### Adversarial loss
-This example combines an L1 pixel loss and an adversarial loss to learn to
-autoencode images. The bottleneck layer can be used to transmit compressed
-representations of the image. Neural networks trained with a pixel-wise loss alone tend to
-produce blurry results, so the GAN can be used to make the reconstructions more
-plausible. See ['Full Resolution Image Compression with Recurrent Neural Networks'](https://arxiv.org/abs/1608.05148) by Toderici et al.
-for an example of neural networks used for image compression, and ['Photo-Realistic Single Image Super-Resolution Using a Generative Adversarial Network'](https://arxiv.org/abs/1609.04802) by Ledig et al. for a more detailed description of
-how GANs can sharpen image output.
-
-```python
-# Set up the input pipeline.
-images = image_provider.provide_data(FLAGS.batch_size)
-
-# Build the generator and discriminator.
-gan_model = tfgan.gan_model(
- generator_fn=nets.autoencoder, # you define
- discriminator_fn=nets.discriminator, # you define
- real_data=images,
- generator_inputs=images)
-
-# Build the GAN loss and standard pixel loss.
-gan_loss = tfgan.gan_loss(
- gan_model,
- generator_loss_fn=tfgan.losses.wasserstein_generator_loss,
- discriminator_loss_fn=tfgan.losses.wasserstein_discriminator_loss,
- gradient_penalty=1.0)
-l1_pixel_loss = tf.norm(gan_model.real_data - gan_model.generated_data, ord=1)
-
-# Modify the loss tuple to include the pixel loss.
-gan_loss = tfgan.losses.combine_adversarial_loss(
- gan_loss, gan_model, l1_pixel_loss, weight_factor=FLAGS.weight_factor)
-
-# The rest is the same as in the unconditional case.
-...
-```
-
-#### Image-to-image translation
-This example maps images in one domain to images of the same size in a different
-domain. For example, it can map segmentation masks to street images, or
-grayscale images to color. See ['Image-to-Image Translation with Conditional Adversarial Networks'](https://arxiv.org/abs/1611.07004) by Isola et al. for more details.
-
-```python
-# Set up the input pipeline.
-input_image, target_image = data_provider.provide_data(FLAGS.batch_size)
-
-# Build the generator and discriminator.
-gan_model = tfgan.gan_model(
- generator_fn=nets.generator, # you define
- discriminator_fn=nets.discriminator, # you define
- real_data=target_image,
- generator_inputs=input_image)
-
-# Build the GAN loss and standard pixel loss.
-gan_loss = tfgan.gan_loss(
- gan_model,
- generator_loss_fn=tfgan.losses.least_squares_generator_loss,
- discriminator_loss_fn=tfgan.losses.least_squares_discriminator_loss)
-l1_pixel_loss = tf.norm(gan_model.real_data - gan_model.generated_data, ord=1)
-
-# Modify the loss tuple to include the pixel loss.
-gan_loss = tfgan.losses.combine_adversarial_loss(
- gan_loss, gan_model, l1_pixel_loss, weight_factor=FLAGS.weight_factor)
-
-# The rest is the same as in the unconditional case.
-...
-```
-
-#### InfoGAN
-Train a generator to generate specific MNIST digit images, and control for digit style *without using any labels*. See ['InfoGAN: Interpretable Representation Learning by Information Maximizing Generative Adversarial Nets'](https://arxiv.org/abs/1606.03657) for more details.
-
-```python
-# Set up the input pipeline.
-images = mnist_data_provider.provide_data(FLAGS.batch_size)
-
-# Build the generator and discriminator.
-gan_model = tfgan.infogan_model(
- generator_fn=mnist.infogan_generator, # you define
- discriminator_fn=mnist.infogan_discriminator, # you define
- real_data=images,
- unstructured_generator_inputs=unstructured_inputs, # you define
- structured_generator_inputs=structured_inputs) # you define
-
-# Build the GAN loss with mutual information penalty.
-gan_loss = tfgan.gan_loss(
- gan_model,
- generator_loss_fn=tfgan.losses.wasserstein_generator_loss,
- discriminator_loss_fn=tfgan.losses.wasserstein_discriminator_loss,
- gradient_penalty=1.0,
- mutual_information_penalty_weight=1.0)
-
-# The rest is the same as in the unconditional case.
-...
-```
-
-#### Custom model creation
-Train an unconditional GAN to generate MNIST digits, but manually construct
-the `GANModel` tuple for more fine-grained control.
-
-```python
-# Set up the input pipeline.
-images = mnist_data_provider.provide_data(FLAGS.batch_size)
-noise = tf.random_normal([FLAGS.batch_size, FLAGS.noise_dims])
-
-# Manually build the generator and discriminator.
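# The code below assumes `variable_scope` and `variables_lib` are the internal
# TensorFlow modules used throughout TF-GAN, i.e.
# `tensorflow.python.ops.variable_scope` and
# `tensorflow.contrib.framework.python.ops.variables`; the public equivalents
# are `tf.variable_scope` and `tf.contrib.framework.get_trainable_variables`.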
-with tf.variable_scope('Generator') as gen_scope: - generated_images = generator_fn(noise) -with tf.variable_scope('Discriminator') as dis_scope: - discriminator_gen_outputs = discriminator_fn(generated_images) -with variable_scope.variable_scope(dis_scope, reuse=True): - discriminator_real_outputs = discriminator_fn(images) -generator_variables = variables_lib.get_trainable_variables(gen_scope) -discriminator_variables = variables_lib.get_trainable_variables(dis_scope) -# Depending on what TF-GAN features you use, you don't always need to supply -# every `GANModel` field. At a minimum, you need to include the discriminator -# outputs and variables if you want to use TF-GAN to construct losses. -gan_model = tfgan.GANModel( - generator_inputs, - generated_data, - generator_variables, - gen_scope, - generator_fn, - real_data, - discriminator_real_outputs, - discriminator_gen_outputs, - discriminator_variables, - dis_scope, - discriminator_fn) - -# The rest is the same as the unconditional case. -... -``` - - -## Authors -Joel Shor (github: [joel-shor](https://github.com/joel-shor)) and Sergio Guadarrama (github: [sguada](https://github.com/sguada)) diff --git a/tensorflow/contrib/gan/__init__.py b/tensorflow/contrib/gan/__init__.py deleted file mode 100644 index 1e6000898f7..00000000000 --- a/tensorflow/contrib/gan/__init__.py +++ /dev/null @@ -1,50 +0,0 @@ -# Copyright 2017 Google Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""TF-GAN is a lightweight library for training and evaluating GANs. - -In addition to providing the infrastructure for easily training and evaluating -GANS, this library contains modules for a TFGAN-backed Estimator, -evaluation metrics, features (such as virtual batch normalization), and losses. -Please see README.md for details and usage. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -# Collapse TF-GAN into a tiered namespace. 
-from tensorflow.contrib.gan.python import estimator -from tensorflow.contrib.gan.python import eval # pylint:disable=redefined-builtin -from tensorflow.contrib.gan.python import features -from tensorflow.contrib.gan.python import losses -from tensorflow.contrib.gan.python import namedtuples -from tensorflow.contrib.gan.python import train - -# pylint: disable=unused-import,wildcard-import -from tensorflow.contrib.gan.python.namedtuples import * -from tensorflow.contrib.gan.python.train import * -# pylint: enable=unused-import,wildcard-import - -from tensorflow.python.util.all_util import remove_undocumented - -_allowed_symbols = [ - 'estimator', - 'eval', - 'features', - 'losses', -] -_allowed_symbols += train.__all__ -_allowed_symbols += namedtuples.__all__ -remove_undocumented(__name__, _allowed_symbols) diff --git a/tensorflow/contrib/gan/python/estimator/__init__.py b/tensorflow/contrib/gan/python/estimator/__init__.py deleted file mode 100644 index 430266555b7..00000000000 --- a/tensorflow/contrib/gan/python/estimator/__init__.py +++ /dev/null @@ -1,50 +0,0 @@ -# Copyright 2016 Google Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""TF-GAN estimator module. - -GANEstimator provides all the infrastructure support of a TensorFlow Estimator -with the feature support of TF-GAN. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -# Collapse `estimator` into a single namespace. 
-# pylint: disable=unused-import,wildcard-import -from tensorflow.contrib.gan.python.estimator.python import gan_estimator -from tensorflow.contrib.gan.python.estimator.python import head -from tensorflow.contrib.gan.python.estimator.python import latent_gan_estimator -from tensorflow.contrib.gan.python.estimator.python import stargan_estimator -from tensorflow.contrib.gan.python.estimator.python import tpu_gan_estimator - -from tensorflow.contrib.gan.python.estimator.python.gan_estimator import * -from tensorflow.contrib.gan.python.estimator.python.head import * -from tensorflow.contrib.gan.python.estimator.python.latent_gan_estimator import * -from tensorflow.contrib.gan.python.estimator.python.stargan_estimator import * -from tensorflow.contrib.gan.python.estimator.python.tpu_gan_estimator import * -# pylint: enable=unused-import,wildcard-import - -from tensorflow.python.util.all_util import remove_undocumented - -_allowed_symbols = ([ - 'gan_estimator', - 'stargan_estimator', - 'tpu_gan_estimator', - 'latent_gan_estimator', - 'head', -] + gan_estimator.__all__ + stargan_estimator.__all__ + head.__all__ + - tpu_gan_estimator.__all__ + latent_gan_estimator.__all__) -remove_undocumented(__name__, _allowed_symbols) diff --git a/tensorflow/contrib/gan/python/estimator/python/gan_estimator.py b/tensorflow/contrib/gan/python/estimator/python/gan_estimator.py deleted file mode 100644 index bc0e4854091..00000000000 --- a/tensorflow/contrib/gan/python/estimator/python/gan_estimator.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""`tf.Learn` components for `GANEstimator`.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from tensorflow.contrib.gan.python.estimator.python import gan_estimator_impl -# pylint: disable=wildcard-import -from tensorflow.contrib.gan.python.estimator.python.gan_estimator_impl import * -# pylint: enable=wildcard-import -from tensorflow.python.util.all_util import remove_undocumented - -__all__ = gan_estimator_impl.__all__ -remove_undocumented(__name__, __all__) diff --git a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py deleted file mode 100644 index d234558d4da..00000000000 --- a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py +++ /dev/null @@ -1,338 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""A TF-GAN-backed GAN Estimator.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import functools -import enum - -from tensorflow.contrib.framework.python.ops import variables as variable_lib -from tensorflow.contrib.gan.python import namedtuples as tfgan_tuples -from tensorflow.contrib.gan.python import train as tfgan_train -from tensorflow.contrib.gan.python.eval.python import summaries as tfgan_summaries -from tensorflow.python.estimator import estimator -from tensorflow.python.estimator import model_fn as model_fn_lib -from tensorflow.python.framework import ops -from tensorflow.python.ops import metrics as metrics_lib -from tensorflow.python.ops import variable_scope -from tensorflow.python.util import tf_inspect as inspect - - -__all__ = [ - 'GANEstimator', - 'SummaryType' -] - - -class SummaryType(enum.IntEnum): - NONE = 0 - VARIABLES = 1 - IMAGES = 2 - IMAGE_COMPARISON = 3 - - -_summary_type_map = { - SummaryType.VARIABLES: tfgan_summaries.add_gan_model_summaries, - SummaryType.IMAGES: tfgan_summaries.add_gan_model_image_summaries, - SummaryType.IMAGE_COMPARISON: tfgan_summaries.add_image_comparison_summaries, # pylint:disable=line-too-long -} - - -class GANEstimator(estimator.Estimator): - """An estimator for Generative Adversarial Networks (GANs). - - This Estimator is backed by TF-GAN. The network functions follow the TF-GAN - API except for one exception: if either `generator_fn` or `discriminator_fn` - have an argument called `mode`, then the tf.Estimator mode is passed in for - that argument. This helps with operations like batch normalization, which have - different train and evaluation behavior. - - Example: - - ```python - import tensorflow as tf - tfgan = tf.contrib.gan - - # See TF-GAN's `train.py` for a description of the generator and - # discriminator API. - def generator_fn(generator_inputs): - ... - return generated_data - - def discriminator_fn(data, conditioning): - ... - return logits - - # Create GAN estimator. - gan_estimator = tfgan.estimator.GANEstimator( - model_dir, - generator_fn=generator_fn, - discriminator_fn=discriminator_fn, - generator_loss_fn=tfgan.losses.wasserstein_generator_loss, - discriminator_loss_fn=tfgan.losses.wasserstein_discriminator_loss, - generator_optimizer=tf.compat.v1.train.AdamOptimizer(0.1, 0.5), - discriminator_optimizer=tf.compat.v1.train.AdamOptimizer(0.1, 0.5)) - - # Train estimator. - gan_estimator.train(train_input_fn, steps) - - # Evaluate resulting estimator. - gan_estimator.evaluate(eval_input_fn) - - # Generate samples from generator. 
- predictions = np.array([ - x for x in gan_estimator.predict(predict_input_fn)]) - ``` - """ - - def __init__(self, - model_dir=None, - generator_fn=None, - discriminator_fn=None, - generator_loss_fn=None, - discriminator_loss_fn=None, - generator_optimizer=None, - discriminator_optimizer=None, - get_hooks_fn=None, - get_eval_metric_ops_fn=None, - add_summaries=None, - use_loss_summaries=True, - config=None, - warm_start_from=None, - is_chief=True): - """Initializes a GANEstimator instance. - - Args: - model_dir: Directory to save model parameters, graph and etc. This can - also be used to load checkpoints from the directory into a estimator - to continue training a previously saved model. - generator_fn: A python function that takes a Tensor, Tensor list, or - Tensor dictionary as inputs and returns the outputs of the GAN - generator. See `TF-GAN` for more details and examples. Additionally, if - it has an argument called `mode`, the Estimator's `mode` will be passed - in (ex TRAIN, EVAL, PREDICT). This is useful for things like batch - normalization. - discriminator_fn: A python function that takes the output of - `generator_fn` or real data in the GAN setup, and `generator_inputs`. - Outputs a Tensor in the range [-inf, inf]. See `TF-GAN` for more details - and examples. - generator_loss_fn: The loss function on the generator. Takes a `GANModel` - tuple. - discriminator_loss_fn: The loss function on the discriminator. Takes a - `GANModel` tuple. - generator_optimizer: The optimizer for generator updates, or a function - that takes no arguments and returns an optimizer. This function will - be called when the default graph is the `GANEstimator`'s graph, so - utilities like `tf.contrib.framework.get_or_create_global_step` will - work. - discriminator_optimizer: Same as `generator_optimizer`, but for the - discriminator updates. - get_hooks_fn: A function that takes a `GANTrainOps` tuple and returns a - list of hooks. These hooks are run on the generator and discriminator - train ops, and can be used to implement the GAN training scheme. - Defaults to `train.get_sequential_train_hooks()`. - get_eval_metric_ops_fn: A function that takes a `GANModel`, and returns a - dict of metric results keyed by name. The output of this function is - passed into `tf.estimator.EstimatorSpec` during evaluation. - add_summaries: `None`, a single `SummaryType`, or a list of `SummaryType`. - use_loss_summaries: If `True`, add loss summaries. If `False`, does not. - If `None`, uses defaults. - config: `RunConfig` object to configure the runtime settings. - warm_start_from: A filepath to a checkpoint or saved model, or a - WarmStartSettings object to configure initialization. - is_chief: Whether or not this Estimator is running on a chief or worker. - Needs to be set appropriately if using SyncReplicasOptimizers. - - Raises: - ValueError: If loss functions aren't callable. - ValueError: If `use_loss_summaries` isn't boolean or `None`. - ValueError: If `get_hooks_fn` isn't callable or `None`. 
- """ - if not callable(generator_loss_fn): - raise ValueError('generator_loss_fn must be callable.') - if not callable(discriminator_loss_fn): - raise ValueError('discriminator_loss_fn must be callable.') - if use_loss_summaries not in [True, False, None]: - raise ValueError('use_loss_summaries must be True, False or None.') - if get_hooks_fn is not None and not callable(get_hooks_fn): - raise TypeError('get_hooks_fn must be callable.') - - def _model_fn(features, labels, mode): - """GANEstimator model function.""" - if mode not in [model_fn_lib.ModeKeys.TRAIN, model_fn_lib.ModeKeys.EVAL, - model_fn_lib.ModeKeys.PREDICT]: - raise ValueError('Mode not recognized: %s' % mode) - real_data = labels # rename inputs for clarity - generator_inputs = features # rename inputs for clarity - - # Make GANModel, which encapsulates the GAN model architectures. - gan_model = _get_gan_model( - mode, generator_fn, discriminator_fn, real_data, generator_inputs, - add_summaries) - - # Make the EstimatorSpec, which incorporates the GANModel, losses, eval - # metrics, and optimizers (if required). - return _get_estimator_spec( - mode, gan_model, generator_loss_fn, discriminator_loss_fn, - get_eval_metric_ops_fn, generator_optimizer, discriminator_optimizer, - get_hooks_fn, use_loss_summaries, is_chief) - - super(GANEstimator, self).__init__( - model_fn=_model_fn, model_dir=model_dir, config=config, - warm_start_from=warm_start_from) - - -def _get_gan_model( - mode, generator_fn, discriminator_fn, real_data, generator_inputs, - add_summaries, generator_scope='Generator'): - """Makes the GANModel tuple, which encapsulates the GAN model architecture.""" - if mode == model_fn_lib.ModeKeys.PREDICT: - if real_data is not None: - raise ValueError('`labels` must be `None` when mode is `predict`. 
' - 'Instead, found %s' % real_data) - gan_model = _make_prediction_gan_model( - generator_inputs, generator_fn, generator_scope) - else: # model_fn_lib.ModeKeys.TRAIN or model_fn_lib.ModeKeys.EVAL - gan_model = _make_gan_model( - generator_fn, discriminator_fn, real_data, generator_inputs, - generator_scope, add_summaries, mode) - - return gan_model - - -def _get_estimator_spec( - mode, gan_model, generator_loss_fn, discriminator_loss_fn, - get_eval_metric_ops_fn, generator_optimizer, discriminator_optimizer, - get_hooks_fn=None, use_loss_summaries=True, is_chief=True): - """Get the EstimatorSpec for the current mode.""" - if mode == model_fn_lib.ModeKeys.PREDICT: - estimator_spec = model_fn_lib.EstimatorSpec( - mode=mode, predictions=gan_model.generated_data) - else: - gan_loss = tfgan_tuples.GANLoss( - generator_loss=generator_loss_fn( - gan_model, add_summaries=use_loss_summaries), - discriminator_loss=discriminator_loss_fn( - gan_model, add_summaries=use_loss_summaries)) - if mode == model_fn_lib.ModeKeys.EVAL: - estimator_spec = _get_eval_estimator_spec( - gan_model, gan_loss, get_eval_metric_ops_fn) - else: # model_fn_lib.ModeKeys.TRAIN: - if callable(generator_optimizer): - generator_optimizer = generator_optimizer() - if callable(discriminator_optimizer): - discriminator_optimizer = discriminator_optimizer() - get_hooks_fn = get_hooks_fn or tfgan_train.get_sequential_train_hooks() - estimator_spec = _get_train_estimator_spec( - gan_model, gan_loss, generator_optimizer, discriminator_optimizer, - get_hooks_fn, is_chief=is_chief) - - return estimator_spec - - -def _make_gan_model(generator_fn, discriminator_fn, real_data, - generator_inputs, generator_scope, add_summaries, mode): - """Construct a `GANModel`, and optionally pass in `mode`.""" - # If network functions have an argument `mode`, pass mode to it. - if 'mode' in inspect.getargspec(generator_fn).args: - generator_fn = functools.partial(generator_fn, mode=mode) - if 'mode' in inspect.getargspec(discriminator_fn).args: - discriminator_fn = functools.partial(discriminator_fn, mode=mode) - gan_model = tfgan_train.gan_model( - generator_fn, - discriminator_fn, - real_data, - generator_inputs, - generator_scope=generator_scope, - check_shapes=False) - if add_summaries: - if not isinstance(add_summaries, (tuple, list)): - add_summaries = [add_summaries] - with ops.name_scope(None): - for summary_type in add_summaries: - _summary_type_map[summary_type](gan_model) - - return gan_model - - -def _make_prediction_gan_model(generator_inputs, generator_fn, generator_scope): - """Make a `GANModel` from just the generator.""" - # If `generator_fn` has an argument `mode`, pass mode to it. 
- if 'mode' in inspect.getargspec(generator_fn).args: - generator_fn = functools.partial(generator_fn, - mode=model_fn_lib.ModeKeys.PREDICT) - with variable_scope.variable_scope(generator_scope) as gen_scope: - generator_inputs = tfgan_train._convert_tensor_or_l_or_d(generator_inputs) # pylint:disable=protected-access - generated_data = generator_fn(generator_inputs) - generator_variables = variable_lib.get_trainable_variables(gen_scope) - - return tfgan_tuples.GANModel( - generator_inputs, - generated_data, - generator_variables, - gen_scope, - generator_fn, - real_data=None, - discriminator_real_outputs=None, - discriminator_gen_outputs=None, - discriminator_variables=None, - discriminator_scope=None, - discriminator_fn=None) - - -def _get_eval_estimator_spec(gan_model, gan_loss, get_eval_metric_ops_fn=None, - name=None): - """Return an EstimatorSpec for the eval case.""" - scalar_loss = gan_loss.generator_loss + gan_loss.discriminator_loss - with ops.name_scope(None, 'metrics', - [gan_loss.generator_loss, - gan_loss.discriminator_loss]): - def _summary_key(head_name, val): - return '%s/%s' % (val, head_name) if head_name else val - eval_metric_ops = { - _summary_key(name, 'generator_loss'): - metrics_lib.mean(gan_loss.generator_loss), - _summary_key(name, 'discriminator_loss'): - metrics_lib.mean(gan_loss.discriminator_loss) - } - if get_eval_metric_ops_fn is not None: - custom_eval_metric_ops = get_eval_metric_ops_fn(gan_model) - if not isinstance(custom_eval_metric_ops, dict): - raise TypeError('get_eval_metric_ops_fn must return a dict, ' - 'received: {}'.format(custom_eval_metric_ops)) - eval_metric_ops.update(custom_eval_metric_ops) - return model_fn_lib.EstimatorSpec( - mode=model_fn_lib.ModeKeys.EVAL, - predictions=gan_model.generated_data, - loss=scalar_loss, - eval_metric_ops=eval_metric_ops) - - -def _get_train_estimator_spec( - gan_model, gan_loss, generator_optimizer, discriminator_optimizer, - get_hooks_fn, train_op_fn=tfgan_train.gan_train_ops, is_chief=True): - """Return an EstimatorSpec for the train case.""" - scalar_loss = gan_loss.generator_loss + gan_loss.discriminator_loss - train_ops = train_op_fn(gan_model, gan_loss, generator_optimizer, - discriminator_optimizer, is_chief=is_chief) - training_hooks = get_hooks_fn(train_ops) - return model_fn_lib.EstimatorSpec( - loss=scalar_loss, - mode=model_fn_lib.ModeKeys.TRAIN, - train_op=train_ops.global_step_inc_op, - training_hooks=training_hooks) diff --git a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py deleted file mode 100644 index 66af79d1e81..00000000000 --- a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py +++ /dev/null @@ -1,421 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""Tests for TF-GAN's estimator.py.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import shutil -import tempfile - -from absl.testing import parameterized -import numpy as np - -from tensorflow.contrib import layers -from tensorflow.contrib.gan.python import namedtuples as tfgan_tuples -from tensorflow.contrib.gan.python.estimator.python import gan_estimator_impl as estimator -from tensorflow.contrib.gan.python.losses.python import tuple_losses as losses -from tensorflow.contrib.learn.python.learn.learn_io import graph_io -from tensorflow.core.example import example_pb2 -from tensorflow.core.example import feature_pb2 -from tensorflow.python.estimator import model_fn as model_fn_lib -from tensorflow.python.estimator.estimator import WarmStartSettings -from tensorflow.python.estimator.inputs import numpy_io -from tensorflow.python.framework import dtypes -from tensorflow.python.framework import ops -from tensorflow.python.framework import tensor_shape -from tensorflow.python.framework.errors_impl import NotFoundError -from tensorflow.python.ops import array_ops -from tensorflow.python.ops import math_ops -from tensorflow.python.ops import metrics as metrics_lib -from tensorflow.python.ops import parsing_ops -from tensorflow.python.ops import variable_scope -from tensorflow.python.platform import test -from tensorflow.python.summary.writer import writer_cache -from tensorflow.python.training import input as input_lib -from tensorflow.python.training import learning_rate_decay -from tensorflow.python.training import sync_replicas_optimizer -from tensorflow.python.training import training -from tensorflow.python.training import training_util - - -def generator_fn(noise_dict, mode): - del mode - noise = noise_dict['x'] - return layers.fully_connected(noise, tensor_shape.dimension_value( - noise.shape[1])) - - -def discriminator_fn(data, unused_conditioning, mode): - del unused_conditioning, mode - return layers.fully_connected(data, 1) - - -class GetGANModelTest(test.TestCase, parameterized.TestCase): - """Tests that `GetGANModel` produces the correct model.""" - - @parameterized.named_parameters( - ('train', model_fn_lib.ModeKeys.TRAIN), - ('eval', model_fn_lib.ModeKeys.EVAL), - ('predict', model_fn_lib.ModeKeys.PREDICT)) - def test_get_gan_model(self, mode): - with ops.Graph().as_default(): - generator_inputs = {'x': array_ops.ones([3, 4])} - is_predict = mode == model_fn_lib.ModeKeys.PREDICT - real_data = array_ops.zeros([3, 4]) if not is_predict else None - gan_model = estimator._get_gan_model( - mode, generator_fn, discriminator_fn, real_data, generator_inputs, - add_summaries=False) - - self.assertEqual(generator_inputs, gan_model.generator_inputs) - self.assertIsNotNone(gan_model.generated_data) - self.assertLen(gan_model.generator_variables, 2) # 1 FC layer - self.assertIsNotNone(gan_model.generator_fn) - if mode == model_fn_lib.ModeKeys.PREDICT: - self.assertIsNone(gan_model.real_data) - self.assertIsNone(gan_model.discriminator_real_outputs) - self.assertIsNone(gan_model.discriminator_gen_outputs) - self.assertIsNone(gan_model.discriminator_variables) - self.assertIsNone(gan_model.discriminator_scope) - self.assertIsNone(gan_model.discriminator_fn) - else: - self.assertIsNotNone(gan_model.real_data) - self.assertIsNotNone(gan_model.discriminator_real_outputs) - self.assertIsNotNone(gan_model.discriminator_gen_outputs) - 
self.assertLen(gan_model.discriminator_variables, 2) # 1 FC layer - self.assertIsNotNone(gan_model.discriminator_scope) - self.assertIsNotNone(gan_model.discriminator_fn) - - -def get_dummy_gan_model(): - # TODO(joelshor): Find a better way of creating a variable scope. - with variable_scope.variable_scope('generator') as gen_scope: - gen_var = variable_scope.get_variable('dummy_var', initializer=0.0) - with variable_scope.variable_scope('discriminator') as dis_scope: - dis_var = variable_scope.get_variable('dummy_var', initializer=0.0) - return tfgan_tuples.GANModel( - generator_inputs=None, - generated_data=array_ops.ones([3, 4]), - generator_variables=[gen_var], - generator_scope=gen_scope, - generator_fn=None, - real_data=array_ops.zeros([3, 4]), - discriminator_real_outputs=array_ops.ones([1, 2, 3]) * dis_var, - discriminator_gen_outputs=array_ops.ones([1, 2, 3]) * gen_var * dis_var, - discriminator_variables=[dis_var], - discriminator_scope=dis_scope, - discriminator_fn=None) - - -def dummy_loss_fn(gan_model, add_summaries=True): - del add_summaries - return math_ops.reduce_sum(gan_model.discriminator_real_outputs - - gan_model.discriminator_gen_outputs) - - -def get_metrics(gan_model): - return { - 'mse_custom_metric': metrics_lib.mean_squared_error( - gan_model.real_data, gan_model.generated_data) - } - - -class GetEstimatorSpecTest(test.TestCase, parameterized.TestCase): - """Tests that the EstimatorSpec is constructed appropriately.""" - - @classmethod - def setUpClass(cls): - super(GetEstimatorSpecTest, cls).setUpClass() - cls._generator_optimizer = training.GradientDescentOptimizer(1.0) - cls._discriminator_optimizer = training.GradientDescentOptimizer(1.0) - - @parameterized.named_parameters( - ('train', model_fn_lib.ModeKeys.TRAIN), - ('eval', model_fn_lib.ModeKeys.EVAL), - ('predict', model_fn_lib.ModeKeys.PREDICT)) - def test_get_estimator_spec(self, mode): - with ops.Graph().as_default(): - self._gan_model = get_dummy_gan_model() - spec = estimator._get_estimator_spec( - mode, - self._gan_model, - generator_loss_fn=dummy_loss_fn, - discriminator_loss_fn=dummy_loss_fn, - get_eval_metric_ops_fn=get_metrics, - generator_optimizer=self._generator_optimizer, - discriminator_optimizer=self._discriminator_optimizer) - - self.assertEqual(mode, spec.mode) - if mode == model_fn_lib.ModeKeys.PREDICT: - self.assertEqual(self._gan_model.generated_data, spec.predictions) - elif mode == model_fn_lib.ModeKeys.TRAIN: - self.assertShapeEqual(np.array(0), spec.loss) # must be a scalar - self.assertIsNotNone(spec.train_op) - self.assertIsNotNone(spec.training_hooks) - elif mode == model_fn_lib.ModeKeys.EVAL: - self.assertEqual(self._gan_model.generated_data, spec.predictions) - self.assertShapeEqual(np.array(0), spec.loss) # must be a scalar - self.assertIsNotNone(spec.eval_metric_ops) - - def test_get_sync_estimator_spec(self): - """Make sure spec is loaded with sync hooks for sync opts.""" - - def get_sync_optimizer(): - return sync_replicas_optimizer.SyncReplicasOptimizer( - training.GradientDescentOptimizer(learning_rate=1.0), - replicas_to_aggregate=1) - - with ops.Graph().as_default(): - self._gan_model = get_dummy_gan_model() - g_opt = get_sync_optimizer() - d_opt = get_sync_optimizer() - - spec = estimator._get_estimator_spec( - model_fn_lib.ModeKeys.TRAIN, - self._gan_model, - generator_loss_fn=dummy_loss_fn, - discriminator_loss_fn=dummy_loss_fn, - get_eval_metric_ops_fn=get_metrics, - generator_optimizer=g_opt, - discriminator_optimizer=d_opt) - - 
self.assertLen(spec.training_hooks, 4) - sync_opts = [ - hook._sync_optimizer for hook in spec.training_hooks if - isinstance(hook, sync_replicas_optimizer._SyncReplicasOptimizerHook)] - self.assertLen(sync_opts, 2) - self.assertSetEqual(frozenset(sync_opts), frozenset((g_opt, d_opt))) - - -class GANEstimatorIntegrationTest(test.TestCase): - - def setUp(self): - self._model_dir = tempfile.mkdtemp() - - def tearDown(self): - if self._model_dir: - writer_cache.FileWriterCache.clear() - shutil.rmtree(self._model_dir) - - def _test_complete_flow( - self, train_input_fn, eval_input_fn, predict_input_fn, prediction_size, - lr_decay=False): - def make_opt(): - gstep = training_util.get_or_create_global_step() - lr = learning_rate_decay.exponential_decay(1.0, gstep, 10, 0.9) - return training.GradientDescentOptimizer(lr) - - gopt = make_opt if lr_decay else training.GradientDescentOptimizer(1.0) - dopt = make_opt if lr_decay else training.GradientDescentOptimizer(1.0) - est = estimator.GANEstimator( - generator_fn=generator_fn, - discriminator_fn=discriminator_fn, - generator_loss_fn=losses.wasserstein_generator_loss, - discriminator_loss_fn=losses.wasserstein_discriminator_loss, - generator_optimizer=gopt, - discriminator_optimizer=dopt, - get_eval_metric_ops_fn=get_metrics, - model_dir=self._model_dir) - - # Train. - num_steps = 10 - est.train(train_input_fn, steps=num_steps) - - # Evaluate. - scores = est.evaluate(eval_input_fn) - self.assertEqual(num_steps, scores[ops.GraphKeys.GLOBAL_STEP]) - self.assertIn('loss', scores) - self.assertEqual(scores['discriminator_loss'] + scores['generator_loss'], - scores['loss']) - self.assertIn('mse_custom_metric', scores) - - # Predict. - predictions = np.array([x for x in est.predict(predict_input_fn)]) - - self.assertAllEqual(prediction_size, predictions.shape) - - def test_numpy_input_fn(self): - """Tests complete flow with numpy_input_fn.""" - input_dim = 4 - batch_size = 5 - data = np.zeros([batch_size, input_dim]) - train_input_fn = numpy_io.numpy_input_fn( - x={'x': data}, - y=data, - batch_size=batch_size, - num_epochs=None, - shuffle=True) - eval_input_fn = numpy_io.numpy_input_fn( - x={'x': data}, - y=data, - batch_size=batch_size, - shuffle=False) - predict_input_fn = numpy_io.numpy_input_fn( - x={'x': data}, - batch_size=batch_size, - shuffle=False) - - self._test_complete_flow( - train_input_fn=train_input_fn, - eval_input_fn=eval_input_fn, - predict_input_fn=predict_input_fn, - prediction_size=[batch_size, input_dim]) - - def test_numpy_input_fn_lrdecay(self): - """Tests complete flow with numpy_input_fn.""" - input_dim = 4 - batch_size = 5 - data = np.zeros([batch_size, input_dim]) - train_input_fn = numpy_io.numpy_input_fn( - x={'x': data}, - y=data, - batch_size=batch_size, - num_epochs=None, - shuffle=True) - eval_input_fn = numpy_io.numpy_input_fn( - x={'x': data}, - y=data, - batch_size=batch_size, - shuffle=False) - predict_input_fn = numpy_io.numpy_input_fn( - x={'x': data}, - batch_size=batch_size, - shuffle=False) - - self._test_complete_flow( - train_input_fn=train_input_fn, - eval_input_fn=eval_input_fn, - predict_input_fn=predict_input_fn, - prediction_size=[batch_size, input_dim], - lr_decay=True) - - def test_input_fn_from_parse_example(self): - """Tests complete flow with input_fn constructed from parse_example.""" - input_dim = 4 - batch_size = 6 - data = np.zeros([batch_size, input_dim]) - - serialized_examples = [] - for datum in data: - example = example_pb2.Example(features=feature_pb2.Features( - feature={ - 'x': 
feature_pb2.Feature( - float_list=feature_pb2.FloatList(value=datum)), - 'y': feature_pb2.Feature( - float_list=feature_pb2.FloatList(value=datum)), - })) - serialized_examples.append(example.SerializeToString()) - - feature_spec = { - 'x': parsing_ops.FixedLenFeature([input_dim], dtypes.float32), - 'y': parsing_ops.FixedLenFeature([input_dim], dtypes.float32), - } - def _train_input_fn(): - feature_map = parsing_ops.parse_example( - serialized_examples, feature_spec) - _, features = graph_io.queue_parsed_features(feature_map) - labels = features.pop('y') - return features, labels - def _eval_input_fn(): - feature_map = parsing_ops.parse_example( - input_lib.limit_epochs(serialized_examples, num_epochs=1), - feature_spec) - _, features = graph_io.queue_parsed_features(feature_map) - labels = features.pop('y') - return features, labels - def _predict_input_fn(): - feature_map = parsing_ops.parse_example( - input_lib.limit_epochs(serialized_examples, num_epochs=1), - feature_spec) - _, features = graph_io.queue_parsed_features(feature_map) - features.pop('y') - return features, None - - self._test_complete_flow( - train_input_fn=_train_input_fn, - eval_input_fn=_eval_input_fn, - predict_input_fn=_predict_input_fn, - prediction_size=[batch_size, input_dim]) - - -class GANEstimatorWarmStartTest(test.TestCase): - - def setUp(self): - self._model_dir = self.get_temp_dir() - self.new_variable_name = 'new_var' - self.new_variable_value = [1, 2, 3] - - def tearDown(self): - writer_cache.FileWriterCache.clear() - - def _test_warm_start(self, warm_start_from=None): - """Tests whether WarmStartSettings work as intended.""" - def generator_with_new_variable(noise_dict, mode): - variable_scope.get_variable(name=self.new_variable_name, - initializer=self.new_variable_value, - trainable=True) - return generator_fn(noise_dict, mode) - - def train_input_fn(): - data = np.zeros([3, 4]) - return {'x': data}, data - - est = estimator.GANEstimator( - generator_fn=generator_fn, - discriminator_fn=discriminator_fn, - generator_loss_fn=losses.wasserstein_generator_loss, - discriminator_loss_fn=losses.wasserstein_discriminator_loss, - generator_optimizer=training.GradientDescentOptimizer(1.0), - discriminator_optimizer=training.GradientDescentOptimizer(1.0), - model_dir=self._model_dir) - - est.train(train_input_fn, steps=1) - - est_warm = estimator.GANEstimator( - generator_fn=generator_with_new_variable, - discriminator_fn=discriminator_fn, - generator_loss_fn=losses.wasserstein_generator_loss, - discriminator_loss_fn=losses.wasserstein_discriminator_loss, - generator_optimizer=training.GradientDescentOptimizer(1.0), - discriminator_optimizer=training.GradientDescentOptimizer(1.0), - model_dir=None if warm_start_from else self._model_dir, - warm_start_from=warm_start_from) - - est_warm.train(train_input_fn, steps=1) - - return est_warm - - def test_warm_start_error(self): - """Test if exception when reloading different estimators.""" - with self.assertRaises(NotFoundError): - self._test_warm_start() - - def test_warm_start_success(self): - """Test if GANEstimator allows explicit warm start variable assignment.""" - # Regex matches all variable names in ckpt except for new_var. 
- var_regex = '^(?!.*%s.*)' % self.new_variable_name - warmstart = WarmStartSettings(ckpt_to_initialize_from=self._model_dir, - vars_to_warm_start=var_regex) - est_warm = self._test_warm_start(warm_start_from=warmstart) - full_variable_name = 'Generator/%s' % self.new_variable_name - self.assertIn(full_variable_name, est_warm.get_variable_names()) - equal_vals = np.array_equal(est_warm.get_variable_value(full_variable_name), - self.new_variable_value) - self.assertTrue(equal_vals) - - -if __name__ == '__main__': - test.main() diff --git a/tensorflow/contrib/gan/python/estimator/python/head.py b/tensorflow/contrib/gan/python/estimator/python/head.py deleted file mode 100644 index 3225d6f41a1..00000000000 --- a/tensorflow/contrib/gan/python/estimator/python/head.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""`tf.Learn` components for `GANEstimator`'s loss.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from tensorflow.contrib.gan.python.estimator.python import head_impl -# pylint: disable=wildcard-import -from tensorflow.contrib.gan.python.estimator.python.head_impl import * -# pylint: enable=wildcard-import -from tensorflow.python.util.all_util import remove_undocumented - -__all__ = head_impl.__all__ -remove_undocumented(__name__, __all__) diff --git a/tensorflow/contrib/gan/python/estimator/python/head_impl.py b/tensorflow/contrib/gan/python/estimator/python/head_impl.py deleted file mode 100644 index cbe990b476c..00000000000 --- a/tensorflow/contrib/gan/python/estimator/python/head_impl.py +++ /dev/null @@ -1,258 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""A TF-GAN-backed GAN Estimator.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import functools - -from tensorflow.contrib.gan.python import namedtuples as tfgan_tuples -from tensorflow.contrib.gan.python import train as tfgan_train -from tensorflow.python.estimator import model_fn as model_fn_lib -from tensorflow.python.estimator.canned import head -from tensorflow.python.estimator.export import export_output -from tensorflow.python.framework import ops -from tensorflow.python.ops import metrics as metrics_lib -from tensorflow.python.util import deprecation - -__all__ = [ - 'GANHead', - 'gan_head', -] - - -def _summary_key(head_name, val): - return '%s/%s' % (val, head_name) if head_name else val - - -@deprecation.deprecated( - None, 'Please use tf.contrib.gan.GANEstimator without explicitly making a ' - 'GANHead.') -def gan_head(generator_loss_fn, discriminator_loss_fn, generator_optimizer, - discriminator_optimizer, use_loss_summaries=True, - get_hooks_fn=tfgan_train.get_sequential_train_hooks(), - get_eval_metric_ops_fn=None, name=None): - """Creates a `GANHead`. - - Args: - generator_loss_fn: A TFGAN loss function for the generator. Takes a - `GANModel` and returns a scalar. - discriminator_loss_fn: Same as `generator_loss_fn`, but for the - discriminator. - generator_optimizer: The optimizer for generator updates. - discriminator_optimizer: Same as `generator_optimizer`, but for the - discriminator updates. - use_loss_summaries: If `True`, add loss summaries. If `False`, does not. - If `None`, uses defaults. - get_hooks_fn: A function that takes a `GANTrainOps` tuple and returns a - list of hooks. - get_eval_metric_ops_fn: A function that takes a `GANModel`, and returns a - dict of metric results keyed by name. The output of this function is - passed into `tf.estimator.EstimatorSpec` during evaluation. - name: name of the head. If provided, summary and metrics keys will be - suffixed by `"/" + name`. - - Returns: - An instance of `GANHead`. - """ - return GANHead(generator_loss_fn=generator_loss_fn, - discriminator_loss_fn=discriminator_loss_fn, - generator_optimizer=generator_optimizer, - discriminator_optimizer=discriminator_optimizer, - use_loss_summaries=use_loss_summaries, - get_hooks_fn=get_hooks_fn, - get_eval_metric_ops_fn=get_eval_metric_ops_fn, - name=name) - - -class GANHead(head._Head): # pylint: disable=protected-access - """`Head` for a GAN.""" - - @deprecation.deprecated( - None, 'Please use tf.contrib.gan.GANEstimator without explicitly making ' - 'a GANHead.') - def __init__(self, generator_loss_fn, discriminator_loss_fn, - generator_optimizer, discriminator_optimizer, - use_loss_summaries=True, - get_hooks_fn=None, - get_eval_metric_ops_fn=None, - name=None): - """`Head` for GAN training. - - Args: - generator_loss_fn: A TFGAN loss function for the generator. Takes a - `GANModel` and returns a scalar. - discriminator_loss_fn: Same as `generator_loss_fn`, but for the - discriminator. - generator_optimizer: The optimizer for generator updates. - discriminator_optimizer: Same as `generator_optimizer`, but for the - discriminator updates. - use_loss_summaries: If `True`, add loss summaries. If `False`, does not. - If `None`, uses defaults. - get_hooks_fn: A function that takes a `GANTrainOps` tuple and returns a - list of hooks. 
Defaults to `train.get_sequential_train_hooks()` - get_eval_metric_ops_fn: A function that takes a `GANModel`, and returns a - dict of metric results keyed by name. The output of this function is - passed into `tf.estimator.EstimatorSpec` during evaluation. - name: name of the head. If provided, summary and metrics keys will be - suffixed by `"/" + name`. - """ - - if not callable(generator_loss_fn): - raise TypeError('generator_loss_fn must be callable.') - if not callable(discriminator_loss_fn): - raise TypeError('discriminator_loss_fn must be callable.') - if use_loss_summaries not in [True, False, None]: - raise ValueError('use_loss_summaries must be True, False or None.') - if get_hooks_fn is not None and not callable(get_hooks_fn): - raise TypeError('get_hooks_fn must be callable.') - if name is not None and not isinstance(name, str): - raise TypeError('name must be string.') - - if get_hooks_fn is None: - get_hooks_fn = tfgan_train.get_sequential_train_hooks() - - if use_loss_summaries in [True, False]: - generator_loss_fn = functools.partial( - generator_loss_fn, add_summaries=use_loss_summaries) - discriminator_loss_fn = functools.partial( - discriminator_loss_fn, add_summaries=use_loss_summaries) - self._generator_loss_fn = generator_loss_fn - self._discriminator_loss_fn = discriminator_loss_fn - self._generator_optimizer = generator_optimizer - self._discriminator_optimizer = discriminator_optimizer - self._get_hooks_fn = get_hooks_fn - self._get_eval_metric_ops_fn = get_eval_metric_ops_fn - self._name = name - - @property - def name(self): - return self._name - - @property - def logits_dimension(self): - return None - - def create_loss(self, features, mode, logits, labels): - """Returns a GANLoss tuple from the provided GANModel. - - See `Head` for more details. - - Args: - features: Input `dict` of `Tensor` objects. Unused. - mode: Estimator's `ModeKeys`. - logits: A GANModel tuple. - labels: Must be `None`. - - Returns: - A GANLoss tuple. - - """ - _validate_logits_and_labels(logits, labels) - del mode, labels, features # unused for this head. - gan_model = logits # rename variable for clarity - return tfgan_tuples.GANLoss( - generator_loss=self._generator_loss_fn(gan_model), - discriminator_loss=self._discriminator_loss_fn(gan_model)) - - def create_estimator_spec( - self, features, mode, logits, labels=None, - train_op_fn=tfgan_train.gan_train_ops): - """Returns `EstimatorSpec` that a model_fn can return. - - See `Head` for more details. - - Args: - features: Must be `None`. - mode: Estimator's `ModeKeys`. - logits: A GANModel tuple. - labels: Must be `None`. - train_op_fn: Function that takes a GANModel, GANLoss, generator optimizer, - and discriminator optimizer, and returns a `GANTrainOps` tuple. For - example, this function can come from TFGAN's `train.py` library, or can - be custom. - - Returns: - `EstimatorSpec`. - - Raises: - ValueError: If `features` isn't `None`. - ValueError: If `train_op_fn` isn't provided in train mode. - """ - _validate_logits_and_labels(logits, labels) - if features is not None: - raise ValueError('`features` should be `None`. 
Instead, found: %s' % - features) - gan_model = logits # rename variable for clarity - with ops.name_scope('GANHead'): - if mode == model_fn_lib.ModeKeys.PREDICT: - return model_fn_lib.EstimatorSpec( - mode=model_fn_lib.ModeKeys.PREDICT, - predictions=gan_model.generated_data, - export_outputs={ - 'predict': export_output.PredictOutput(gan_model.generated_data) - }) - elif mode == model_fn_lib.ModeKeys.EVAL: - gan_loss = self.create_loss( - features=None, mode=mode, logits=gan_model, labels=None) - scalar_loss = gan_loss.generator_loss + gan_loss.discriminator_loss - with ops.name_scope(None, 'metrics', - [gan_loss.generator_loss, - gan_loss.discriminator_loss]): - eval_metric_ops = { - _summary_key(self._name, 'generator_loss'): - metrics_lib.mean(gan_loss.generator_loss), - _summary_key(self._name, 'discriminator_loss'): - metrics_lib.mean(gan_loss.discriminator_loss) - } - if self._get_eval_metric_ops_fn is not None: - custom_eval_metric_ops = self._get_eval_metric_ops_fn(gan_model) - if not isinstance(custom_eval_metric_ops, dict): - raise TypeError('get_eval_metric_ops_fn must return a dict, ' - 'received: {}'.format(custom_eval_metric_ops)) - eval_metric_ops.update(custom_eval_metric_ops) - return model_fn_lib.EstimatorSpec( - mode=model_fn_lib.ModeKeys.EVAL, - predictions=gan_model.generated_data, - loss=scalar_loss, - eval_metric_ops=eval_metric_ops) - elif mode == model_fn_lib.ModeKeys.TRAIN: - if train_op_fn is None: - raise ValueError('train_op_fn can not be None.') - gan_loss = self.create_loss(None, mode, gan_model, None) - scalar_loss = gan_loss.generator_loss + gan_loss.discriminator_loss - train_ops = train_op_fn(gan_model, gan_loss, self._generator_optimizer, - self._discriminator_optimizer) - training_hooks = self._get_hooks_fn(train_ops) - return model_fn_lib.EstimatorSpec( - loss=scalar_loss, - mode=model_fn_lib.ModeKeys.TRAIN, - train_op=train_ops.global_step_inc_op, - training_hooks=training_hooks) - else: - raise ValueError('Mode not recognized: %s' % mode) - - -def _validate_logits_and_labels(logits, labels): - if labels is not None: - raise ValueError('`GANHead`\'s `create_estimator_spec` input `labels` must ' - 'be `None`. Instead, found: %s' % labels) - - if not isinstance(logits, tfgan_tuples.GANModel): - raise ValueError('`GANHead`\'s `create_estimator_spec` input `logits` must ' - 'be an instnace of a `GANModel`. Instead, found: %s' % - logits) diff --git a/tensorflow/contrib/gan/python/estimator/python/head_test.py b/tensorflow/contrib/gan/python/estimator/python/head_test.py deleted file mode 100644 index 5b50234a0e3..00000000000 --- a/tensorflow/contrib/gan/python/estimator/python/head_test.py +++ /dev/null @@ -1,95 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""Tests for TF-GAN's head.py.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from tensorflow.contrib.gan.python import namedtuples as tfgan_tuples -from tensorflow.contrib.gan.python.estimator.python import head - -from tensorflow.python.estimator import model_fn as model_fn_lib -from tensorflow.python.ops import array_ops -from tensorflow.python.ops import math_ops -from tensorflow.python.ops import variable_scope -from tensorflow.python.platform import test -from tensorflow.python.saved_model import signature_constants -from tensorflow.python.training import training - -_DEFAULT_SERVING_KEY = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY - - -def dummy_loss(gan_model, add_summaries=True): # pylint:disable=unused-argument - return math_ops.reduce_sum(gan_model.discriminator_real_outputs - - gan_model.discriminator_gen_outputs) - - -def get_gan_model(): - # TODO(joelshor): Find a better way of creating a variable scope. - with variable_scope.variable_scope('generator') as gen_scope: - gen_var = variable_scope.get_variable('dummy_var', initializer=0.0) - with variable_scope.variable_scope('discriminator') as dis_scope: - dis_var = variable_scope.get_variable('dummy_var', initializer=0.0) - return tfgan_tuples.GANModel( - generator_inputs=None, - generated_data=array_ops.ones([3, 4]), - generator_variables=[gen_var], - generator_scope=gen_scope, - generator_fn=None, - real_data=None, - discriminator_real_outputs=array_ops.ones([1, 2, 3]) * dis_var, - discriminator_gen_outputs=array_ops.ones([1, 2, 3]) * gen_var * dis_var, - discriminator_variables=[dis_var], - discriminator_scope=dis_scope, - discriminator_fn=None) - - -class GANHeadTest(test.TestCase): - - def setUp(self): - super(GANHeadTest, self).setUp() - self.gan_head = head.gan_head( - generator_loss_fn=dummy_loss, - discriminator_loss_fn=dummy_loss, - generator_optimizer=training.GradientDescentOptimizer(1.0), - discriminator_optimizer=training.GradientDescentOptimizer(1.0), - get_eval_metric_ops_fn=self.get_metrics) - self.assertIsInstance(self.gan_head, head.GANHead) - - def get_metrics(self, gan_model): - self.assertTrue(isinstance(gan_model, tfgan_tuples.GANModel)) - return {} - - def _test_modes_helper(self, mode): - return self.gan_head.create_estimator_spec( - features=None, - mode=mode, - logits=get_gan_model()) - - def test_modes_predict(self): - spec = self._test_modes_helper(model_fn_lib.ModeKeys.PREDICT) - self.assertItemsEqual((_DEFAULT_SERVING_KEY, 'predict'), - spec.export_outputs.keys()) - - def test_modes_eval(self): - self._test_modes_helper(model_fn_lib.ModeKeys.EVAL) - - def test_modes_train(self): - self._test_modes_helper(model_fn_lib.ModeKeys.TRAIN) - - -if __name__ == '__main__': - test.main() diff --git a/tensorflow/contrib/gan/python/estimator/python/latent_gan_estimator.py b/tensorflow/contrib/gan/python/estimator/python/latent_gan_estimator.py deleted file mode 100644 index 4e164e24168..00000000000 --- a/tensorflow/contrib/gan/python/estimator/python/latent_gan_estimator.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""`tf.Learn` components for `Train Input Estimator`.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from tensorflow.contrib.gan.python.estimator.python import latent_gan_estimator_impl -# pylint: disable=wildcard-import -from tensorflow.contrib.gan.python.estimator.python.latent_gan_estimator_impl import * -# pylint: enable=wildcard-import -from tensorflow.python.util.all_util import remove_undocumented - -__all__ = latent_gan_estimator_impl.__all__ -remove_undocumented(__name__, __all__) diff --git a/tensorflow/contrib/gan/python/estimator/python/latent_gan_estimator_impl.py b/tensorflow/contrib/gan/python/estimator/python/latent_gan_estimator_impl.py deleted file mode 100644 index f5afc773193..00000000000 --- a/tensorflow/contrib/gan/python/estimator/python/latent_gan_estimator_impl.py +++ /dev/null @@ -1,205 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Implements an estimator wrapper that allows training the input latent space. - -This file implements a latent gan estimator that wraps around a previously -trained GAN. The latent gan estimator trains a single variable z, representing -the hidden latent distribution that is the 'noise' input to the GAN. By training -z, the inpainting estimator can move around the latent z space towards -minimizing a specific loss function. - -The latent gan estimator has a few key differences from a normal estimator. - -First: the variables in the estimator should not be saved, as we are not -updating the original GAN and are only adding a new z variable that is meant -to be different for each run. In order to do distributed training using -train_and_evaluate, the Tensorflow RunConfig is expected to save checkpoints -by having either save_checkpoints_steps or save_checkpoints_secs saved. -To avoid this conflict, we purposely set the save_checkpoints_steps value in -the RunConfig to be one step more than the total number of steps that the -inpainter estimator will run. - -Second: we need to specify warm start settings, as we are reloading the -GAN model into a different graph (specifically, one with a new z variable). -The warm start settings defined below reload all GAN variables and ignore the -new z variable (and the optimizer). - -Usage: - - def _generator(net, mode): - ... - - def _discriminator(net, condition, mode): - ... 
- - def _loss(gan_model, features, labels, add_summaries): - ... - - def optimizer(): - ... - - params = {} - config = tf.estimator.RunConfig() - tmp_dir = path/to/output/storage - - estimator = latent_gan_estimator.get_latent_gan_estimator( - _generator, _discriminator, _loss, optimizer, params, config, tmp_dir) - - def input_fn(): - ... - - estimator.train(input_fn=input_fn) - -See latent_gan_estimator_test.py or tensorflow_models/gan/face_inpainting for -further examples. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import functools -from tensorflow.contrib.gan.python import train as tfgan_train -from tensorflow.python.estimator import estimator -from tensorflow.python.estimator import model_fn as model_fn_lib -from tensorflow.python.ops import clip_ops -from tensorflow.python.ops import gradients_impl -from tensorflow.python.ops import random_ops -from tensorflow.python.ops import variable_scope -from tensorflow.python.summary import summary -from tensorflow.python.training import training_util - - -INPUT_NAME = 'new_var_z_input' # The name for the new z space input variable. -OPTIMIZER_NAME = 'latent_gan_optimizer' # The name for the new optimizer vars. - -__all__ = [ - 'get_latent_gan_estimator', -] - - -def _get_latent_gan_model_fn(generator_fn, discriminator_fn, loss_fn, - optimizer): - """Sets up a model function that wraps around a given GAN.""" - def model_fn(features, labels, mode, params): - """Model function defining an inpainting estimator.""" - batch_size = params['batch_size'] - z_shape = [batch_size] + params['z_shape'] - add_summaries = params['add_summaries'] - input_clip = params['input_clip'] - - z = variable_scope.get_variable( - name=INPUT_NAME, initializer=random_ops.truncated_normal(z_shape), - constraint=lambda x: clip_ops.clip_by_value(x, -input_clip, input_clip)) - - generator = functools.partial(generator_fn, mode=mode) - discriminator = functools.partial(discriminator_fn, mode=mode) - gan_model = tfgan_train.gan_model(generator_fn=generator, - discriminator_fn=discriminator, - real_data=labels, - generator_inputs=z, - check_shapes=False) - - loss = loss_fn(gan_model, features, labels, add_summaries) - - # Use a variable scope to make sure that estimator variables dont cause - # save/load problems when restoring from ckpts. - with variable_scope.variable_scope(OPTIMIZER_NAME): - opt = optimizer(learning_rate=params['learning_rate'], - **params['opt_kwargs']) - train_op = opt.minimize( - loss=loss, global_step=training_util.get_or_create_global_step(), - var_list=[z]) - - if add_summaries: - z_grads = gradients_impl.gradients(loss, z) - summary.scalar('z_loss/z_grads', clip_ops.global_norm(z_grads)) - summary.scalar('z_loss/loss', loss) - - return model_fn_lib.EstimatorSpec(mode=mode, - predictions=gan_model.generated_data, - loss=loss, - train_op=train_op) - return model_fn - - -def get_latent_gan_estimator(generator_fn, discriminator_fn, loss_fn, - optimizer, params, config, ckpt_dir, - warmstart_options=True): - """Gets an estimator that passes gradients to the input. - - This function takes in a generator and adds a trainable z variable that is - used as input to this generator_fn. The generator itself is treated as a black - box through which gradients can pass through without updating any weights. The - result is a trainable way to traverse the GAN latent space. The loss_fn is - used to actually train the z variable. 
The generator_fn and discriminator_fn - should be previously trained by the tfgan library (on reload, the variables - are expected to follow the tfgan format. It may be possible to use the - latent gan estimator with entirely custom GANs that do not use the tfgan - library as long as the appropriate variables are wired properly). - - Args: - generator_fn: a function defining a Tensorflow graph for a GAN generator. - The weights defined in this graph should already be defined in the given - checkpoint location. Should have 'mode' as an argument. - discriminator_fn: a function defining a Tensorflow graph for a GAN - discriminator. Should have 'mode' as an argument. - loss_fn: a function defining a Tensorflow graph for a GAN loss. Takes in a - GANModel tuple, features, labels, and add_summaries as inputs. - optimizer: a tf.Optimizer or a function that returns a tf.Optimizer with no - inputs. - params: An object containing the following parameters: - - batch_size: an int indicating the size of the training batch. - - z_shape: the desired shape of the input z values (not counting batch). - - learning_rate: a scalar or function defining a learning rate applied to - optimizer. - - input_clip: the amount to clip the x training variable by. - - add_summaries: whether or not to add summaries. - - opt_kwargs: optimizer kwargs. - config: tf.RunConfig. Should point model to output dir and should indicate - whether to save checkpoints (to avoid saving checkpoints, set - save_checkpoints_steps to a number larger than the number of train steps). - The model_dir field in the RunConfig should point to a directory WITHOUT - any saved checkpoints. - ckpt_dir: the directory where the model checkpoints live. The checkpoint is - used to warm start the underlying GAN. This should NOT be the same as - config.model_dir. - warmstart_options: boolean, None, or a WarmStartSettings object. If set to - True, uses a default WarmStartSettings object. If set to False or None, - does not use warm start. If using a custom WarmStartSettings object, make - sure that new variables are properly accounted for when reloading the - underlying GAN. Defaults to True. - Returns: - An estimator spec defining a GAN input training estimator. - """ - model_fn = _get_latent_gan_model_fn(generator_fn, discriminator_fn, - loss_fn, optimizer) - - if isinstance(warmstart_options, estimator.WarmStartSettings): - ws = warmstart_options - elif warmstart_options: - # Default WarmStart loads all variable names except INPUT_NAME and - # OPTIMIZER_NAME. - var_regex = '^(?!.*(%s|%s).*)' % (INPUT_NAME, OPTIMIZER_NAME) - ws = estimator.WarmStartSettings(ckpt_to_initialize_from=ckpt_dir, - vars_to_warm_start=var_regex) - else: - ws = None - - if 'opt_kwargs' not in params: - params['opt_kwargs'] = {} - - return estimator.Estimator(model_fn=model_fn, config=config, params=params, - warm_start_from=ws) diff --git a/tensorflow/contrib/gan/python/estimator/python/latent_gan_estimator_test.py b/tensorflow/contrib/gan/python/estimator/python/latent_gan_estimator_test.py deleted file mode 100644 index ac139e532e3..00000000000 --- a/tensorflow/contrib/gan/python/estimator/python/latent_gan_estimator_test.py +++ /dev/null @@ -1,119 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
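The warm-start logic removed above restores every checkpoint variable except the new latent input (INPUT_NAME) and its optimizer slots (OPTIMIZER_NAME) by building a negative-lookahead regex. A minimal sketch of that pattern with the public `tf.estimator.WarmStartSettings` API (TF 1.x style; the checkpoint path below is a placeholder, and the excluded names mirror the module's INPUT_NAME and OPTIMIZER_NAME constants):

```python
import tensorflow.compat.v1 as tf

# Names that should NOT be warm-started; these mirror INPUT_NAME and
# OPTIMIZER_NAME from the deleted module.
exclude = ('new_var_z_input', 'latent_gan_optimizer')

# Negative lookahead: match every variable whose name contains neither
# excluded substring.
vars_to_warm_start = '^(?!.*(%s|%s).*)' % exclude

ws = tf.estimator.WarmStartSettings(
    ckpt_to_initialize_from='/path/to/pretrained/gan/ckpt',  # placeholder path
    vars_to_warm_start=vars_to_warm_start)

# `ws` would then be handed to an Estimator via `warm_start_from=ws`.
```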
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Tests for latent_gan_estimator. - -See g3.tp.tensorflow.contrib.gan.python.estimator.python.latent_gan_estimator. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tempfile -import numpy as np -from tensorflow.contrib.gan.python.estimator.python import latent_gan_estimator -from tensorflow.python.estimator import run_config as run_config -from tensorflow.python.ops import array_ops -from tensorflow.python.ops import variable_scope -from tensorflow.python.ops.losses import losses -from tensorflow.python.platform import test -from tensorflow.python.training import training - - -class TrainInputEstimatorTest(test.TestCase): - - def test_get_input_training_estimator(self): - """Integration test to make sure the input_training_estimator works.""" - - # Create dummy test input tensors. - true_features = np.reshape(np.random.uniform(size=100), (10, 10)) - true_labels = np.reshape(np.random.uniform(size=100), (5, 20)) - expected_z_output = [[1, -1], [-1, 1]] - - # Fill out required parameters randomly, includes optimizer kwargs. - params = { - 'batch_size': 2, - 'z_shape': [2], - 'learning_rate': 1.0, - 'input_clip': 1.0, - 'add_summaries': False, - 'opt_kwargs': { - 'beta1': 0.1 - } - } - - input_z_shape = [params['batch_size']] + params['z_shape'] - - # Create dummy model functions that represent an underlying GANEstimator and - # the input training wrapper. Make sure that everything is wired up - # correctly in the internals of each dummy function. - def _generator(net, mode): - """The generator function will get the newly created z variable.""" - del mode - self.assertSequenceEqual(net.shape, input_z_shape) - gen_dummy_var = variable_scope.get_variable( - name='generator_dummy_variable', - initializer=array_ops.ones(input_z_shape)) - return net * gen_dummy_var - - def _discriminator(net, condition, mode): - """The discriminator function will get either the z variable or labels.""" - del condition, mode - try: - self.assertSequenceEqual(net.shape, true_labels.shape) - except AssertionError: - self.assertSequenceEqual(net.shape, input_z_shape) - return net - - def _loss(gan_model, features, labels, _): - """Make sure that features and labels are passed in from input.""" - self.assertTrue(np.array_equal(features, true_features)) - self.assertTrue(np.array_equal(labels, true_labels)) - return losses.absolute_difference(expected_z_output, - gan_model.generated_data) - - optimizer = training.AdamOptimizer - - # We are not loading checkpoints, so set the corresponding directory to a - # dummy directories. - tmp_dir = tempfile.mkdtemp() - config = run_config.RunConfig(model_dir=tmp_dir, - save_summary_steps=None, - save_checkpoints_steps=1, - save_checkpoints_secs=None) - - # Get the estimator. Disable warm start so that there is no attempted - # checkpoint reloading. 
- estimator = latent_gan_estimator.get_latent_gan_estimator( - _generator, _discriminator, _loss, optimizer, params, config, tmp_dir, - warmstart_options=None) - - # Train for a few steps. - def dummy_input(): - return true_features, true_labels - estimator.train(input_fn=dummy_input, steps=10) - - # Make sure the generator variables did not change, but the z variables did - # change. - self.assertTrue(np.array_equal( - estimator.get_variable_value('Generator/generator_dummy_variable'), - np.ones(input_z_shape))) - self.assertTrue(np.array_equal( - estimator.get_variable_value('new_var_z_input'), - expected_z_output)) - - -if __name__ == '__main__': - test.main() diff --git a/tensorflow/contrib/gan/python/estimator/python/stargan_estimator.py b/tensorflow/contrib/gan/python/estimator/python/stargan_estimator.py deleted file mode 100644 index 341bdf9fbbc..00000000000 --- a/tensorflow/contrib/gan/python/estimator/python/stargan_estimator.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""`tf.Learn` components for `GANEstimator`.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from tensorflow.contrib.gan.python.estimator.python import stargan_estimator_impl -# pylint: disable=wildcard-import -from tensorflow.contrib.gan.python.estimator.python.stargan_estimator_impl import * -# pylint: enable=wildcard-import -from tensorflow.python.util.all_util import remove_undocumented - -__all__ = stargan_estimator_impl.__all__ -remove_undocumented(__name__, __all__) diff --git a/tensorflow/contrib/gan/python/estimator/python/stargan_estimator_impl.py b/tensorflow/contrib/gan/python/estimator/python/stargan_estimator_impl.py deleted file mode 100644 index 06a1480c072..00000000000 --- a/tensorflow/contrib/gan/python/estimator/python/stargan_estimator_impl.py +++ /dev/null @@ -1,363 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
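The integration test above checks the central trick of the latent GAN estimator: only the new z variable is optimized while the pretrained generator weights stay frozen. That reduces to passing `var_list` to the optimizer. A self-contained toy version of the pattern, in graph-mode TF 1.x style with made-up shapes and a made-up target:

```python
import tensorflow.compat.v1 as tf
tf.disable_eager_execution()  # graph-mode sketch, matching the contrib-era API

# Trainable latent input, clipped to [-1, 1] as in the estimator above.
z = tf.get_variable(
    'z', shape=[2, 4], initializer=tf.truncated_normal_initializer(),
    constraint=lambda x: tf.clip_by_value(x, -1.0, 1.0))

# A stand-in "generator" weight that is deliberately left out of var_list,
# so it never receives gradient updates.
gen_weight = tf.get_variable('gen_weight', initializer=tf.ones([2, 4]))
generated = z * gen_weight

loss = tf.losses.absolute_difference(tf.zeros([2, 4]), generated)

# Only z appears in var_list, so only the latent input moves.
train_op = tf.train.GradientDescentOptimizer(0.1).minimize(loss, var_list=[z])

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  for _ in range(200):
    sess.run(train_op)
  print(sess.run(loss))        # close to zero
  print(sess.run(gen_weight))  # still all ones
```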
-# ============================================================================== -"""A TF-GAN-backed StarGAN Estimator.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import functools -import enum - -from tensorflow.contrib.framework.python.ops import variables as variable_lib -from tensorflow.contrib.gan.python import namedtuples as tfgan_tuples -from tensorflow.contrib.gan.python import train as tfgan_train -from tensorflow.contrib.gan.python.eval.python import summaries as tfgan_summaries -from tensorflow.python.estimator import estimator -from tensorflow.python.estimator import model_fn as model_fn_lib -from tensorflow.python.framework import ops -from tensorflow.python.ops import metrics as metrics_lib -from tensorflow.python.ops import variable_scope -from tensorflow.python.util import tf_inspect as inspect - -__all__ = ['StarGANEstimator', 'SummaryType'] - - -class SummaryType(enum.IntEnum): - NONE = 0 - VARIABLES = 1 - IMAGES = 2 - IMAGE_COMPARISON = 3 - - -_summary_type_map = { - SummaryType.VARIABLES: tfgan_summaries.add_gan_model_summaries, - SummaryType.IMAGES: tfgan_summaries.add_stargan_image_summaries, -} - - -class StarGANEstimator(estimator.Estimator): - """An estimator for Generative Adversarial Networks (GANs). - - This Estimator is backed by TFGAN. The network functions follow the TFGAN API - except for one exception: if either `generator_fn` or `discriminator_fn` have - an argument called `mode`, then the tf.Estimator mode is passed in for that - argument. This helps with operations like batch normalization, which have - different train and evaluation behavior. - - Example: - - ```python - import tensorflow as tf - tfgan = tf.contrib.gan - - # See TFGAN's `train.py` for a description of the generator and - # discriminator API. - def generator_fn(generator_inputs): - ... - return generated_data - - def discriminator_fn(data, conditioning): - ... - return logits - - # Create GAN estimator. - stargan_estimator = tfgan.estimator.StarGANEstimator( - model_dir, - generator_fn=generator_fn, - discriminator_fn=discriminator_fn, - loss_fn=loss_fn, - generator_optimizer=tf.compat.v1.train.AdamOptimizer(0.1, 0.5), - discriminator_optimizer=tf.compat.v1.train.AdamOptimizer(0.1, 0.5)) - - # Train estimator. - stargan_estimator.train(train_input_fn, steps) - - # Evaluate resulting estimator. - stargan_estimator.evaluate(eval_input_fn) - - # Generate samples from generator. - stargan_estimator = np.array([ - x for x in stargan_estimator.predict(predict_input_fn)]) - ``` - """ - - def __init__(self, - model_dir=None, - generator_fn=None, - discriminator_fn=None, - loss_fn=None, - generator_optimizer=None, - discriminator_optimizer=None, - get_hooks_fn=None, - get_eval_metric_ops_fn=None, - add_summaries=None, - use_loss_summaries=True, - config=None): - """Initializes a StarGANEstimator instance. - - Args: - model_dir: Directory to save model parameters, graph and etc. This can - also be used to load checkpoints from the directory into a estimator to - continue training a previously saved model. - generator_fn: A python function that takes a Tensor, Tensor list, or - Tensor dictionary as inputs and returns the outputs of the GAN - generator. See `TFGAN` for more details and examples. Additionally, if - it has an argument called `mode`, the Estimator's `mode` will be passed - in (ex TRAIN, EVAL, PREDICT). This is useful for things like batch - normalization. 
- discriminator_fn: A python function that takes the output of - `generator_fn` or real data in the GAN setup, and `input_data`. Outputs - a Tensor in the range [-inf, inf]. See `TFGAN` for more details and - examples. - loss_fn: The loss function on the generator. Takes a `StarGANModel` - namedtuple and return a `GANLoss` namedtuple. - generator_optimizer: The optimizer for generator updates, or a function - that takes no arguments and returns an optimizer. This function will be - called when the default graph is the `StarGANEstimator`'s graph, so - utilities like `tf.contrib.framework.get_or_create_global_step` will - work. - discriminator_optimizer: Same as `generator_optimizer`, but for the - discriminator updates. - get_hooks_fn: A function that takes a `GANTrainOps` tuple and returns a - list of hooks. These hooks are run on the generator and discriminator - train ops, and can be used to implement the GAN training scheme. - Defaults to `train.get_sequential_train_hooks()`. - get_eval_metric_ops_fn: A function that takes a `GANModel`, and returns a - dict of metric results keyed by name. The output of this function is - passed into `tf.estimator.EstimatorSpec` during evaluation. - add_summaries: `None`, a single `SummaryType`, or a list of `SummaryType`. - use_loss_summaries: If `True`, add loss summaries. If `False`, does not. - If `None`, uses defaults. - config: `RunConfig` object to configure the runtime settings. - - Raises: - ValueError: If loss functions aren't callable. - ValueError: If `use_loss_summaries` isn't boolean or `None`. - ValueError: If `get_hooks_fn` isn't callable or `None`. - """ - if not callable(loss_fn): - raise ValueError('loss_fn must be callable.') - if use_loss_summaries not in [True, False, None]: - raise ValueError('use_loss_summaries must be True, False or None.') - if get_hooks_fn is not None and not callable(get_hooks_fn): - raise TypeError('get_hooks_fn must be callable.') - - def _model_fn(features, labels, mode): - """StarGANEstimator model function.""" - if mode not in [ - model_fn_lib.ModeKeys.TRAIN, model_fn_lib.ModeKeys.EVAL, - model_fn_lib.ModeKeys.PREDICT - ]: - raise ValueError('Mode not recognized: %s' % mode) - - if mode == model_fn_lib.ModeKeys.PREDICT: - input_data = features[0] - input_data_domain_label = features[1] - else: - input_data = features # rename inputs for clarity - input_data_domain_label = labels # rename inputs for clarity - - # Make StarGANModel, which encapsulates the GAN model architectures. - gan_model = _get_gan_model(mode, generator_fn, discriminator_fn, - input_data, input_data_domain_label, - add_summaries) - - # Make the EstimatorSpec, which incorporates the StarGANModel, losses, - # eval, metrics, and optimizers (if required). 
- return _get_estimator_spec(mode, gan_model, loss_fn, - get_eval_metric_ops_fn, generator_optimizer, - discriminator_optimizer, get_hooks_fn) - - super(StarGANEstimator, self).__init__( - model_fn=_model_fn, model_dir=model_dir, config=config) - - -def _get_gan_model(mode, - generator_fn, - discriminator_fn, - input_data, - input_data_domain_label, - add_summaries, - generator_scope='Generator'): - """Makes the StarGANModel tuple.""" - if mode == model_fn_lib.ModeKeys.PREDICT: - gan_model = _make_prediction_gan_model(input_data, input_data_domain_label, - generator_fn, generator_scope) - else: # model_fn_lib.ModeKeys.TRAIN or model_fn_lib.ModeKeys.EVAL - gan_model = _make_gan_model(generator_fn, discriminator_fn, input_data, - input_data_domain_label, generator_scope, - add_summaries, mode) - - return gan_model - - -def _get_estimator_spec(mode, - gan_model, - loss_fn, - get_eval_metric_ops_fn, - generator_optimizer, - discriminator_optimizer, - get_hooks_fn=None): - """Get the EstimatorSpec for the current mode.""" - if mode == model_fn_lib.ModeKeys.PREDICT: - estimator_spec = model_fn_lib.EstimatorSpec( - mode=mode, predictions=gan_model.generated_data) - else: - gan_loss = loss_fn(gan_model) - if mode == model_fn_lib.ModeKeys.EVAL: - estimator_spec = _get_eval_estimator_spec(gan_model, gan_loss, - get_eval_metric_ops_fn) - else: # model_fn_lib.ModeKeys.TRAIN: - gopt = ( - generator_optimizer() - if callable(generator_optimizer) else generator_optimizer) - dopt = ( - discriminator_optimizer() - if callable(discriminator_optimizer) else discriminator_optimizer) - get_hooks_fn = get_hooks_fn or tfgan_train.get_sequential_train_hooks() - estimator_spec = _get_train_estimator_spec(gan_model, gan_loss, gopt, - dopt, get_hooks_fn) - - return estimator_spec - - -def _make_gan_model(generator_fn, discriminator_fn, input_data, - input_data_domain_label, generator_scope, add_summaries, - mode): - """Construct a `StarGANModel`, and optionally pass in `mode`.""" - # If network functions have an argument `mode`, pass mode to it. - if 'mode' in inspect.getargspec(generator_fn).args: - generator_fn = functools.partial(generator_fn, mode=mode) - if 'mode' in inspect.getargspec(discriminator_fn).args: - discriminator_fn = functools.partial(discriminator_fn, mode=mode) - gan_model = tfgan_train.stargan_model( - generator_fn, - discriminator_fn, - input_data, - input_data_domain_label, - generator_scope=generator_scope) - if add_summaries: - if not isinstance(add_summaries, (tuple, list)): - add_summaries = [add_summaries] - with ops.name_scope(None): - for summary_type in add_summaries: - _summary_type_map[summary_type](gan_model) - - return gan_model - - -def _make_prediction_gan_model(input_data, input_data_domain_label, - generator_fn, generator_scope): - """Make a `StarGANModel` from just the generator.""" - # If `generator_fn` has an argument `mode`, pass mode to it. 
- if 'mode' in inspect.getargspec(generator_fn).args: - generator_fn = functools.partial( - generator_fn, mode=model_fn_lib.ModeKeys.PREDICT) - with variable_scope.variable_scope(generator_scope) as gen_scope: - # pylint:disable=protected-access - input_data = tfgan_train._convert_tensor_or_l_or_d(input_data) - input_data_domain_label = tfgan_train._convert_tensor_or_l_or_d( - input_data_domain_label) - # pylint:enable=protected-access - generated_data = generator_fn(input_data, input_data_domain_label) - generator_variables = variable_lib.get_trainable_variables(gen_scope) - - return tfgan_tuples.StarGANModel( - input_data=input_data, - input_data_domain_label=None, - generated_data=generated_data, - generated_data_domain_target=input_data_domain_label, - reconstructed_data=None, - discriminator_input_data_source_predication=None, - discriminator_generated_data_source_predication=None, - discriminator_input_data_domain_predication=None, - discriminator_generated_data_domain_predication=None, - generator_variables=generator_variables, - generator_scope=generator_scope, - generator_fn=generator_fn, - discriminator_variables=None, - discriminator_scope=None, - discriminator_fn=None) - - -def _get_eval_estimator_spec(gan_model, - gan_loss, - get_eval_metric_ops_fn=None, - name=None): - """Return an EstimatorSpec for the eval case.""" - scalar_loss = gan_loss.generator_loss + gan_loss.discriminator_loss - with ops.name_scope(None, 'metrics', - [gan_loss.generator_loss, gan_loss.discriminator_loss]): - - def _summary_key(head_name, val): - return '%s/%s' % (val, head_name) if head_name else val - - eval_metric_ops = { - _summary_key(name, 'generator_loss'): - metrics_lib.mean(gan_loss.generator_loss), - _summary_key(name, 'discriminator_loss'): - metrics_lib.mean(gan_loss.discriminator_loss) - } - if get_eval_metric_ops_fn is not None: - custom_eval_metric_ops = get_eval_metric_ops_fn(gan_model) - if not isinstance(custom_eval_metric_ops, dict): - raise TypeError('get_eval_metric_ops_fn must return a dict, ' - 'received: {}'.format(custom_eval_metric_ops)) - eval_metric_ops.update(custom_eval_metric_ops) - return model_fn_lib.EstimatorSpec( - mode=model_fn_lib.ModeKeys.EVAL, - predictions=gan_model.generated_data, - loss=scalar_loss, - eval_metric_ops=eval_metric_ops) - - -def _get_train_estimator_spec(gan_model, - gan_loss, - generator_optimizer, - discriminator_optimizer, - get_hooks_fn, - train_op_fn=tfgan_train.gan_train_ops): - """Return an EstimatorSpec for the train case.""" - scalar_loss = gan_loss.generator_loss + gan_loss.discriminator_loss - train_ops = train_op_fn(gan_model, gan_loss, generator_optimizer, - discriminator_optimizer) - training_hooks = get_hooks_fn(train_ops) - return model_fn_lib.EstimatorSpec( - loss=scalar_loss, - mode=model_fn_lib.ModeKeys.TRAIN, - train_op=train_ops.global_step_inc_op, - training_hooks=training_hooks) - - -def stargan_prediction_input_fn_wrapper(fn): - """StarGAN Estimator prediction input_fn wrapper. - - Since estimator will disregard the "label" variable pass to the model, we will - use a wrapper to pack the (feature, label) tuple as feature passed to the - model. - - Args: - fn: input_fn for the prediction. - - Returns: - A tuple ((feature, label), None) where the second element is the dummy label - to be disregarded and the first element is the true input to the estimator. 
- """ - - def new_fn(): - return fn(), None - - return new_fn diff --git a/tensorflow/contrib/gan/python/estimator/python/stargan_estimator_test.py b/tensorflow/contrib/gan/python/estimator/python/stargan_estimator_test.py deleted file mode 100644 index 0fcd1b7924e..00000000000 --- a/tensorflow/contrib/gan/python/estimator/python/stargan_estimator_test.py +++ /dev/null @@ -1,306 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Tests for TF-GAN's stargan_estimator.py.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import shutil -import tempfile - -from absl.testing import parameterized -import numpy as np - -from tensorflow.contrib import layers -from tensorflow.contrib.gan.python import namedtuples as tfgan_tuples -from tensorflow.contrib.gan.python.estimator.python import stargan_estimator_impl as estimator -from tensorflow.python.estimator import model_fn as model_fn_lib -from tensorflow.python.estimator.inputs import numpy_io -from tensorflow.python.framework import ops -from tensorflow.python.ops import array_ops -from tensorflow.python.ops import math_ops -from tensorflow.python.ops import metrics as metrics_lib -from tensorflow.python.ops import variable_scope -from tensorflow.python.platform import test -from tensorflow.python.summary.writer import writer_cache -from tensorflow.python.training import learning_rate_decay -from tensorflow.python.training import training -from tensorflow.python.training import training_util - - -def dummy_generator_fn(input_data, input_data_domain_label, mode): - del input_data_domain_label, mode - - return variable_scope.get_variable('dummy_g', initializer=0.5) * input_data - - -def dummy_discriminator_fn(input_data, num_domains, mode): - del mode - - hidden = layers.flatten(input_data) - output_src = math_ops.reduce_mean(hidden, axis=1) - output_cls = layers.fully_connected( - inputs=hidden, num_outputs=num_domains, scope='debug') - - return output_src, output_cls - - -class StarGetGANModelTest(test.TestCase, parameterized.TestCase): - """Tests that `StarGetGANModel` produces the correct model.""" - - @parameterized.named_parameters(('train', model_fn_lib.ModeKeys.TRAIN), - ('eval', model_fn_lib.ModeKeys.EVAL), - ('predict', model_fn_lib.ModeKeys.PREDICT)) - def test_get_gan_model(self, mode): - with ops.Graph().as_default(): - input_data = array_ops.ones([6, 4, 4, 3]) - input_data_domain_label = array_ops.one_hot([0] * 6, 5) - gan_model = estimator._get_gan_model( - mode, - dummy_generator_fn, - dummy_discriminator_fn, - input_data, - input_data_domain_label, - add_summaries=False) - - self.assertEqual(input_data, gan_model.input_data) - self.assertIsNotNone(gan_model.generated_data) - self.assertIsNotNone(gan_model.generated_data_domain_target) - self.assertLen(gan_model.generator_variables, 1) - 
self.assertIsNotNone(gan_model.generator_scope) - self.assertIsNotNone(gan_model.generator_fn) - if mode == model_fn_lib.ModeKeys.PREDICT: - self.assertIsNone(gan_model.input_data_domain_label) - self.assertEqual(input_data_domain_label, - gan_model.generated_data_domain_target) - self.assertIsNone(gan_model.reconstructed_data) - self.assertIsNone(gan_model.discriminator_input_data_source_predication) - self.assertIsNone( - gan_model.discriminator_generated_data_source_predication) - self.assertIsNone(gan_model.discriminator_input_data_domain_predication) - self.assertIsNone( - gan_model.discriminator_generated_data_domain_predication) - self.assertIsNone(gan_model.discriminator_variables) - self.assertIsNone(gan_model.discriminator_scope) - self.assertIsNone(gan_model.discriminator_fn) - else: - self.assertEqual(input_data_domain_label, - gan_model.input_data_domain_label) - self.assertIsNotNone(gan_model.reconstructed_data.shape) - self.assertIsNotNone( - gan_model.discriminator_input_data_source_predication) - self.assertIsNotNone( - gan_model.discriminator_generated_data_source_predication) - self.assertIsNotNone( - gan_model.discriminator_input_data_domain_predication) - self.assertIsNotNone( - gan_model.discriminator_generated_data_domain_predication) - self.assertLen(gan_model.discriminator_variables, 2) # 1 FC layer - self.assertIsNotNone(gan_model.discriminator_scope) - self.assertIsNotNone(gan_model.discriminator_fn) - - -def get_dummy_gan_model(): - """Similar to get_gan_model().""" - # TODO(joelshor): Find a better way of creating a variable scope. - with variable_scope.variable_scope('generator') as gen_scope: - gen_var = variable_scope.get_variable('dummy_var', initializer=0.0) - with variable_scope.variable_scope('discriminator') as dis_scope: - dis_var = variable_scope.get_variable('dummy_var', initializer=0.0) - return tfgan_tuples.StarGANModel( - input_data=array_ops.ones([1, 2, 2, 3]), - input_data_domain_label=array_ops.ones([1, 2]), - generated_data=array_ops.ones([1, 2, 2, 3]), - generated_data_domain_target=array_ops.ones([1, 2]), - reconstructed_data=array_ops.ones([1, 2, 2, 3]), - discriminator_input_data_source_predication=array_ops.ones([1]) * dis_var, - discriminator_generated_data_source_predication=array_ops.ones( - [1]) * gen_var * dis_var, - discriminator_input_data_domain_predication=array_ops.ones([1, 2 - ]) * dis_var, - discriminator_generated_data_domain_predication=array_ops.ones([1, 2]) * - gen_var * dis_var, - generator_variables=[gen_var], - generator_scope=gen_scope, - generator_fn=None, - discriminator_variables=[dis_var], - discriminator_scope=dis_scope, - discriminator_fn=None) - - -def dummy_loss_fn(gan_model): - loss = math_ops.reduce_sum( - gan_model.discriminator_input_data_domain_predication - - gan_model.discriminator_generated_data_domain_predication) - loss += math_ops.reduce_sum(gan_model.input_data - gan_model.generated_data) - return tfgan_tuples.GANLoss(loss, loss) - - -def get_metrics(gan_model): - return { - 'mse_custom_metric': - metrics_lib.mean_squared_error(gan_model.input_data, - gan_model.generated_data) - } - - -class GetEstimatorSpecTest(test.TestCase, parameterized.TestCase): - """Tests that the EstimatorSpec is constructed appropriately.""" - - @classmethod - def setUpClass(cls): - super(GetEstimatorSpecTest, cls).setUpClass() - cls._generator_optimizer = training.GradientDescentOptimizer(1.0) - cls._discriminator_optimizer = training.GradientDescentOptimizer(1.0) - - @parameterized.named_parameters(('train', 
model_fn_lib.ModeKeys.TRAIN), - ('eval', model_fn_lib.ModeKeys.EVAL), - ('predict', model_fn_lib.ModeKeys.PREDICT)) - def test_get_estimator_spec(self, mode): - with ops.Graph().as_default(): - self._gan_model = get_dummy_gan_model() - spec = estimator._get_estimator_spec( - mode, - self._gan_model, - loss_fn=dummy_loss_fn, - get_eval_metric_ops_fn=get_metrics, - generator_optimizer=self._generator_optimizer, - discriminator_optimizer=self._discriminator_optimizer) - - self.assertEqual(mode, spec.mode) - if mode == model_fn_lib.ModeKeys.PREDICT: - self.assertEqual(self._gan_model.generated_data, spec.predictions) - elif mode == model_fn_lib.ModeKeys.TRAIN: - self.assertShapeEqual(np.array(0), spec.loss) # must be a scalar - self.assertIsNotNone(spec.train_op) - self.assertIsNotNone(spec.training_hooks) - elif mode == model_fn_lib.ModeKeys.EVAL: - self.assertEqual(self._gan_model.generated_data, spec.predictions) - self.assertShapeEqual(np.array(0), spec.loss) # must be a scalar - self.assertIsNotNone(spec.eval_metric_ops) - - -# TODO(joelshor): Add pandas test. -class StarGANEstimatorIntegrationTest(test.TestCase): - - def setUp(self): - self._model_dir = tempfile.mkdtemp() - - def tearDown(self): - if self._model_dir: - writer_cache.FileWriterCache.clear() - shutil.rmtree(self._model_dir) - - def _test_complete_flow(self, - train_input_fn, - eval_input_fn, - predict_input_fn, - prediction_size, - lr_decay=False): - - def make_opt(): - gstep = training_util.get_or_create_global_step() - lr = learning_rate_decay.exponential_decay(1.0, gstep, 10, 0.9) - return training.GradientDescentOptimizer(lr) - - gopt = make_opt if lr_decay else training.GradientDescentOptimizer(1.0) - dopt = make_opt if lr_decay else training.GradientDescentOptimizer(1.0) - est = estimator.StarGANEstimator( - generator_fn=dummy_generator_fn, - discriminator_fn=dummy_discriminator_fn, - loss_fn=dummy_loss_fn, - generator_optimizer=gopt, - discriminator_optimizer=dopt, - get_eval_metric_ops_fn=get_metrics, - model_dir=self._model_dir) - - # TRAIN - num_steps = 10 - est.train(train_input_fn, steps=num_steps) - - # EVALUTE - scores = est.evaluate(eval_input_fn) - self.assertEqual(num_steps, scores[ops.GraphKeys.GLOBAL_STEP]) - self.assertIn('loss', scores) - self.assertEqual(scores['discriminator_loss'] + scores['generator_loss'], - scores['loss']) - self.assertIn('mse_custom_metric', scores) - - # PREDICT - predictions = np.array([x for x in est.predict(predict_input_fn)]) - - self.assertAllEqual(prediction_size, predictions.shape) - - @staticmethod - def _numpy_input_fn_wrapper(numpy_input_fn, batch_size, label_size): - """Wrapper to remove the dictionary in numpy_input_fn. - - NOTE: - We create the domain_label here because the model expect a fully define - batch_size from the input. 
- - Args: - numpy_input_fn: input_fn created from numpy_io - batch_size: (int) number of items for each batch - label_size: (int) number of domains - - Returns: - a new input_fn - """ - - def new_input_fn(): - features = numpy_input_fn() - return features['x'], array_ops.one_hot([0] * batch_size, label_size) - - return new_input_fn - - def test_numpy_input_fn(self): - """Tests complete flow with numpy_input_fn.""" - batch_size = 5 - img_size = 8 - channel_size = 3 - label_size = 3 - image_data = np.zeros( - [batch_size, img_size, img_size, channel_size], dtype=np.float32) - train_input_fn = numpy_io.numpy_input_fn( - x={'x': image_data}, - batch_size=batch_size, - num_epochs=None, - shuffle=True) - eval_input_fn = numpy_io.numpy_input_fn( - x={'x': image_data}, batch_size=batch_size, shuffle=False) - predict_input_fn = numpy_io.numpy_input_fn( - x={'x': image_data}, shuffle=False) - - train_input_fn = self._numpy_input_fn_wrapper(train_input_fn, batch_size, - label_size) - eval_input_fn = self._numpy_input_fn_wrapper(eval_input_fn, batch_size, - label_size) - predict_input_fn = self._numpy_input_fn_wrapper(predict_input_fn, - batch_size, label_size) - - predict_input_fn = estimator.stargan_prediction_input_fn_wrapper( - predict_input_fn) - - self._test_complete_flow( - train_input_fn=train_input_fn, - eval_input_fn=eval_input_fn, - predict_input_fn=predict_input_fn, - prediction_size=[batch_size, img_size, img_size, channel_size]) - - -if __name__ == '__main__': - test.main() diff --git a/tensorflow/contrib/gan/python/estimator/python/tpu_gan_estimator.py b/tensorflow/contrib/gan/python/estimator/python/tpu_gan_estimator.py deleted file mode 100644 index deb381f7be3..00000000000 --- a/tensorflow/contrib/gan/python/estimator/python/tpu_gan_estimator.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""`tf.Learn` components for `TPUGANEstimator`.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from tensorflow.contrib.gan.python.estimator.python import tpu_gan_estimator_impl -# pylint: disable=wildcard-import -from tensorflow.contrib.gan.python.estimator.python.tpu_gan_estimator_impl import * -# pylint: enable=wildcard-import -from tensorflow.python.util.all_util import remove_undocumented - -__all__ = tpu_gan_estimator_impl.__all__ -remove_undocumented(__name__, __all__) diff --git a/tensorflow/contrib/gan/python/estimator/python/tpu_gan_estimator_impl.py b/tensorflow/contrib/gan/python/estimator/python/tpu_gan_estimator_impl.py deleted file mode 100644 index 8ed64e869a0..00000000000 --- a/tensorflow/contrib/gan/python/estimator/python/tpu_gan_estimator_impl.py +++ /dev/null @@ -1,423 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
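The `_numpy_input_fn_wrapper` above exists because `numpy_input_fn` yields a feature dict, while the StarGAN model function expects a `(tensor, one_hot_label)` pair with a fully defined batch size. A small sketch of the same unwrapping, with illustrative names and shapes:

```python
import numpy as np
import tensorflow.compat.v1 as tf

batch_size, label_size = 5, 3
images = np.zeros([batch_size, 8, 8, 3], dtype=np.float32)

# numpy_input_fn returns an input_fn that yields a {'x': tensor} dict.
base_input_fn = tf.estimator.inputs.numpy_input_fn(
    x={'x': images}, batch_size=batch_size, num_epochs=None, shuffle=True)

def wrapped_input_fn():
  # Unpack the dict and synthesize a one-hot domain label whose batch
  # dimension is fully defined, as the wrapper above does.
  features = base_input_fn()
  return features['x'], tf.one_hot([0] * batch_size, label_size)
```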
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""A TF-GAN-backed GAN Estimator that works on TPU.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from tensorflow.contrib.gan.python import namedtuples as tfgan_tuples -from tensorflow.contrib.gan.python import train as tfgan_train -from tensorflow.contrib.gan.python.estimator.python import gan_estimator_impl as gan_estimator_lib -from tensorflow.contrib.tpu.python.tpu import tpu_estimator -from tensorflow.contrib.tpu.python.tpu import tpu_optimizer -from tensorflow.contrib.training.python.training import training -from tensorflow.python.estimator import model_fn as model_fn_lib -from tensorflow.python.framework import ops -from tensorflow.python.ops import control_flow_ops -from tensorflow.python.ops import metrics as metrics_lib -from tensorflow.python.ops.losses import losses - -__all__ = [ - 'TPUGANEstimator', -] - - -class TPUGANEstimator(tpu_estimator.TPUEstimator): - """An estimator for Generative Adversarial Networks (GANs) on TPU. - - This Estimator is backed by TFGAN. It is similar to `tfgan.GANEstimator`, - but works on TPU. - - Example: - - ```python - import tensorflow as tf - tfgan = tf.contrib.gan - - # See TFGAN's `train.py` for a description of the generator and - # discriminator API. - def generator_fn(generator_inputs): - ... - return generated_data - - def discriminator_fn(data, conditioning): - ... - return logits - - # Create GAN estimator. - config = tpu_config.RunConfig(model_dir='/my/dir') - gan_estimator = tfgan.estimator.TPUGANEstimator( - generator_fn=generator_fn, - discriminator_fn=discriminator_fn, - generator_loss_fn=tfgan.losses.wasserstein_generator_loss, - discriminator_loss_fn=tfgan.losses.wasserstein_discriminator_loss, - generator_optimizer=tf.compat.v1.train.AdamOptimizer(0.1, 0.5), - discriminator_optimizer=tf.compat.v1.train.AdamOptimizer(0.1, 0.5), - train_batch_size=4, - config=config) - - # Train estimator. - gan_estimator.train(train_input_fn, train_steps) - - # Evaluate resulting estimator. - gan_estimator.evaluate(eval_input_fn, eval_steps) - - # Generate samples from generator. - predictions = np.array([ - x['generated_data'] for x in gan_estimator.predict(predict_input_fn)]) - ``` - """ - - def __init__(self, - # Arguments to construct the `model_fn`. - generator_fn=None, - discriminator_fn=None, - generator_loss_fn=None, - discriminator_loss_fn=None, - generator_optimizer=None, - discriminator_optimizer=None, - get_eval_metric_ops_fn=None, - add_summaries=None, - joint_train=False, - gan_train_steps=tfgan_tuples.GANTrainSteps(1, 1), - # TPUEstimator options. - model_dir=None, - config=None, - params=None, - use_tpu=True, - train_batch_size=None, - eval_batch_size=None, - predict_batch_size=None, - batch_axis=None, - eval_on_tpu=True, - export_to_tpu=True, - warm_start_from=None): - """Initializes a TPUGANEstimator instance. 
- - Args: - generator_fn: A python function that takes a Tensor, Tensor list, or - Tensor dictionary as inputs and returns the outputs of the GAN - generator. See `TFGAN` for more details and examples. Additionally, if - it has an argument called `mode`, the Estimator's `mode` will be passed - in (ex TRAIN, EVAL, PREDICT). This is useful for things like batch - normalization. - discriminator_fn: A python function that takes the output of - `generator_fn` or real data in the GAN setup, and `generator_inputs`. - Outputs a Tensor in the range [-inf, inf]. See `TFGAN` for more details - and examples. - generator_loss_fn: The loss function on the generator. Takes a `GANModel` - tuple. - discriminator_loss_fn: The loss function on the discriminator. Takes a - `GANModel` tuple. - generator_optimizer: The optimizer for generator updates, or a function - that takes no arguments and returns an optimizer. This function will - be called when the default graph is the `GANEstimator`'s graph, so - utilities like `tf.contrib.framework.get_or_create_global_step` will - work. - discriminator_optimizer: Same as `generator_optimizer`, but for the - discriminator updates. - get_eval_metric_ops_fn: A function that takes a list of arguments and - returns a dict of metric results keyed by name. The output of this - function is passed into `tf.estimator.EstimatorSpec` during evaluation. - The arguments must be: - * generator_inputs - * generated_data - * real_data - * discriminator_real_outputs - * discriminator_gen_outputs - add_summaries: `None`, a single `SummaryType`, or a list of `SummaryType`. - This is ignored for jobs that run on TPU, such as the train job if - `use_tpu` is `True` or the eval job if `eval_on_tpu` is `True`. - joint_train: A Python boolean. If `True`, jointly train the generator and - the discriminator. If `False`, sequentially train them. See `train.py` - in TFGAN for more details on the differences between the two GAN - training methods. - gan_train_steps: A `tfgan.GANTrainSteps` named tuple describing the ratio - of generator to discriminator steps. For now, only supports 1:1 - training. - model_dir: Same as `TPUEstimator`: Directory to save model parameters, - graph and etc. This can also be used to load checkpoints from the - directory into a estimator to continue training a previously saved - model. If `None`, the model_dir in `config` will be used if set. If both - are set, they must be same. If both are `None`, a temporary directory - will be used. - config: Same as `TPUEstimator`: An `tpu_config.RunConfig` configuration - object. Cannot be `None`. - params: Same as `TPUEstimator`: An optional `dict` of hyper parameters - that will be passed into `input_fn` and `model_fn`. Keys are names of - parameters, values are basic python types. There are reserved keys for - `TPUEstimator`, including 'batch_size'. - use_tpu: Same as `TPUEstimator`: A bool indicating whether TPU support is - enabled. Currently, TPU training and evaluation respect this bit, but - eval_on_tpu can override execution of eval. See below. Predict still - happens on CPU. - train_batch_size: Same as `TPUEstimator`: An int representing the global - training batch size. TPUEstimator transforms this global batch size to a - per-shard batch size, as params['batch_size'], when calling `input_fn` - and `model_fn`. Cannot be `None` if `use_tpu` is `True`. Must be - divisible by total number of replicas. - eval_batch_size: Same as `TPUEstimator`: An int representing evaluation - batch size. 
Must be divisible by total number of replicas. - predict_batch_size: Same as `TPUEstimator`: An int representing the - prediction batch size. Must be divisible by total number of replicas. - batch_axis: Same as `TPUEstimator`: A python tuple of int values - describing how each tensor produced by the Estimator `input_fn` should - be split across the TPU compute shards. For example, if your input_fn - produced (images, labels) where the images tensor is in `HWCN` format, - your shard dimensions would be [3, 0], where 3 corresponds to the `N` - dimension of your images Tensor, and 0 corresponds to the dimension - along which to split the labels to match up with the corresponding - images. If None is supplied, and per_host_input_for_training is True, - batches will be sharded based on the major dimension. If - tpu_config.per_host_input_for_training is False or `PER_HOST_V2`, - batch_axis is ignored. - eval_on_tpu: Same as `TPUEstimator`: If False, evaluation runs on CPU or - GPU. In this case, the model_fn must return `EstimatorSpec` when called - with `mode` as `EVAL`. - export_to_tpu: Same as `TPUEstimator`: If True, `export_savedmodel()` - exports a metagraph for serving on TPU besides the one on CPU. - warm_start_from: Same as `TPUEstimator`: Optional string filepath to a - checkpoint or SavedModel to warm-start from, or a - `tf.estimator.WarmStartSettings` object to fully configure - warm-starting. If the string filepath is provided instead of a - `WarmStartSettings`, then all variables are warm-started, and it is - assumed that vocabularies and Tensor names are unchanged. - - Raises: - ValueError: If loss functions aren't callable. - ValueError: If `gan_train_steps` isn't a `tfgan_tuples.GANTrainSteps` - tuple. - ValueError: If `gan_train_steps` isn't 1:1 training. - """ - if not callable(generator_loss_fn): - raise ValueError('generator_loss_fn must be callable.') - if not callable(discriminator_loss_fn): - raise ValueError('discriminator_loss_fn must be callable.') - if not isinstance(gan_train_steps, tfgan_tuples.GANTrainSteps): - raise ValueError( - '`gan_train_steps` must be `tfgan_tuples.GANTrainSteps`. Instead, ' - 'was type: %s' % type(gan_train_steps)) - if (gan_train_steps.generator_train_steps != 1 or - gan_train_steps.discriminator_train_steps != 1): - raise ValueError('Estimator currently only supports 1:1 training.') - - if use_tpu: - generator_optimizer = _maybe_make_cross_shard_optimizer( - generator_optimizer) - discriminator_optimizer = _maybe_make_cross_shard_optimizer( - discriminator_optimizer) - - def _model_fn(features, labels, mode, params): - """GANEstimator model function.""" - del params # unused - if mode not in [model_fn_lib.ModeKeys.TRAIN, model_fn_lib.ModeKeys.EVAL, - model_fn_lib.ModeKeys.PREDICT]: - raise ValueError('Mode not recognized: %s' % mode) - real_data = labels # rename inputs for clarity - generator_inputs = features # rename inputs for clarity - - # Make GANModel, which encapsulates the GAN model architectures. - # TODO(joelshor): Switch TF-GAN over to TPU-compatible summaries, then - # remove `add_summaries` logic below. - is_on_tpu = _is_on_tpu(mode, use_tpu, eval_on_tpu) - gan_model = gan_estimator_lib._get_gan_model( # pylint:disable=protected-access - mode, generator_fn, discriminator_fn, real_data, generator_inputs, - add_summaries=None if is_on_tpu else add_summaries) - - # Make the TPUEstimatorSpec, which incorporates the GANModel, losses, eval - # metrics, and optimizers (if required). 
- estimator_spec = _get_estimator_spec( - mode, gan_model, generator_loss_fn, discriminator_loss_fn, - get_eval_metric_ops_fn, generator_optimizer, discriminator_optimizer, - joint_train, is_on_tpu, gan_train_steps) - assert isinstance(estimator_spec, tpu_estimator.TPUEstimatorSpec) - return estimator_spec - - super(TPUGANEstimator, self).__init__( - model_fn=_model_fn, - model_dir=model_dir, - config=config, - params=params, - use_tpu=use_tpu, - train_batch_size=train_batch_size, - eval_batch_size=eval_batch_size, - predict_batch_size=predict_batch_size, - batch_axis=batch_axis, - eval_on_tpu=eval_on_tpu, - export_to_tpu=export_to_tpu, - warm_start_from=warm_start_from) - - -def _is_on_tpu(mode, use_tpu, eval_on_tpu): - if mode == model_fn_lib.ModeKeys.TRAIN: - return use_tpu - elif mode == model_fn_lib.ModeKeys.EVAL: - return eval_on_tpu - else: - return False - - -def _get_estimator_spec( - mode, gan_model, generator_loss_fn, discriminator_loss_fn, - get_eval_metric_ops_fn, generator_optimizer, discriminator_optimizer, - joint_train, is_on_tpu, gan_train_steps): - """Get the TPUEstimatorSpec for the current mode.""" - if mode == model_fn_lib.ModeKeys.PREDICT: - estimator_spec = tpu_estimator.TPUEstimatorSpec( - mode=mode, predictions={'generated_data': gan_model.generated_data}) - elif mode == model_fn_lib.ModeKeys.EVAL: - gan_loss = tfgan_tuples.GANLoss( - generator_loss=generator_loss_fn( - gan_model, add_summaries=not is_on_tpu), - discriminator_loss=discriminator_loss_fn( - gan_model, add_summaries=not is_on_tpu)) - # Eval losses for metrics must preserve batch dimension. - gan_loss_no_reduction = tfgan_tuples.GANLoss( - generator_loss=generator_loss_fn( - gan_model, add_summaries=False, reduction=losses.Reduction.NONE), - discriminator_loss=discriminator_loss_fn( - gan_model, add_summaries=False, reduction=losses.Reduction.NONE)) - estimator_spec = _get_eval_estimator_spec( - gan_model, gan_loss, gan_loss_no_reduction, get_eval_metric_ops_fn) - else: # model_fn_lib.ModeKeys.TRAIN: - gan_loss = tfgan_tuples.GANLoss( - generator_loss=generator_loss_fn( - gan_model, add_summaries=not is_on_tpu), - discriminator_loss=discriminator_loss_fn( - gan_model, add_summaries=not is_on_tpu)) - - # Construct optimizers if arguments were callable. For TPUs, they must be - # `CrossShardOptimizer`. - g_callable = callable(generator_optimizer) - gopt = generator_optimizer() if g_callable else generator_optimizer - d_callable = callable(discriminator_optimizer) - dopt = discriminator_optimizer() if d_callable else discriminator_optimizer - - estimator_spec = _get_train_estimator_spec( - gan_model, gan_loss, gopt, dopt, joint_train, gan_train_steps) - - return estimator_spec - - -def _get_eval_estimator_spec(gan_model, gan_loss, gan_loss_no_reduction, - get_eval_metric_ops_fn): - """Return an TPUEstimatorSpec for the eval case.""" - # Make the metric function and tensor names. 
- if get_eval_metric_ops_fn is not None: - def metric_fn( - generator_inputs, generated_data, real_data, discriminator_real_outputs, - discriminator_gen_outputs, generator_loss, discriminator_loss): - """`metric_fn` used in TPUEstimator to calculate metrics.""" - eval_metric_ops = { - 'generator_loss': metrics_lib.mean(generator_loss), - 'discriminator_loss': metrics_lib.mean(discriminator_loss), - } - custom_eval_metric_ops = get_eval_metric_ops_fn( - generator_inputs, generated_data, real_data, - discriminator_real_outputs, discriminator_gen_outputs) - if not isinstance(custom_eval_metric_ops, dict): - raise TypeError('`get_eval_metric_ops_fn` must return a dict, ' - 'received: {}'.format(custom_eval_metric_ops)) - eval_metric_ops.update(custom_eval_metric_ops) - return eval_metric_ops - tensors = { - 'generator_loss': gan_loss_no_reduction.generator_loss, - 'discriminator_loss': gan_loss_no_reduction.discriminator_loss, - 'generator_inputs': gan_model.generator_inputs, - 'generated_data': gan_model.generated_data, - 'real_data': gan_model.real_data, - 'discriminator_real_outputs': gan_model.discriminator_real_outputs, - 'discriminator_gen_outputs': gan_model.discriminator_gen_outputs, - } - else: - def metric_fn(generator_loss, discriminator_loss): - return { - 'generator_loss': metrics_lib.mean(generator_loss), - 'discriminator_loss': metrics_lib.mean(discriminator_loss), - } - tensors = { - 'generator_loss': gan_loss_no_reduction.generator_loss, - 'discriminator_loss': gan_loss_no_reduction.discriminator_loss, - } - - scalar_loss = gan_loss.generator_loss + gan_loss.discriminator_loss - return tpu_estimator.TPUEstimatorSpec( - mode=model_fn_lib.ModeKeys.EVAL, - predictions=gan_model.generated_data, - loss=scalar_loss, - eval_metrics=(metric_fn, tensors)) - - -def _get_train_estimator_spec( - gan_model, gan_loss, generator_optimizer, discriminator_optimizer, - joint_train, gan_train_steps): - """Return a TPUEstimatorSpec for the train case.""" - scalar_loss = gan_loss.generator_loss + gan_loss.discriminator_loss - - # Get generator and discriminator update ops. We split them so that update - # ops aren't accidentally run multiple times. For now, throw an error if - # there are update ops that aren't associated with either the generator or - # the discriminator. Might modify the `kwargs` dictionary. - gen_update_ops, dis_update_ops = tfgan_train._get_update_ops( # pylint:disable=protected-access - {}, gan_model.generator_scope.name, gan_model.discriminator_scope.name) - - def gen_train_op(): - with ops.name_scope('generator_train'): - return training.create_train_op( - total_loss=gan_loss.generator_loss, - optimizer=generator_optimizer, - variables_to_train=gan_model.generator_variables, - update_ops=gen_update_ops) - def dis_train_op(): - with ops.name_scope('discriminator_train'): - return training.create_train_op( - total_loss=gan_loss.discriminator_loss, - optimizer=discriminator_optimizer, - variables_to_train=gan_model.discriminator_variables, - update_ops=dis_update_ops) - - # Either optimize the generator and discriminator sequentially or jointly. - tpu_train_op = _combine_train_ops(gen_train_op, dis_train_op, joint_train, - gan_train_steps) - - return tpu_estimator.TPUEstimatorSpec( - loss=scalar_loss, - mode=model_fn_lib.ModeKeys.TRAIN, - train_op=tpu_train_op) - - -# TODO(joelshor): Add support for multiple D / G steps. 
-def _combine_train_ops(gen_train_op, dis_train_op, joint_train, - gan_train_steps): - """Combine generator and discriminator train ops into a single op.""" - del gan_train_steps - if joint_train: - tpu_train_op = control_flow_ops.group(gen_train_op(), dis_train_op(), - name='joint_train') - else: - with ops.control_dependencies([dis_train_op()]): - tpu_train_op = gen_train_op() - - return tpu_train_op - - -def _maybe_make_cross_shard_optimizer(opt): - if callable(opt): - if not isinstance(opt(), tpu_optimizer.CrossShardOptimizer): - return lambda: tpu_optimizer.CrossShardOptimizer(opt()) - elif not isinstance(opt, tpu_optimizer.CrossShardOptimizer): - return tpu_optimizer.CrossShardOptimizer(opt) - return opt diff --git a/tensorflow/contrib/gan/python/estimator/python/tpu_gan_estimator_test.py b/tensorflow/contrib/gan/python/estimator/python/tpu_gan_estimator_test.py deleted file mode 100644 index baf2c28df4b..00000000000 --- a/tensorflow/contrib/gan/python/estimator/python/tpu_gan_estimator_test.py +++ /dev/null @@ -1,318 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Tests for TF-GAN's TPU Estimator.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import shutil -import tempfile - -from absl.testing import parameterized -import numpy as np - -from tensorflow.contrib import layers -from tensorflow.contrib.gan.python import namedtuples as tfgan_tuples -from tensorflow.contrib.gan.python.estimator.python import tpu_gan_estimator_impl as estimator -from tensorflow.contrib.gan.python.losses.python import tuple_losses as losses -from tensorflow.contrib.tpu.python.tpu import tpu_config -from tensorflow.contrib.tpu.python.tpu import tpu_estimator -from tensorflow.contrib.tpu.python.tpu import tpu_optimizer -from tensorflow.python.data.ops import dataset_ops -from tensorflow.python.estimator import model_fn as model_fn_lib -from tensorflow.python.estimator.estimator import WarmStartSettings -from tensorflow.python.framework import ops -from tensorflow.python.framework import tensor_shape -from tensorflow.python.framework.errors_impl import NotFoundError -from tensorflow.python.ops import array_ops -from tensorflow.python.ops import metrics as metrics_lib -from tensorflow.python.ops import variable_scope -from tensorflow.python.platform import flags -from tensorflow.python.platform import test -from tensorflow.python.summary.writer import writer_cache -from tensorflow.python.training import learning_rate_decay -from tensorflow.python.training import training -from tensorflow.python.training import training_util - -FLAGS = flags.FLAGS - -flags.DEFINE_bool('use_tpu', False, 'Whether to run test on TPU or not.') - - -def generator_fn(noise, mode): - del mode - return layers.fully_connected(noise, tensor_shape.dimension_value( - noise.shape[1])) - - -def discriminator_fn(data, 
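`_combine_train_ops` above takes the two update ops as callables so that, in the sequential case, the generator op is created inside the control-dependency scope and therefore runs only after the discriminator update. A runnable toy version of that choice, using counters in place of the real update ops:

```python
import tensorflow.compat.v1 as tf
tf.disable_eager_execution()

g_steps = tf.get_variable('g_steps', initializer=0)
d_steps = tf.get_variable('d_steps', initializer=0)

def gen_train_op():
  return tf.assign_add(g_steps, 1)

def dis_train_op():
  return tf.assign_add(d_steps, 1)

joint_train = False
if joint_train:
  # Joint: both updates run in the same step, with no ordering between them.
  train_op = tf.group(gen_train_op(), dis_train_op(), name='joint_train')
else:
  # Sequential: create the generator op under a control dependency so the
  # discriminator update always finishes first.
  with tf.control_dependencies([dis_train_op()]):
    train_op = gen_train_op()

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  sess.run(train_op)
  print(sess.run([g_steps, d_steps]))  # [1, 1] either way
```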
unused_conditioning, mode): - del unused_conditioning, mode - return layers.fully_connected(data, 1) - - -def get_dummy_gan_model(): - # TODO(joelshor): Find a better way of creating a variable scope. - with variable_scope.variable_scope('generator') as gen_scope: - gen_var = variable_scope.get_variable('dummy_var', initializer=0.0) - with variable_scope.variable_scope('discriminator') as dis_scope: - dis_var = variable_scope.get_variable('dummy_var', initializer=0.0) - return tfgan_tuples.GANModel( - generator_inputs=None, - generated_data=array_ops.ones([3, 4]), - generator_variables=[gen_var], - generator_scope=gen_scope, - generator_fn=None, - real_data=array_ops.zeros([3, 4]), - discriminator_real_outputs=array_ops.ones([1, 2, 3]) * dis_var, - discriminator_gen_outputs=array_ops.ones([1, 2, 3]) * gen_var * dis_var, - discriminator_variables=[dis_var], - discriminator_scope=dis_scope, - discriminator_fn=None) - - -def get_metrics(generator_inputs, generated_data, real_data, - discriminator_real_outputs, discriminator_gen_outputs): - del generator_inputs, discriminator_real_outputs, discriminator_gen_outputs - return { - 'mse_custom_metric': metrics_lib.mean_squared_error( - real_data, generated_data) - } - - -class GetTPUEstimatorSpecTest(test.TestCase, parameterized.TestCase): - """Tests that the EstimatorSpec is constructed appropriately.""" - - @classmethod - def setUpClass(cls): - super(GetTPUEstimatorSpecTest, cls).setUpClass() - cls._generator_optimizer = tpu_optimizer.CrossShardOptimizer( - training.GradientDescentOptimizer(1.0)) - cls._discriminator_optimizer = tpu_optimizer.CrossShardOptimizer( - training.GradientDescentOptimizer(1.0)) - - @parameterized.named_parameters( - ('joint_train', model_fn_lib.ModeKeys.TRAIN, True), - ('train_sequential', model_fn_lib.ModeKeys.TRAIN, False), - ('eval', model_fn_lib.ModeKeys.EVAL, None), - ('predict', model_fn_lib.ModeKeys.PREDICT, None)) - def test_get_estimator_spec(self, mode, joint_train): - with ops.Graph().as_default(): - self._gan_model = get_dummy_gan_model() - spec = estimator._get_estimator_spec( - mode, - self._gan_model, - generator_loss_fn=losses.wasserstein_generator_loss, - discriminator_loss_fn=losses.wasserstein_discriminator_loss, - get_eval_metric_ops_fn=get_metrics, - generator_optimizer=self._generator_optimizer, - discriminator_optimizer=self._discriminator_optimizer, - joint_train=joint_train, - is_on_tpu=FLAGS.use_tpu, - gan_train_steps=tfgan_tuples.GANTrainSteps(1, 1)) - - self.assertIsInstance(spec, tpu_estimator.TPUEstimatorSpec) - self.assertEqual(mode, spec.mode) - if mode == model_fn_lib.ModeKeys.PREDICT: - self.assertEqual({'generated_data': self._gan_model.generated_data}, - spec.predictions) - elif mode == model_fn_lib.ModeKeys.TRAIN: - self.assertShapeEqual(np.array(0), spec.loss) # must be a scalar - self.assertIsNotNone(spec.train_op) - self.assertIsNotNone(spec.training_hooks) - elif mode == model_fn_lib.ModeKeys.EVAL: - self.assertEqual(self._gan_model.generated_data, spec.predictions) - self.assertShapeEqual(np.array(0), spec.loss) # must be a scalar - self.assertIsNotNone(spec.eval_metrics) - - -class TPUGANEstimatorIntegrationTest(test.TestCase, parameterized.TestCase): - - def setUp(self): - super(TPUGANEstimatorIntegrationTest, self).setUp() - self._model_dir = tempfile.mkdtemp() - self._config = tpu_config.RunConfig(model_dir=self._model_dir) - - def tearDown(self): - super(TPUGANEstimatorIntegrationTest, self).tearDown() - if self._model_dir: - writer_cache.FileWriterCache.clear() - 
shutil.rmtree(self._model_dir) - - def _test_complete_flow( - self, train_input_fn, eval_input_fn, predict_input_fn, prediction_size, - lr_decay=False, joint_train=True): - def make_opt(): - gstep = training_util.get_or_create_global_step() - lr = learning_rate_decay.exponential_decay(1.0, gstep, 10, 0.9) - return training.GradientDescentOptimizer(lr) - - gopt = make_opt if lr_decay else training.GradientDescentOptimizer(1.0) - dopt = make_opt if lr_decay else training.GradientDescentOptimizer(1.0) - est = estimator.TPUGANEstimator( - generator_fn=generator_fn, - discriminator_fn=discriminator_fn, - generator_loss_fn=losses.wasserstein_generator_loss, - discriminator_loss_fn=losses.wasserstein_discriminator_loss, - generator_optimizer=gopt, - discriminator_optimizer=dopt, - joint_train=joint_train, - get_eval_metric_ops_fn=get_metrics, - train_batch_size=4, - eval_batch_size=10, - predict_batch_size=8, - use_tpu=FLAGS.use_tpu, - config=self._config) - - # Train. - num_steps_train = 10 - est.train(train_input_fn, steps=num_steps_train) - - # Evaluate. - num_steps_eval = 2 - scores = est.evaluate(eval_input_fn, steps=num_steps_eval) - self.assertIn(ops.GraphKeys.GLOBAL_STEP, scores) - self.assertIn('loss', scores) - self.assertEqual(scores['discriminator_loss'] + scores['generator_loss'], - scores['loss']) - self.assertIn('mse_custom_metric', scores) - - # Predict. - predictions = np.array([x['generated_data'] for x in - est.predict(predict_input_fn)]) - self.assertAllEqual(prediction_size, predictions.shape) - - @parameterized.named_parameters( - ('joint_train', True, False, False), - ('train_sequential', False, False, False), - ('lr_decay', False, True, False), - ('train_sequential_ds', False, False, True)) - def test_numpy_input_fn(self, joint_train, lr_decay, return_ds): - """Tests complete flow with numpy_input_fn.""" - input_dim = 4 - def train_input_fn(params): - data = np.zeros([input_dim], dtype=np.float32) - ds = (dataset_ops.Dataset - .from_tensors((data, data)) - .repeat() - .batch(params['batch_size'], drop_remainder=True)) - if return_ds: - return ds - else: - x, y = ds.make_one_shot_iterator().get_next() - return x, y - def eval_input_fn(params): - data = np.zeros([input_dim], dtype=np.float32) - ds = (dataset_ops.Dataset - .from_tensors((data, data)) - .repeat() - .batch(params['batch_size'], drop_remainder=True)) - if return_ds: - return ds - else: - x, y = ds.make_one_shot_iterator().get_next() - return x, y - predict_size = 10 - def predict_input_fn(params): - del params # unused - data = np.zeros([input_dim], dtype=np.float32) - ds = (dataset_ops.Dataset - .from_tensors(data) - .repeat(predict_size) - .batch(1, drop_remainder=True)) - return ds - - self._test_complete_flow( - train_input_fn=train_input_fn, - eval_input_fn=eval_input_fn, - predict_input_fn=predict_input_fn, - prediction_size=[predict_size, input_dim], - lr_decay=lr_decay, - joint_train=joint_train) - - -class TPUGANEstimatorWarmStartTest(test.TestCase): - - def setUp(self): - self._model_dir = self.get_temp_dir() - self._config = tpu_config.RunConfig(model_dir=self._model_dir) - self.new_variable_name = 'new_var' - self.new_variable_value = [1.0, 2.0, 3.0] - - def tearDown(self): - writer_cache.FileWriterCache.clear() - - def _test_warm_start(self, warm_start_from=None): - """Tests whether WarmStartSettings work as intended.""" - def generator_with_new_variable(noise_dict, mode): - variable_scope.get_variable(name=self.new_variable_name, - initializer=self.new_variable_value, - trainable=True) - return 
generator_fn(noise_dict, mode) - - est = estimator.TPUGANEstimator( - generator_fn=generator_fn, - discriminator_fn=discriminator_fn, - generator_loss_fn=losses.wasserstein_generator_loss, - discriminator_loss_fn=losses.wasserstein_discriminator_loss, - generator_optimizer=training.GradientDescentOptimizer(1.0), - discriminator_optimizer=training.GradientDescentOptimizer(1.0), - train_batch_size=4, - use_tpu=FLAGS.use_tpu, - config=self._config) - - def train_input_fn(params): - data = np.zeros([params['batch_size'], 4], dtype=np.float32) - return data, data - - est.train(train_input_fn, steps=1) - - est_warm = estimator.TPUGANEstimator( - generator_fn=generator_with_new_variable, - discriminator_fn=discriminator_fn, - generator_loss_fn=losses.wasserstein_generator_loss, - discriminator_loss_fn=losses.wasserstein_discriminator_loss, - generator_optimizer=training.GradientDescentOptimizer(1.0), - discriminator_optimizer=training.GradientDescentOptimizer(1.0), - config=tpu_config.RunConfig( - model_dir=None if warm_start_from else self._model_dir), - train_batch_size=4, - use_tpu=FLAGS.use_tpu, - warm_start_from=warm_start_from) - - est_warm.train(train_input_fn, steps=1) - - return est_warm - - def test_warm_start_error(self): - """Test if exception when reloading different estimators.""" - with self.assertRaises(NotFoundError): - self._test_warm_start() - - def test_warm_start_success(self): - """Test if GANEstimator allows explicit warm start variable assignment.""" - # Regex matches all variable names in ckpt except for new_var. - var_regex = '^(?!.*%s.*)' % self.new_variable_name - warmstart = WarmStartSettings(ckpt_to_initialize_from=self._model_dir, - vars_to_warm_start=var_regex) - est_warm = self._test_warm_start(warm_start_from=warmstart) - full_variable_name = 'Generator/%s' % self.new_variable_name - self.assertIn(full_variable_name, est_warm.get_variable_names()) - equal_vals = np.array_equal(est_warm.get_variable_value(full_variable_name), - self.new_variable_value) - self.assertTrue(equal_vals) - -if __name__ == '__main__': - test.main() diff --git a/tensorflow/contrib/gan/python/eval/__init__.py b/tensorflow/contrib/gan/python/eval/__init__.py deleted file mode 100644 index 92e9abf8a35..00000000000 --- a/tensorflow/contrib/gan/python/eval/__init__.py +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""TF-GAN evaluation module. - -This module supports techniques such as Inception Score, Frechet Inception -distance, and Sliced Wasserstein distance. -""" -# pylint: disable=,wildcard-import,unused-import - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -# Collapse eval into a single namespace. 
-from tensorflow.contrib.gan.python.eval.python import classifier_metrics -from tensorflow.contrib.gan.python.eval.python import eval_utils -from tensorflow.contrib.gan.python.eval.python import sliced_wasserstein -from tensorflow.contrib.gan.python.eval.python import summaries - -from tensorflow.contrib.gan.python.eval.python.classifier_metrics import * -from tensorflow.contrib.gan.python.eval.python.eval_utils import * -from tensorflow.contrib.gan.python.eval.python.sliced_wasserstein import * -from tensorflow.contrib.gan.python.eval.python.summaries import * -# pylint: enable=wildcard-import,unused-import - -from tensorflow.python.util.all_util import remove_undocumented - -_allowed_symbols = [ - 'classifier_metrics', - 'sliced_wasserstein_distance', - 'summaries', - 'eval_utils', -] + ( - classifier_metrics.__all__ + sliced_wasserstein.__all__ + - summaries.__all__ + eval_utils.__all__) -remove_undocumented(__name__, _allowed_symbols) diff --git a/tensorflow/contrib/gan/python/eval/python/classifier_metrics.py b/tensorflow/contrib/gan/python/eval/python/classifier_metrics.py deleted file mode 100644 index a52e899114b..00000000000 --- a/tensorflow/contrib/gan/python/eval/python/classifier_metrics.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Model evaluation tools for TF-GAN.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from tensorflow.contrib.gan.python.eval.python import classifier_metrics_impl -# pylint: disable=wildcard-import -from tensorflow.contrib.gan.python.eval.python.classifier_metrics_impl import * -# pylint: enable=wildcard-import -from tensorflow.python.util.all_util import remove_undocumented - -__all__ = classifier_metrics_impl.__all__ -remove_undocumented(__name__, __all__) diff --git a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py deleted file mode 100644 index 2c301267900..00000000000 --- a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py +++ /dev/null @@ -1,1115 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Model evaluation tools for TF-GAN. 
- -These methods come from https://arxiv.org/abs/1606.03498, -https://arxiv.org/abs/1706.08500, and https://arxiv.org/abs/1801.01401. - -NOTE: This implementation uses the same weights as in -https://github.com/openai/improved-gan/blob/master/inception_score/model.py, -but is more numerically stable and is an unbiased estimator of the true -Inception score even when splitting the inputs into batches. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import functools -import os -import sys -import tarfile - -from six.moves import urllib - -from tensorflow.contrib.layers.python.layers import layers -from tensorflow.core.framework import graph_pb2 -from tensorflow.python.framework import dtypes -from tensorflow.python.framework import importer -from tensorflow.python.framework import ops -from tensorflow.python.ops import array_ops -from tensorflow.python.ops import control_flow_ops -from tensorflow.python.ops import image_ops -from tensorflow.python.ops import linalg_ops -from tensorflow.python.ops import map_fn -from tensorflow.python.ops import math_ops -from tensorflow.python.ops import nn_impl -from tensorflow.python.ops import nn_ops -from tensorflow.python.platform import gfile -from tensorflow.python.platform import resource_loader - -__all__ = [ - 'get_graph_def_from_disk', - 'get_graph_def_from_resource', - 'get_graph_def_from_url_tarball', - 'preprocess_image', - 'run_image_classifier', - 'run_inception', - 'inception_score', - 'classifier_score', - 'classifier_score_from_logits', - 'frechet_inception_distance', - 'frechet_classifier_distance', - 'frechet_classifier_distance_from_activations', - 'mean_only_frechet_classifier_distance_from_activations', - 'diagonal_only_frechet_classifier_distance_from_activations', - 'kernel_inception_distance', - 'kernel_inception_distance_and_std', - 'kernel_classifier_distance', - 'kernel_classifier_distance_and_std', - 'kernel_classifier_distance_from_activations', - 'kernel_classifier_distance_and_std_from_activations', - 'INCEPTION_DEFAULT_IMAGE_SIZE', -] - -INCEPTION_URL = 'http://download.tensorflow.org/models/frozen_inception_v1_2015_12_05.tar.gz' -INCEPTION_FROZEN_GRAPH = 'inceptionv1_for_inception_score.pb' -INCEPTION_INPUT = 'Mul:0' -INCEPTION_OUTPUT = 'logits:0' -INCEPTION_FINAL_POOL = 'pool_3:0' -INCEPTION_DEFAULT_IMAGE_SIZE = 299 - - -def _validate_images(images, image_size): - images = ops.convert_to_tensor(images) - images.shape.with_rank(4) - images.shape.assert_is_compatible_with([None, image_size, image_size, None]) - return images - - -def _symmetric_matrix_square_root(mat, eps=1e-10): - """Compute square root of a symmetric matrix. - - Note that this is different from an elementwise square root. We want to - compute M' where M' = sqrt(mat) such that M' * M' = mat. - - Also note that this method **only** works for symmetric matrices. - - Args: - mat: Matrix to take the square root of. - eps: Small epsilon such that any element less than eps will not be square - rooted to guard against numerical instability. - - Returns: - Matrix square root of mat. 
- """ - # Unlike numpy, tensorflow's return order is (s, u, v) - s, u, v = linalg_ops.svd(mat) - # sqrt is unstable around 0, just use 0 in such case - si = array_ops.where(math_ops.less(s, eps), s, math_ops.sqrt(s)) - # Note that the v returned by Tensorflow is v = V - # (when referencing the equation A = U S V^T) - # This is unlike Numpy which returns v = V^T - return math_ops.matmul( - math_ops.matmul(u, array_ops.diag(si)), v, transpose_b=True) - - -def preprocess_image(images, - height=INCEPTION_DEFAULT_IMAGE_SIZE, - width=INCEPTION_DEFAULT_IMAGE_SIZE, - scope=None): - """Prepare a batch of images for evaluation. - - This is the preprocessing portion of the graph from - http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz. - - Note that it expects Tensors in [0, 255]. This function maps pixel values to - [-1, 1] and resizes to match the InceptionV1 network. - - Args: - images: 3-D or 4-D Tensor of images. Values are in [0, 255]. - height: Integer. Height of resized output image. - width: Integer. Width of resized output image. - scope: Optional scope for name_scope. - - Returns: - 3-D or 4-D float Tensor of prepared image(s). Values are in [-1, 1]. - """ - is_single = images.shape.ndims == 3 - with ops.name_scope(scope, 'preprocess', [images, height, width]): - if not images.dtype.is_floating: - images = math_ops.cast(images, dtypes.float32) - if is_single: - images = array_ops.expand_dims(images, axis=0) - resized = image_ops.resize_bilinear(images, [height, width]) - resized = (resized - 128.0) / 128.0 - if is_single: - resized = array_ops.squeeze(resized, axis=0) - return resized - - -def _kl_divergence(p, p_logits, q): - """Computes the Kullback-Liebler divergence between p and q. - - This function uses p's logits in some places to improve numerical stability. - - Specifically: - - KL(p || q) = sum[ p * log(p / q) ] - = sum[ p * ( log(p) - log(q) ) ] - = sum[ p * ( log_softmax(p_logits) - log(q) ) ] - - Args: - p: A 2-D floating-point Tensor p_ij, where `i` corresponds to the minibatch - example and `j` corresponds to the probability of being in class `j`. - p_logits: A 2-D floating-point Tensor corresponding to logits for `p`. - q: A 1-D floating-point Tensor, where q_j corresponds to the probability - of class `j`. - - Returns: - KL divergence between two distributions. Output dimension is 1D, one entry - per distribution in `p`. - - Raises: - ValueError: If any of the inputs aren't floating-point. - ValueError: If p or p_logits aren't 2D. - ValueError: If q isn't 1D. - """ - for tensor in [p, p_logits, q]: - if not tensor.dtype.is_floating: - raise ValueError('Input %s must be floating type.', tensor.name) - p.shape.assert_has_rank(2) - p_logits.shape.assert_has_rank(2) - q.shape.assert_has_rank(1) - return math_ops.reduce_sum( - p * (nn_ops.log_softmax(p_logits) - math_ops.log(q)), axis=1) - - -def get_graph_def_from_disk(filename): - """Get a GraphDef proto from a disk location.""" - with gfile.GFile(filename, 'rb') as f: - return graph_pb2.GraphDef.FromString(f.read()) - - -def get_graph_def_from_resource(filename): - """Get a GraphDef proto from within a .par file.""" - return graph_pb2.GraphDef.FromString(resource_loader.load_resource(filename)) - - -def get_graph_def_from_url_tarball(url, filename, tar_filename=None): - """Get a GraphDef proto from a tarball on the web. 
- - Args: - url: Web address of tarball - filename: Filename of graph definition within tarball - tar_filename: Temporary download filename (None = always download) - - Returns: - A GraphDef loaded from a file in the downloaded tarball. - """ - if not (tar_filename and os.path.exists(tar_filename)): - - def _progress(count, block_size, total_size): - sys.stdout.write('\r>> Downloading %s %.1f%%' % - (url, - float(count * block_size) / float(total_size) * 100.0)) - sys.stdout.flush() - - tar_filename, _ = urllib.request.urlretrieve(url, tar_filename, _progress) - with tarfile.open(tar_filename, 'r:gz') as tar: - proto_str = tar.extractfile(filename).read() - return graph_pb2.GraphDef.FromString(proto_str) - - -def _default_graph_def_fn(): - return get_graph_def_from_url_tarball(INCEPTION_URL, INCEPTION_FROZEN_GRAPH, - os.path.basename(INCEPTION_URL)) - - -def run_inception(images, - graph_def=None, - default_graph_def_fn=_default_graph_def_fn, - image_size=INCEPTION_DEFAULT_IMAGE_SIZE, - input_tensor=INCEPTION_INPUT, - output_tensor=INCEPTION_OUTPUT): - """Run images through a pretrained Inception classifier. - - Args: - images: Input tensors. Must be [batch, height, width, channels]. Input shape - and values must be in [-1, 1], which can be achieved using - `preprocess_image`. - graph_def: A GraphDef proto of a pretrained Inception graph. If `None`, - call `default_graph_def_fn` to get GraphDef. - default_graph_def_fn: A function that returns a GraphDef. Used if - `graph_def` is `None. By default, returns a pretrained InceptionV3 graph. - image_size: Required image width and height. See unit tests for the default - values. - input_tensor: Name of input Tensor. - output_tensor: Name or list of output Tensors. This function will compute - activations at the specified layer. Examples include INCEPTION_V3_OUTPUT - and INCEPTION_V3_FINAL_POOL which would result in this function computing - the final logits or the penultimate pooling layer. - - Returns: - Tensor or Tensors corresponding to computed `output_tensor`. - - Raises: - ValueError: If images are not the correct size. - ValueError: If neither `graph_def` nor `default_graph_def_fn` are provided. - """ - images = _validate_images(images, image_size) - - if graph_def is None: - if default_graph_def_fn is None: - raise ValueError('If `graph_def` is `None`, must provide ' - '`default_graph_def_fn`.') - graph_def = default_graph_def_fn() - - activations = run_image_classifier(images, graph_def, input_tensor, - output_tensor) - if isinstance(activations, list): - for i, activation in enumerate(activations): - if array_ops.rank(activation) != 2: - activations[i] = layers.flatten(activation) - else: - if array_ops.rank(activations) != 2: - activations = layers.flatten(activations) - - return activations - - -def run_image_classifier(tensor, - graph_def, - input_tensor, - output_tensor, - scope='RunClassifier'): - """Runs a network from a frozen graph. - - Args: - tensor: An Input tensor. - graph_def: A GraphDef proto. - input_tensor: Name of input tensor in graph def. - output_tensor: A tensor name or list of tensor names in graph def. - scope: Name scope for classifier. - - Returns: - Classifier output if `output_tensor` is a string, or a list of outputs if - `output_tensor` is a list. - - Raises: - ValueError: If `input_tensor` or `output_tensor` aren't in the graph_def. 
- """ - input_map = {input_tensor: tensor} - is_singleton = isinstance(output_tensor, str) - if is_singleton: - output_tensor = [output_tensor] - classifier_outputs = importer.import_graph_def( - graph_def, input_map, output_tensor, name=scope) - if is_singleton: - classifier_outputs = classifier_outputs[0] - - return classifier_outputs - - -def classifier_score(images, classifier_fn, num_batches=1): - """Classifier score for evaluating a conditional generative model. - - This is based on the Inception Score, but for an arbitrary classifier. - - This technique is described in detail in https://arxiv.org/abs/1606.03498. In - summary, this function calculates - - exp( E[ KL(p(y|x) || p(y)) ] ) - - which captures how different the network's classification prediction is from - the prior distribution over classes. - - NOTE: This function consumes images, computes their logits, and then - computes the classifier score. If you would like to precompute many logits for - large batches, use classifier_score_from_logits(), which this method also - uses. - - Args: - images: Images to calculate the classifier score for. - classifier_fn: A function that takes images and produces logits based on a - classifier. - num_batches: Number of batches to split `generated_images` in to in order to - efficiently run them through the classifier network. - - Returns: - The classifier score. A floating-point scalar of the same type as the output - of `classifier_fn`. - """ - generated_images_list = array_ops.split( - images, num_or_size_splits=num_batches) - - # Compute the classifier splits using the memory-efficient `map_fn`. - logits = map_fn.map_fn( - fn=classifier_fn, - elems=array_ops.stack(generated_images_list), - parallel_iterations=1, - back_prop=False, - swap_memory=True, - name='RunClassifier') - logits = array_ops.concat(array_ops.unstack(logits), 0) - - return classifier_score_from_logits(logits) - - -def classifier_score_from_logits(logits): - """Classifier score for evaluating a generative model from logits. - - This method computes the classifier score for a set of logits. This can be - used independently of the classifier_score() method, especially in the case - of using large batches during evaluation where we would like precompute all - of the logits before computing the classifier score. - - This technique is described in detail in https://arxiv.org/abs/1606.03498. In - summary, this function calculates: - - exp( E[ KL(p(y|x) || p(y)) ] ) - - which captures how different the network's classification prediction is from - the prior distribution over classes. - - Args: - logits: Precomputed 2D tensor of logits that will be used to - compute the classifier score. - - Returns: - The classifier score. A floating-point scalar of the same type as the output - of `logits`. - """ - logits.shape.assert_has_rank(2) - - # Use maximum precision for best results. 
- logits_dtype = logits.dtype - if logits_dtype != dtypes.float64: - logits = math_ops.cast(logits, dtypes.float64) - - p = nn_ops.softmax(logits) - q = math_ops.reduce_mean(p, axis=0) - kl = _kl_divergence(p, logits, q) - kl.shape.assert_has_rank(1) - log_score = math_ops.reduce_mean(kl) - final_score = math_ops.exp(log_score) - - if logits_dtype != dtypes.float64: - final_score = math_ops.cast(final_score, logits_dtype) - - return final_score - - -inception_score = functools.partial( - classifier_score, - classifier_fn=functools.partial( - run_inception, output_tensor=INCEPTION_OUTPUT)) - - -def trace_sqrt_product(sigma, sigma_v): - """Find the trace of the positive sqrt of product of covariance matrices. - - '_symmetric_matrix_square_root' only works for symmetric matrices, so we - cannot just take _symmetric_matrix_square_root(sigma * sigma_v). - ('sigma' and 'sigma_v' are symmetric, but their product is not necessarily). - - Let sigma = A A so A = sqrt(sigma), and sigma_v = B B. - We want to find trace(sqrt(sigma sigma_v)) = trace(sqrt(A A B B)) - Note the following properties: - (i) forall M1, M2: eigenvalues(M1 M2) = eigenvalues(M2 M1) - => eigenvalues(A A B B) = eigenvalues (A B B A) - (ii) if M1 = sqrt(M2), then eigenvalues(M1) = sqrt(eigenvalues(M2)) - => eigenvalues(sqrt(sigma sigma_v)) = sqrt(eigenvalues(A B B A)) - (iii) forall M: trace(M) = sum(eigenvalues(M)) - => trace(sqrt(sigma sigma_v)) = sum(eigenvalues(sqrt(sigma sigma_v))) - = sum(sqrt(eigenvalues(A B B A))) - = sum(eigenvalues(sqrt(A B B A))) - = trace(sqrt(A B B A)) - = trace(sqrt(A sigma_v A)) - A = sqrt(sigma). Both sigma and A sigma_v A are symmetric, so we **can** - use the _symmetric_matrix_square_root function to find the roots of these - matrices. - - Args: - sigma: a square, symmetric, real, positive semi-definite covariance matrix - sigma_v: same as sigma - - Returns: - The trace of the positive square root of sigma*sigma_v - """ - - # Note sqrt_sigma is called "A" in the proof above - sqrt_sigma = _symmetric_matrix_square_root(sigma) - - # This is sqrt(A sigma_v A) above - sqrt_a_sigmav_a = math_ops.matmul(sqrt_sigma, - math_ops.matmul(sigma_v, sqrt_sigma)) - - return math_ops.trace(_symmetric_matrix_square_root(sqrt_a_sigmav_a)) - - -def frechet_classifier_distance(real_images, - generated_images, - classifier_fn, - num_batches=1): - """Classifier distance for evaluating a generative model. - - This is based on the Frechet Inception distance, but for an arbitrary - classifier. - - This technique is described in detail in https://arxiv.org/abs/1706.08500. - Given two Gaussian distribution with means m and m_w and covariance matrices - C and C_w, this function calculates - - |m - m_w|^2 + Tr(C + C_w - 2(C * C_w)^(1/2)) - - which captures how different the distributions of real images and generated - images (or more accurately, their visual features) are. Note that unlike the - Inception score, this is a true distance and utilizes information about real - world images. - - Note that when computed using sample means and sample covariance matrices, - Frechet distance is biased. It is more biased for small sample sizes. (e.g. - even if the two distributions are the same, for a small sample size, the - expected Frechet distance is large). It is important to use the same - sample size to compute Frechet classifier distance when comparing two - generative models. - - NOTE: This function consumes images, computes their activations, and then - computes the classifier score. 
If you would like to precompute many - activations for real and generated images for large batches, please use - frechet_clasifier_distance_from_activations(), which this method also uses. - - Args: - real_images: Real images to use to compute Frechet Inception distance. - generated_images: Generated images to use to compute Frechet Inception - distance. - classifier_fn: A function that takes images and produces activations - based on a classifier. - num_batches: Number of batches to split images in to in order to - efficiently run them through the classifier network. - - Returns: - The Frechet Inception distance. A floating-point scalar of the same type - as the output of `classifier_fn`. - """ - real_images_list = array_ops.split( - real_images, num_or_size_splits=num_batches) - generated_images_list = array_ops.split( - generated_images, num_or_size_splits=num_batches) - - real_imgs = array_ops.stack(real_images_list) - generated_imgs = array_ops.stack(generated_images_list) - - # Compute the activations using the memory-efficient `map_fn`. - def compute_activations(elems): - return map_fn.map_fn(fn=classifier_fn, - elems=elems, - parallel_iterations=1, - back_prop=False, - swap_memory=True, - name='RunClassifier') - - real_a = compute_activations(real_imgs) - gen_a = compute_activations(generated_imgs) - - # Ensure the activations have the right shapes. - real_a = array_ops.concat(array_ops.unstack(real_a), 0) - gen_a = array_ops.concat(array_ops.unstack(gen_a), 0) - - return frechet_classifier_distance_from_activations(real_a, gen_a) - - -def mean_only_frechet_classifier_distance_from_activations( - real_activations, generated_activations): - """Classifier distance for evaluating a generative model from activations. - - Given two Gaussian distribution with means m and m_w and covariance matrices - C and C_w, this function calcuates - - |m - m_w|^2 - - which captures how different the distributions of real images and generated - images (or more accurately, their visual features) are. Note that unlike the - Inception score, this is a true distance and utilizes information about real - world images. - - Note that when computed using sample means and sample covariance matrices, - Frechet distance is biased. It is more biased for small sample sizes. (e.g. - even if the two distributions are the same, for a small sample size, the - expected Frechet distance is large). It is important to use the same - sample size to compute frechet classifier distance when comparing two - generative models. - - In this variant, we only compute the difference between the means of the - fitted Gaussians. The computation leads to O(n) vs. O(n^2) memory usage, yet - still retains much of the same information as FID. - - Args: - real_activations: 2D array of activations of real images of size - [num_images, num_dims] to use to compute Frechet Inception distance. - generated_activations: 2D array of activations of generated images of size - [num_images, num_dims] to use to compute Frechet Inception distance. - - Returns: - The mean-only Frechet Inception distance. A floating-point scalar of the - same type as the output of the activations. - """ - real_activations.shape.assert_has_rank(2) - generated_activations.shape.assert_has_rank(2) - - activations_dtype = real_activations.dtype - if activations_dtype != dtypes.float64: - real_activations = math_ops.cast(real_activations, dtypes.float64) - generated_activations = math_ops.cast(generated_activations, dtypes.float64) - - # Compute means of activations. 
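# --- Editorial aside (not part of the original file) ------------------------
# The mean-only variant computed here reduces to the squared L2 distance
# between the two activation means. A NumPy sketch on made-up activations
# (NumPy and the toy data are assumptions for illustration only):
import numpy as np

_real_act = np.random.RandomState(1).randn(100, 8)
_gen_act = np.random.RandomState(2).randn(100, 8) + 0.25
_mofid = np.square(_real_act.mean(axis=0) - _gen_act.mean(axis=0)).sum()
print(_mofid)  # 0.0 would mean the two activation means coincide
# -----------------------------------------------------------------------------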
- m = math_ops.reduce_mean(real_activations, 0) - m_w = math_ops.reduce_mean(generated_activations, 0) - - # Next the distance between means. - mean = math_ops.reduce_sum( - math_ops.squared_difference(m, m_w)) # Equivalent to L2 but more stable. - mofid = mean - if activations_dtype != dtypes.float64: - mofid = math_ops.cast(mofid, activations_dtype) - - return mofid - - -def diagonal_only_frechet_classifier_distance_from_activations( - real_activations, generated_activations): - """Classifier distance for evaluating a generative model. - - This is based on the Frechet Inception distance, but for an arbitrary - classifier. - - This technique is described in detail in https://arxiv.org/abs/1706.08500. - Given two Gaussian distribution with means m and m_w and covariance matrices - C and C_w, this function calcuates - - |m - m_w|^2 + (sigma + sigma_w - 2(sigma x sigma_w)^(1/2)) - - which captures how different the distributions of real images and generated - images (or more accurately, their visual features) are. Note that unlike the - Inception score, this is a true distance and utilizes information about real - world images. In this variant, we compute diagonal-only covariance matrices. - As a result, instead of computing an expensive matrix square root, we can do - something much simpler, and has O(n) vs O(n^2) space complexity. - - Note that when computed using sample means and sample covariance matrices, - Frechet distance is biased. It is more biased for small sample sizes. (e.g. - even if the two distributions are the same, for a small sample size, the - expected Frechet distance is large). It is important to use the same - sample size to compute frechet classifier distance when comparing two - generative models. - - Args: - real_activations: Real images to use to compute Frechet Inception distance. - generated_activations: Generated images to use to compute Frechet Inception - distance. - - Returns: - The diagonal-only Frechet Inception distance. A floating-point scalar of - the same type as the output of the activations. - - Raises: - ValueError: If the shape of the variance and mean vectors are not equal. - """ - real_activations.shape.assert_has_rank(2) - generated_activations.shape.assert_has_rank(2) - - activations_dtype = real_activations.dtype - if activations_dtype != dtypes.float64: - real_activations = math_ops.cast(real_activations, dtypes.float64) - generated_activations = math_ops.cast(generated_activations, dtypes.float64) - - # Compute mean and covariance matrices of activations. - m, var = nn_impl.moments(real_activations, axes=[0]) - m_w, var_w = nn_impl.moments(generated_activations, axes=[0]) - - actual_shape = var.get_shape() - expected_shape = m.get_shape() - - if actual_shape != expected_shape: - raise ValueError('shape: {} must match expected shape: {}'.format( - actual_shape, expected_shape)) - - # Compute the two components of FID. - - # First the covariance component. - # Here, note that trace(A + B) = trace(A) + trace(B) - trace = math_ops.reduce_sum( - (var + var_w) - 2.0 * math_ops.sqrt(math_ops.multiply(var, var_w))) - - # Next the distance between means. - mean = math_ops.reduce_sum( - math_ops.squared_difference(m, m_w)) # Equivalent to L2 but more stable. - dofid = trace + mean - if activations_dtype != dtypes.float64: - dofid = math_ops.cast(dofid, activations_dtype) - - return dofid - - -def frechet_classifier_distance_from_activations(real_activations, - generated_activations): - """Classifier distance for evaluating a generative model. 
- - This methods computes the Frechet classifier distance from activations of - real images and generated images. This can be used independently of the - frechet_classifier_distance() method, especially in the case of using large - batches during evaluation where we would like precompute all of the - activations before computing the classifier distance. - - This technique is described in detail in https://arxiv.org/abs/1706.08500. - Given two Gaussian distribution with means m and m_w and covariance matrices - C and C_w, this function calculates - - |m - m_w|^2 + Tr(C + C_w - 2(C * C_w)^(1/2)) - - which captures how different the distributions of real images and generated - images (or more accurately, their visual features) are. Note that unlike the - Inception score, this is a true distance and utilizes information about real - world images. - - Note that when computed using sample means and sample covariance matrices, - Frechet distance is biased. It is more biased for small sample sizes. (e.g. - even if the two distributions are the same, for a small sample size, the - expected Frechet distance is large). It is important to use the same - sample size to compute frechet classifier distance when comparing two - generative models. - - Args: - real_activations: 2D Tensor containing activations of real data. Shape is - [batch_size, activation_size]. - generated_activations: 2D Tensor containing activations of generated data. - Shape is [batch_size, activation_size]. - - Returns: - The Frechet Inception distance. A floating-point scalar of the same type - as the output of the activations. - - """ - real_activations.shape.assert_has_rank(2) - generated_activations.shape.assert_has_rank(2) - - activations_dtype = real_activations.dtype - if activations_dtype != dtypes.float64: - real_activations = math_ops.cast(real_activations, dtypes.float64) - generated_activations = math_ops.cast(generated_activations, dtypes.float64) - - # Compute mean and covariance matrices of activations. - m = math_ops.reduce_mean(real_activations, 0) - m_w = math_ops.reduce_mean(generated_activations, 0) - num_examples_real = math_ops.cast( - array_ops.shape(real_activations)[0], dtypes.float64) - num_examples_generated = math_ops.cast( - array_ops.shape(generated_activations)[0], dtypes.float64) - - # sigma = (1 / (n - 1)) * (X - mu) (X - mu)^T - real_centered = real_activations - m - sigma = math_ops.matmul( - real_centered, real_centered, transpose_a=True) / ( - num_examples_real - 1) - - gen_centered = generated_activations - m_w - sigma_w = math_ops.matmul( - gen_centered, gen_centered, transpose_a=True) / ( - num_examples_generated - 1) - - # Find the Tr(sqrt(sigma sigma_w)) component of FID - sqrt_trace_component = trace_sqrt_product(sigma, sigma_w) - - # Compute the two components of FID. - - # First the covariance component. - # Here, note that trace(A + B) = trace(A) + trace(B) - trace = math_ops.trace(sigma + sigma_w) - 2.0 * sqrt_trace_component - - # Next the distance between means. - mean = math_ops.reduce_sum( - math_ops.squared_difference(m, m_w)) # Equivalent to L2 but more stable. 
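# --- Editorial aside (not part of the original file) ------------------------
# A NumPy/SciPy sketch of the full formula assembled here,
# |m - m_w|^2 + Tr(C + C_w - 2 (C C_w)^(1/2)), on toy activations. SciPy and
# the toy data are assumptions for this illustration; the deleted test file
# further below uses the same scipy.linalg.sqrtm reference in `_expected_fid`.
import numpy as np
from scipy import linalg as scp_linalg

_rng = np.random.RandomState(0)
_real_act = _rng.randn(256, 16)
_gen_act = _rng.randn(256, 16) + 0.5
_m, _m_w = _real_act.mean(axis=0), _gen_act.mean(axis=0)
_c = np.cov(_real_act, rowvar=False)
_c_w = np.cov(_gen_act, rowvar=False)
_covmean = scp_linalg.sqrtm(_c.dot(_c_w)).real   # drop tiny imaginary round-off
_fid = np.square(_m - _m_w).sum() + np.trace(_c + _c_w - 2.0 * _covmean)
print(_fid)
# -----------------------------------------------------------------------------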
- fid = trace + mean - if activations_dtype != dtypes.float64: - fid = math_ops.cast(fid, activations_dtype) - - return fid - -frechet_inception_distance = functools.partial( - frechet_classifier_distance, - classifier_fn=functools.partial( - run_inception, output_tensor=INCEPTION_FINAL_POOL)) - - -def kernel_classifier_distance(real_images, - generated_images, - classifier_fn, - num_classifier_batches=1, - max_block_size=1024, - dtype=None): - """Kernel "classifier" distance for evaluating a generative model. - - This is based on the Kernel Inception distance, but for an arbitrary - embedding. - - This technique is described in detail in https://arxiv.org/abs/1801.01401. - Given two distributions P and Q of activations, this function calculates - - E_{X, X' ~ P}[k(X, X')] + E_{Y, Y' ~ Q}[k(Y, Y')] - - 2 E_{X ~ P, Y ~ Q}[k(X, Y)] - - where k is the polynomial kernel - - k(x, y) = ( x^T y / dimension + 1 )^3. - - This captures how different the distributions of real and generated images' - visual features are. Like the Frechet distance (and unlike the Inception - score), this is a true distance and incorporates information about the - target images. Unlike the Frechet score, this function computes an - *unbiased* and asymptotically normal estimator, which makes comparing - estimates across models much more intuitive. - - The estimator used takes time quadratic in max_block_size. Larger values of - max_block_size will decrease the variance of the estimator but increase the - computational cost. This differs slightly from the estimator used by the - original paper; it is the block estimator of https://arxiv.org/abs/1307.1954. - - NOTE: the blocking code assumes that real_activations and - generated_activations are both in random order. If either is sorted in a - meaningful order, the estimator will behave poorly. - - NOTE: This function consumes images, computes their activations, and then - computes the classifier score. If you would like to precompute many - activations for real and generated images for large batches, or to compute - multiple scores based on the same images, please use - kernel_clasifier_distance_from_activations(), which this method also uses. - - Args: - real_images: Real images to use to compute Kernel Inception distance. - generated_images: Generated images to use to compute Kernel Inception - distance. - classifier_fn: A function that takes images and produces activations based - on a classifier. - num_classifier_batches: Number of batches to split images in to in order to - efficiently run them through the classifier network. - max_block_size: integer, default 1024. The distance estimator splits samples - into blocks for computational efficiency. Larger values are more - computationally expensive but decrease the variance of the distance - estimate. - dtype: if not None, coerce activations to this dtype before computations. - - Returns: - The Kernel Inception Distance. A floating-point scalar of the same type - as the output of the activations. 
- """ - return kernel_classifier_distance_and_std( - real_images, - generated_images, - classifier_fn, - num_classifier_batches=num_classifier_batches, - max_block_size=max_block_size, - dtype=dtype)[0] - - -kernel_inception_distance = functools.partial( - kernel_classifier_distance, - classifier_fn=functools.partial( - run_inception, output_tensor=INCEPTION_FINAL_POOL)) - - -def kernel_classifier_distance_and_std(real_images, - generated_images, - classifier_fn, - num_classifier_batches=1, - max_block_size=1024, - dtype=None): - """Kernel "classifier" distance for evaluating a generative model. - - This is based on the Kernel Inception distance, but for an arbitrary - embedding. Also returns an estimate of the standard error of the distance - estimator. - - This technique is described in detail in https://arxiv.org/abs/1801.01401. - Given two distributions P and Q of activations, this function calculates - - E_{X, X' ~ P}[k(X, X')] + E_{Y, Y' ~ Q}[k(Y, Y')] - - 2 E_{X ~ P, Y ~ Q}[k(X, Y)] - - where k is the polynomial kernel - - k(x, y) = ( x^T y / dimension + 1 )^3. - - This captures how different the distributions of real and generated images' - visual features are. Like the Frechet distance (and unlike the Inception - score), this is a true distance and incorporates information about the - target images. Unlike the Frechet score, this function computes an - *unbiased* and asymptotically normal estimator, which makes comparing - estimates across models much more intuitive. - - The estimator used takes time quadratic in max_block_size. Larger values of - max_block_size will decrease the variance of the estimator but increase the - computational cost. This differs slightly from the estimator used by the - original paper; it is the block estimator of https://arxiv.org/abs/1307.1954. - - NOTE: the blocking code assumes that real_activations and - generated_activations are both in random order. If either is sorted in a - meaningful order, the estimator will behave poorly. - - NOTE: This function consumes images, computes their activations, and then - computes the classifier score. If you would like to precompute many - activations for real and generated images for large batches, or to compute - multiple scores based on the same images, please use - kernel_clasifier_distance_from_activations(), which this method also uses. - - Args: - real_images: Real images to use to compute Kernel Inception distance. - generated_images: Generated images to use to compute Kernel Inception - distance. - classifier_fn: A function that takes images and produces activations based - on a classifier. - num_classifier_batches: Number of batches to split images in to in order to - efficiently run them through the classifier network. - max_block_size: integer, default 1024. The distance estimator splits samples - into blocks for computational efficiency. Larger values are more - computationally expensive but decrease the variance of the distance - estimate. Having a smaller block size also gives a better estimate of the - standard error. - dtype: if not None, coerce activations to this dtype before computations. - - Returns: - The Kernel Inception Distance. A floating-point scalar of the same type - as the output of the activations. - An estimate of the standard error of the distance estimator (a scalar of - the same type). 
- """ - real_images_list = array_ops.split( - real_images, num_or_size_splits=num_classifier_batches) - generated_images_list = array_ops.split( - generated_images, num_or_size_splits=num_classifier_batches) - - real_imgs = array_ops.stack(real_images_list) - generated_imgs = array_ops.stack(generated_images_list) - - # Compute the activations using the memory-efficient `map_fn`. - def compute_activations(elems): - return map_fn.map_fn( - fn=classifier_fn, - elems=elems, - parallel_iterations=1, - back_prop=False, - swap_memory=True, - name='RunClassifier') - - real_a = compute_activations(real_imgs) - gen_a = compute_activations(generated_imgs) - - # Ensure the activations have the right shapes. - real_a = array_ops.concat(array_ops.unstack(real_a), 0) - gen_a = array_ops.concat(array_ops.unstack(gen_a), 0) - - return kernel_classifier_distance_and_std_from_activations( - real_a, gen_a, max_block_size, dtype) - - -kernel_inception_distance_and_std = functools.partial( - kernel_classifier_distance_and_std, - classifier_fn=functools.partial( - run_inception, output_tensor=INCEPTION_FINAL_POOL)) - - -def kernel_classifier_distance_from_activations(real_activations, - generated_activations, - max_block_size=1024, - dtype=None): - """Kernel "classifier" distance for evaluating a generative model. - - This methods computes the kernel classifier distance from activations of - real images and generated images. This can be used independently of the - kernel_classifier_distance() method, especially in the case of using large - batches during evaluation where we would like to precompute all of the - activations before computing the classifier distance, or if we want to - compute multiple metrics based on the same images. - - This technique is described in detail in https://arxiv.org/abs/1801.01401. - Given two distributions P and Q of activations, this function calculates - - E_{X, X' ~ P}[k(X, X')] + E_{Y, Y' ~ Q}[k(Y, Y')] - - 2 E_{X ~ P, Y ~ Q}[k(X, Y)] - - where k is the polynomial kernel - - k(x, y) = ( x^T y / dimension + 1 )^3. - - This captures how different the distributions of real and generated images' - visual features are. Like the Frechet distance (and unlike the Inception - score), this is a true distance and incorporates information about the - target images. Unlike the Frechet score, this function computes an - *unbiased* and asymptotically normal estimator, which makes comparing - estimates across models much more intuitive. - - The estimator used takes time quadratic in max_block_size. Larger values of - max_block_size will decrease the variance of the estimator but increase the - computational cost. This differs slightly from the estimator used by the - original paper; it is the block estimator of https://arxiv.org/abs/1307.1954. - - NOTE: the blocking code assumes that real_activations and - generated_activations are both in random order. If either is sorted in a - meaningful order, the estimator will behave poorly. - - Args: - real_activations: 2D Tensor containing activations of real data. Shape is - [batch_size, activation_size]. - generated_activations: 2D Tensor containing activations of generated data. - Shape is [batch_size, activation_size]. - max_block_size: integer, default 1024. The distance estimator splits samples - into blocks for computational efficiency. Larger values are more - computationally expensive but decrease the variance of the distance - estimate. - dtype: If not None, coerce activations to this dtype before computations. 
- - Returns: - The Kernel Inception Distance. A floating-point scalar of the same type - as the output of the activations. - """ - return kernel_classifier_distance_and_std_from_activations( - real_activations, generated_activations, max_block_size, dtype)[0] - - -def kernel_classifier_distance_and_std_from_activations(real_activations, - generated_activations, - max_block_size=1024, - dtype=None): - """Kernel "classifier" distance for evaluating a generative model. - - This methods computes the kernel classifier distance from activations of - real images and generated images. This can be used independently of the - kernel_classifier_distance() method, especially in the case of using large - batches during evaluation where we would like to precompute all of the - activations before computing the classifier distance, or if we want to - compute multiple metrics based on the same images. It also returns a rough - estimate of the standard error of the estimator. - - This technique is described in detail in https://arxiv.org/abs/1801.01401. - Given two distributions P and Q of activations, this function calculates - - E_{X, X' ~ P}[k(X, X')] + E_{Y, Y' ~ Q}[k(Y, Y')] - - 2 E_{X ~ P, Y ~ Q}[k(X, Y)] - - where k is the polynomial kernel - - k(x, y) = ( x^T y / dimension + 1 )^3. - - This captures how different the distributions of real and generated images' - visual features are. Like the Frechet distance (and unlike the Inception - score), this is a true distance and incorporates information about the - target images. Unlike the Frechet score, this function computes an - *unbiased* and asymptotically normal estimator, which makes comparing - estimates across models much more intuitive. - - The estimator used takes time quadratic in max_block_size. Larger values of - max_block_size will decrease the variance of the estimator but increase the - computational cost. This differs slightly from the estimator used by the - original paper; it is the block estimator of https://arxiv.org/abs/1307.1954. - The estimate of the standard error will also be more reliable when there are - more blocks, i.e. when max_block_size is smaller. - - NOTE: the blocking code assumes that real_activations and - generated_activations are both in random order. If either is sorted in a - meaningful order, the estimator will behave poorly. - - Args: - real_activations: 2D Tensor containing activations of real data. Shape is - [batch_size, activation_size]. - generated_activations: 2D Tensor containing activations of generated data. - Shape is [batch_size, activation_size]. - max_block_size: integer, default 1024. The distance estimator splits samples - into blocks for computational efficiency. Larger values are more - computationally expensive but decrease the variance of the distance - estimate. Having a smaller block size also gives a better estimate of the - standard error. - dtype: If not None, coerce activations to this dtype before computations. - - Returns: - The Kernel Inception Distance. A floating-point scalar of the same type - as the output of the activations. - An estimate of the standard error of the distance estimator (a scalar of - the same type). 
- """ - - real_activations.shape.assert_has_rank(2) - generated_activations.shape.assert_has_rank(2) - real_activations.shape[1].assert_is_compatible_with( - generated_activations.shape[1]) - - if dtype is None: - dtype = real_activations.dtype - assert generated_activations.dtype == dtype - else: - real_activations = math_ops.cast(real_activations, dtype) - generated_activations = math_ops.cast(generated_activations, dtype) - - # Figure out how to split the activations into blocks of approximately - # equal size, with none larger than max_block_size. - n_r = array_ops.shape(real_activations)[0] - n_g = array_ops.shape(generated_activations)[0] - - n_bigger = math_ops.maximum(n_r, n_g) - n_blocks = math_ops.cast(math_ops.ceil(n_bigger / max_block_size), - dtypes.int32) - - v_r = n_r // n_blocks - v_g = n_g // n_blocks - - n_plusone_r = n_r - v_r * n_blocks - n_plusone_g = n_g - v_g * n_blocks - - sizes_r = array_ops.concat([ - array_ops.fill([n_blocks - n_plusone_r], v_r), - array_ops.fill([n_plusone_r], v_r + 1), - ], 0) - sizes_g = array_ops.concat([ - array_ops.fill([n_blocks - n_plusone_g], v_g), - array_ops.fill([n_plusone_g], v_g + 1), - ], 0) - - zero = array_ops.zeros([1], dtype=dtypes.int32) - inds_r = array_ops.concat([zero, math_ops.cumsum(sizes_r)], 0) - inds_g = array_ops.concat([zero, math_ops.cumsum(sizes_g)], 0) - - dim = math_ops.cast(real_activations.shape[1], dtype) - - def compute_kid_block(i): - """Computes the ith block of the KID estimate.""" - r_s = inds_r[i] - r_e = inds_r[i + 1] - r = real_activations[r_s:r_e] - m = math_ops.cast(r_e - r_s, dtype) - - g_s = inds_g[i] - g_e = inds_g[i + 1] - g = generated_activations[g_s:g_e] - n = math_ops.cast(g_e - g_s, dtype) - - k_rr = (math_ops.matmul(r, r, transpose_b=True) / dim + 1)**3 - k_rg = (math_ops.matmul(r, g, transpose_b=True) / dim + 1)**3 - k_gg = (math_ops.matmul(g, g, transpose_b=True) / dim + 1)**3 - return (-2 * math_ops.reduce_mean(k_rg) + - (math_ops.reduce_sum(k_rr) - math_ops.trace(k_rr)) / (m * (m - 1)) + - (math_ops.reduce_sum(k_gg) - math_ops.trace(k_gg)) / (n * (n - 1))) - - ests = map_fn.map_fn( - compute_kid_block, math_ops.range(n_blocks), dtype=dtype, back_prop=False) - - mn = math_ops.reduce_mean(ests) - - # nn_impl.moments doesn't use the Bessel correction, which we want here - n_blocks_ = math_ops.cast(n_blocks, dtype) - var = control_flow_ops.cond( - math_ops.less_equal(n_blocks, 1), - lambda: array_ops.constant(float('nan'), dtype=dtype), - lambda: math_ops.reduce_sum(math_ops.square(ests - mn)) / (n_blocks_ - 1)) - - return mn, math_ops.sqrt(var / n_blocks_) diff --git a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py deleted file mode 100644 index bc7c1057b47..00000000000 --- a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py +++ /dev/null @@ -1,566 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""Tests for TF-GAN classifier_metrics.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import tarfile -import tempfile - -from absl.testing import parameterized -import numpy as np -from scipy import linalg as scp_linalg - -from google.protobuf import text_format - -from tensorflow.contrib.gan.python.eval.python import classifier_metrics_impl as classifier_metrics -from tensorflow.core.framework import graph_pb2 -from tensorflow.python.framework import dtypes -from tensorflow.python.framework import ops -from tensorflow.python.ops import array_ops -from tensorflow.python.ops import variables -from tensorflow.python.platform import test - -mock = test.mock - - -def _numpy_softmax(x): - e_x = np.exp(x - np.max(x, axis=1)[:, None]) - return e_x / np.sum(e_x, axis=1)[:, None] - - -def _expected_inception_score(logits): - p = _numpy_softmax(logits) - q = np.expand_dims(np.mean(p, 0), 0) - per_example_logincscore = np.sum(p * (np.log(p) - np.log(q)), 1) - return np.exp(np.mean(per_example_logincscore)) - - -def _expected_mean_only_fid(real_imgs, gen_imgs): - m = np.mean(real_imgs, axis=0) - m_v = np.mean(gen_imgs, axis=0) - mean = np.square(m - m_v).sum() - mofid = mean - return mofid - - -def _expected_diagonal_only_fid(real_imgs, gen_imgs): - m = np.mean(real_imgs, axis=0) - m_v = np.mean(gen_imgs, axis=0) - var = np.var(real_imgs, axis=0) - var_v = np.var(gen_imgs, axis=0) - sqcc = np.sqrt(var * var_v) - mean = (np.square(m - m_v)).sum() - trace = (var + var_v - 2 * sqcc).sum() - dofid = mean + trace - return dofid - - -def _expected_fid(real_imgs, gen_imgs): - m = np.mean(real_imgs, axis=0) - m_v = np.mean(gen_imgs, axis=0) - sigma = np.cov(real_imgs, rowvar=False) - sigma_v = np.cov(gen_imgs, rowvar=False) - sqcc = scp_linalg.sqrtm(np.dot(sigma, sigma_v)) - mean = np.square(m - m_v).sum() - trace = np.trace(sigma + sigma_v - 2 * sqcc) - fid = mean + trace - return fid - - -def _expected_trace_sqrt_product(sigma, sigma_v): - return np.trace(scp_linalg.sqrtm(np.dot(sigma, sigma_v))) - - -def _expected_kid_and_std(real_imgs, gen_imgs, max_block_size=1024): - n_r, dim = real_imgs.shape - n_g = gen_imgs.shape[0] - - n_blocks = int(np.ceil(max(n_r, n_g) / max_block_size)) - - sizes_r = np.full(n_blocks, n_r // n_blocks) - to_patch = n_r - n_blocks * (n_r // n_blocks) - if to_patch > 0: - sizes_r[-to_patch:] += 1 - inds_r = np.r_[0, np.cumsum(sizes_r)] - assert inds_r[-1] == n_r - - sizes_g = np.full(n_blocks, n_g // n_blocks) - to_patch = n_g - n_blocks * (n_g // n_blocks) - if to_patch > 0: - sizes_g[-to_patch:] += 1 - inds_g = np.r_[0, np.cumsum(sizes_g)] - assert inds_g[-1] == n_g - - ests = [] - for i in range(n_blocks): - r = real_imgs[inds_r[i]:inds_r[i + 1]] - g = gen_imgs[inds_g[i]:inds_g[i + 1]] - - k_rr = (np.dot(r, r.T) / dim + 1)**3 - k_rg = (np.dot(r, g.T) / dim + 1)**3 - k_gg = (np.dot(g, g.T) / dim + 1)**3 - ests.append(-2 * k_rg.mean() + - k_rr[np.triu_indices_from(k_rr, k=1)].mean() + - k_gg[np.triu_indices_from(k_gg, k=1)].mean()) - - var = np.var(ests, ddof=1) if len(ests) > 1 else np.nan - return np.mean(ests), np.sqrt(var / len(ests)) - -# A dummy GraphDef string with the minimum number of Ops. 
-graphdef_string = """ -node { - name: "Mul" - op: "Placeholder" - attr { - key: "dtype" - value { - type: DT_FLOAT - } - } - attr { - key: "shape" - value { - shape { - dim { - size: -1 - } - dim { - size: 299 - } - dim { - size: 299 - } - dim { - size: 3 - } - } - } - } -} -node { - name: "logits" - op: "Placeholder" - attr { - key: "dtype" - value { - type: DT_FLOAT - } - } - attr { - key: "shape" - value { - shape { - dim { - size: -1 - } - dim { - size: 1001 - } - } - } - } -} -node { - name: "pool_3" - op: "Placeholder" - attr { - key: "dtype" - value { - type: DT_FLOAT - } - } - attr { - key: "shape" - value { - shape { - dim { - size: -1 - } - dim { - size: 2048 - } - } - } - } -} -versions { - producer: 24 -} -""" - - -def _get_dummy_graphdef(): - dummy_graphdef = graph_pb2.GraphDef() - text_format.Merge(graphdef_string, dummy_graphdef) - return dummy_graphdef - - -def _run_with_mock(function, *args, **kwargs): - with mock.patch.object( - classifier_metrics, - 'get_graph_def_from_url_tarball') as mock_tarball_getter: - mock_tarball_getter.return_value = _get_dummy_graphdef() - return function(*args, **kwargs) - - -class ClassifierMetricsTest(test.TestCase, parameterized.TestCase): - - @parameterized.named_parameters( - ('GraphDef', False), - ('DefaultGraphDefFn', True)) - def test_run_inception_graph(self, use_default_graph_def): - """Test `run_inception` graph construction.""" - batch_size = 7 - img = array_ops.ones([batch_size, 299, 299, 3]) - - if use_default_graph_def: - logits = _run_with_mock(classifier_metrics.run_inception, img) - else: - logits = classifier_metrics.run_inception(img, _get_dummy_graphdef()) - - self.assertIsInstance(logits, ops.Tensor) - logits.shape.assert_is_compatible_with([batch_size, 1001]) - - # Check that none of the model variables are trainable. - self.assertListEqual([], variables.trainable_variables()) - - @parameterized.named_parameters( - ('GraphDef', False), - ('DefaultGraphDefFn', True)) - def test_run_inception_graph_pool_output(self, use_default_graph_def): - """Test `run_inception` graph construction with pool output.""" - batch_size = 3 - img = array_ops.ones([batch_size, 299, 299, 3]) - - if use_default_graph_def: - pool = _run_with_mock( - classifier_metrics.run_inception, - img, - output_tensor=classifier_metrics.INCEPTION_FINAL_POOL) - else: - pool = classifier_metrics.run_inception( - img, _get_dummy_graphdef(), - output_tensor=classifier_metrics.INCEPTION_FINAL_POOL) - - self.assertIsInstance(pool, ops.Tensor) - pool.shape.assert_is_compatible_with([batch_size, 2048]) - - # Check that none of the model variables are trainable. - self.assertListEqual([], variables.trainable_variables()) - - def test_run_inception_multiple_outputs(self): - """Test `run_inception` graph construction with multiple outputs.""" - batch_size = 3 - img = array_ops.ones([batch_size, 299, 299, 3]) - logits, pool = _run_with_mock( - classifier_metrics.run_inception, - img, - output_tensor=[ - classifier_metrics.INCEPTION_OUTPUT, - classifier_metrics.INCEPTION_FINAL_POOL - ]) - - self.assertIsInstance(logits, ops.Tensor) - self.assertIsInstance(pool, ops.Tensor) - logits.shape.assert_is_compatible_with([batch_size, 1001]) - pool.shape.assert_is_compatible_with([batch_size, 2048]) - - # Check that none of the model variables are trainable. 
- self.assertListEqual([], variables.trainable_variables()) - - def test_inception_score_graph(self): - """Test `inception_score` graph construction.""" - score = _run_with_mock( - classifier_metrics.inception_score, - array_ops.zeros([6, 299, 299, 3]), - num_batches=3) - self.assertIsInstance(score, ops.Tensor) - score.shape.assert_has_rank(0) - - # Check that none of the model variables are trainable. - self.assertListEqual([], variables.trainable_variables()) - - def test_frechet_inception_distance_graph(self): - """Test `frechet_inception_distance` graph construction.""" - img = array_ops.ones([7, 299, 299, 3]) - distance = _run_with_mock( - classifier_metrics.frechet_inception_distance, img, img) - - self.assertIsInstance(distance, ops.Tensor) - distance.shape.assert_has_rank(0) - - # Check that none of the model variables are trainable. - self.assertListEqual([], variables.trainable_variables()) - - def test_kernel_inception_distance_graph(self): - """Test `frechet_inception_distance` graph construction.""" - img = array_ops.ones([7, 299, 299, 3]) - distance = _run_with_mock(classifier_metrics.kernel_inception_distance, img, - img) - - self.assertIsInstance(distance, ops.Tensor) - distance.shape.assert_has_rank(0) - - # Check that none of the model variables are trainable. - self.assertListEqual([], variables.trainable_variables()) - - def test_run_inception_multicall(self): - """Test that `run_inception` can be called multiple times.""" - for batch_size in (7, 3, 2): - img = array_ops.ones([batch_size, 299, 299, 3]) - _run_with_mock(classifier_metrics.run_inception, img) - - def test_invalid_input(self): - """Test that functions properly fail on invalid input.""" - with self.assertRaisesRegexp(ValueError, 'Shapes .* are incompatible'): - classifier_metrics.run_inception(array_ops.ones([7, 50, 50, 3])) - - p = array_ops.zeros([8, 10]) - p_logits = array_ops.zeros([8, 10]) - q = array_ops.zeros([10]) - with self.assertRaisesRegexp(ValueError, 'must be floating type'): - classifier_metrics._kl_divergence( - array_ops.zeros([8, 10], dtype=dtypes.int32), p_logits, q) - - with self.assertRaisesRegexp(ValueError, 'must be floating type'): - classifier_metrics._kl_divergence(p, - array_ops.zeros( - [8, 10], dtype=dtypes.int32), q) - - with self.assertRaisesRegexp(ValueError, 'must be floating type'): - classifier_metrics._kl_divergence(p, p_logits, - array_ops.zeros( - [10], dtype=dtypes.int32)) - - with self.assertRaisesRegexp(ValueError, 'must have rank 2'): - classifier_metrics._kl_divergence(array_ops.zeros([8]), p_logits, q) - - with self.assertRaisesRegexp(ValueError, 'must have rank 2'): - classifier_metrics._kl_divergence(p, array_ops.zeros([8]), q) - - with self.assertRaisesRegexp(ValueError, 'must have rank 1'): - classifier_metrics._kl_divergence(p, p_logits, array_ops.zeros([10, 8])) - - def test_inception_score_value(self): - """Test that `inception_score` gives the correct value.""" - logits = np.array( - [np.array([1, 2] * 500 + [4]), - np.array([4, 5] * 500 + [6])]) - unused_image = array_ops.zeros([2, 299, 299, 3]) - incscore = _run_with_mock(classifier_metrics.inception_score, unused_image) - - with self.cached_session(use_gpu=True) as sess: - incscore_np = sess.run(incscore, {'concat:0': logits}) - - self.assertAllClose(_expected_inception_score(logits), incscore_np) - - def test_mean_only_frechet_classifier_distance_value(self): - """Test that `frechet_classifier_distance` gives the correct value.""" - np.random.seed(0) - - pool_real_a = np.float32(np.random.randn(256, 
2048)) - pool_gen_a = np.float32(np.random.randn(256, 2048)) - - tf_pool_real_a = array_ops.constant(pool_real_a) - tf_pool_gen_a = array_ops.constant(pool_gen_a) - - mofid_op = classifier_metrics.mean_only_frechet_classifier_distance_from_activations( # pylint: disable=line-too-long - tf_pool_real_a, tf_pool_gen_a) - - with self.cached_session() as sess: - actual_mofid = sess.run(mofid_op) - - expected_mofid = _expected_mean_only_fid(pool_real_a, pool_gen_a) - - self.assertAllClose(expected_mofid, actual_mofid, 0.0001) - - def test_diagonal_only_frechet_classifier_distance_value(self): - """Test that `frechet_classifier_distance` gives the correct value.""" - np.random.seed(0) - - pool_real_a = np.float32(np.random.randn(256, 2048)) - pool_gen_a = np.float32(np.random.randn(256, 2048)) - - tf_pool_real_a = array_ops.constant(pool_real_a) - tf_pool_gen_a = array_ops.constant(pool_gen_a) - - dofid_op = classifier_metrics.diagonal_only_frechet_classifier_distance_from_activations( # pylint: disable=line-too-long - tf_pool_real_a, tf_pool_gen_a) - - with self.cached_session() as sess: - actual_dofid = sess.run(dofid_op) - - expected_dofid = _expected_diagonal_only_fid(pool_real_a, pool_gen_a) - - self.assertAllClose(expected_dofid, actual_dofid, 0.0001) - - def test_frechet_classifier_distance_value(self): - """Test that `frechet_classifier_distance` gives the correct value.""" - np.random.seed(0) - - # Make num_examples > num_features to ensure scipy's sqrtm function - # doesn't return a complex matrix. - test_pool_real_a = np.float32(np.random.randn(512, 256)) - test_pool_gen_a = np.float32(np.random.randn(512, 256)) - - fid_op = _run_with_mock( - classifier_metrics.frechet_classifier_distance, - test_pool_real_a, - test_pool_gen_a, - classifier_fn=lambda x: x) - - with self.cached_session() as sess: - actual_fid = sess.run(fid_op) - - expected_fid = _expected_fid(test_pool_real_a, test_pool_gen_a) - - self.assertAllClose(expected_fid, actual_fid, 0.0001) - - def test_frechet_classifier_distance_covariance(self): - """Test that `frechet_classifier_distance` takes covariance into account.""" - np.random.seed(0) - - # Make num_examples > num_features to ensure scipy's sqrtm function - # doesn't return a complex matrix. - test_pool_reals, test_pool_gens = [], [] - for i in range(1, 11, 2): - test_pool_reals.append(np.float32(np.random.randn(2048, 256) * i)) - test_pool_gens.append(np.float32(np.random.randn(2048, 256) * i)) - - fid_ops = [] - for i in range(len(test_pool_reals)): - fid_ops.append(_run_with_mock( - classifier_metrics.frechet_classifier_distance, - test_pool_reals[i], - test_pool_gens[i], - classifier_fn=lambda x: x)) - - fids = [] - with self.cached_session() as sess: - for fid_op in fid_ops: - fids.append(sess.run(fid_op)) - - # Check that the FIDs increase monotonically. 
- self.assertTrue(all(fid_a < fid_b for fid_a, fid_b in zip(fids, fids[1:]))) - - def test_kernel_classifier_distance_value(self): - """Test that `kernel_classifier_distance` gives the correct value.""" - np.random.seed(0) - - test_pool_real_a = np.float32(np.random.randn(512, 256)) - test_pool_gen_a = np.float32(np.random.randn(512, 256) * 1.1 + .05) - - kid_op = _run_with_mock( - classifier_metrics.kernel_classifier_distance_and_std, - test_pool_real_a, - test_pool_gen_a, - classifier_fn=lambda x: x, - max_block_size=600) - - with self.cached_session() as sess: - actual_kid, actual_std = sess.run(kid_op) - - expected_kid, expected_std = _expected_kid_and_std(test_pool_real_a, - test_pool_gen_a) - - self.assertAllClose(expected_kid, actual_kid, 0.001) - self.assertAllClose(expected_std, actual_std, 0.001) - - def test_kernel_classifier_distance_block_sizes(self): - """Test that `kernel_classifier_distance` works with unusual max_block_size - - values.. - """ - np.random.seed(0) - - test_pool_real_a = np.float32(np.random.randn(512, 256)) - test_pool_gen_a = np.float32(np.random.randn(768, 256) * 1.1 + .05) - - max_block_size = array_ops.placeholder(dtypes.int32, shape=()) - kid_op = _run_with_mock( - classifier_metrics.kernel_classifier_distance_and_std_from_activations, - array_ops.constant(test_pool_real_a), - array_ops.constant(test_pool_gen_a), - max_block_size=max_block_size) - - for block_size in [50, 512, 1000]: - with self.cached_session() as sess: - actual_kid, actual_std = sess.run(kid_op, {max_block_size: block_size}) - - expected_kid, expected_std = _expected_kid_and_std( - test_pool_real_a, test_pool_gen_a, max_block_size=block_size) - - self.assertAllClose(expected_kid, actual_kid, 0.001) - self.assertAllClose(expected_std, actual_std, 0.001) - - def test_trace_sqrt_product_value(self): - """Test that `trace_sqrt_product` gives the correct value.""" - np.random.seed(0) - - # Make num_examples > num_features to ensure scipy's sqrtm function - # doesn't return a complex matrix. - test_pool_real_a = np.float32(np.random.randn(512, 256)) - test_pool_gen_a = np.float32(np.random.randn(512, 256)) - - cov_real = np.cov(test_pool_real_a, rowvar=False) - cov_gen = np.cov(test_pool_gen_a, rowvar=False) - - trace_sqrt_prod_op = _run_with_mock(classifier_metrics.trace_sqrt_product, - cov_real, cov_gen) - - with self.cached_session() as sess: - # trace_sqrt_product: tsp - actual_tsp = sess.run(trace_sqrt_prod_op) - - expected_tsp = _expected_trace_sqrt_product(cov_real, cov_gen) - - self.assertAllClose(actual_tsp, expected_tsp, 0.01) - - def test_preprocess_image_graph(self): - """Test `preprocess_image` graph construction.""" - incorrectly_sized_image = array_ops.zeros([520, 240, 3]) - correct_image = classifier_metrics.preprocess_image( - images=incorrectly_sized_image) - _run_with_mock(classifier_metrics.run_inception, - array_ops.expand_dims(correct_image, 0)) - - def test_get_graph_def_from_url_tarball(self): - """Test `get_graph_def_from_url_tarball`.""" - # Write dummy binary GraphDef to tempfile. - with tempfile.NamedTemporaryFile(delete=False) as tmp_file: - tmp_file.write(_get_dummy_graphdef().SerializeToString()) - relative_path = os.path.relpath(tmp_file.name) - - # Create gzip tarball. 
- tar_dir = tempfile.mkdtemp() - tar_filename = os.path.join(tar_dir, 'tmp.tar.gz') - with tarfile.open(tar_filename, 'w:gz') as tar: - tar.add(relative_path) - - with mock.patch.object(classifier_metrics, 'urllib') as mock_urllib: - mock_urllib.request.urlretrieve.return_value = tar_filename, None - graph_def = classifier_metrics.get_graph_def_from_url_tarball( - 'unused_url', relative_path) - - self.assertIsInstance(graph_def, graph_pb2.GraphDef) - self.assertEqual(_get_dummy_graphdef(), graph_def) - - -if __name__ == '__main__': - test.main() diff --git a/tensorflow/contrib/gan/python/eval/python/eval_utils.py b/tensorflow/contrib/gan/python/eval/python/eval_utils.py deleted file mode 100644 index bb7327040c9..00000000000 --- a/tensorflow/contrib/gan/python/eval/python/eval_utils.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Utility file for visualizing generated images.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from tensorflow.contrib.gan.python.eval.python import eval_utils_impl -# pylint: disable=wildcard-import -from tensorflow.contrib.gan.python.eval.python.eval_utils_impl import * -# pylint: enable=wildcard-import -from tensorflow.python.util.all_util import remove_undocumented - -__all__ = eval_utils_impl.__all__ -remove_undocumented(__name__, __all__) diff --git a/tensorflow/contrib/gan/python/eval/python/eval_utils_impl.py b/tensorflow/contrib/gan/python/eval/python/eval_utils_impl.py deleted file mode 100644 index 6623b56c706..00000000000 --- a/tensorflow/contrib/gan/python/eval/python/eval_utils_impl.py +++ /dev/null @@ -1,134 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Utility file for visualizing generated images.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import math - -from tensorflow.python.framework import ops -from tensorflow.python.ops import array_ops - - -__all__ = [ - "image_grid", - "image_reshaper", -] - - -# TODO(joelshor): Make this a special case of `image_reshaper`. 
-def image_grid(input_tensor, grid_shape, image_shape=(32, 32), num_channels=3): - """Arrange a minibatch of images into a grid to form a single image. - - Args: - input_tensor: Tensor. Minibatch of images to format, either 4D - ([batch size, height, width, num_channels]) or flattened - ([batch size, height * width * num_channels]). - grid_shape: Sequence of int. The shape of the image grid, - formatted as [grid_height, grid_width]. - image_shape: Sequence of int. The shape of a single image, - formatted as [image_height, image_width]. - num_channels: int. The number of channels in an image. - - Returns: - Tensor representing a single image in which the input images have been - arranged into a grid. - - Raises: - ValueError: The grid shape and minibatch size don't match, or the image - shape and number of channels are incompatible with the input tensor. - """ - if grid_shape[0] * grid_shape[1] != int(input_tensor.shape[0]): - raise ValueError("Grid shape %s incompatible with minibatch size %i." % - (grid_shape, int(input_tensor.shape[0]))) - if len(input_tensor.shape) == 2: - num_features = image_shape[0] * image_shape[1] * num_channels - if int(input_tensor.shape[1]) != num_features: - raise ValueError("Image shape and number of channels incompatible with " - "input tensor.") - elif len(input_tensor.shape) == 4: - if (int(input_tensor.shape[1]) != image_shape[0] or - int(input_tensor.shape[2]) != image_shape[1] or - int(input_tensor.shape[3]) != num_channels): - raise ValueError("Image shape and number of channels incompatible with " - "input tensor.") - else: - raise ValueError("Unrecognized input tensor format.") - height, width = grid_shape[0] * image_shape[0], grid_shape[1] * image_shape[1] - input_tensor = array_ops.reshape( - input_tensor, tuple(grid_shape) + tuple(image_shape) + (num_channels,)) - input_tensor = array_ops.transpose(input_tensor, [0, 1, 3, 2, 4]) - input_tensor = array_ops.reshape( - input_tensor, [grid_shape[0], width, image_shape[0], num_channels]) - input_tensor = array_ops.transpose(input_tensor, [0, 2, 1, 3]) - input_tensor = array_ops.reshape( - input_tensor, [1, height, width, num_channels]) - return input_tensor - - -def _validate_images(images): - for img in images: - img.shape.assert_has_rank(3) - img.shape.assert_is_fully_defined() - if img.shape[-1] not in (1, 3): - raise ValueError("image_reshaper only supports 1 or 3 channel images.") - - -# TODO(joelshor): Move the dimension logic from Python to Tensorflow. -def image_reshaper(images, num_cols=None): - """A reshaped summary image. - - Returns an image that will contain all elements in the list and will be - laid out in a nearly-square tiling pattern (e.g. 11 images will lead to a - 3x4 tiled image). - - Args: - images: Image data to summarize. Can be an RGB or grayscale image, a list of - such images, or a set of RGB images concatenated along the depth - dimension. The shape of each image is assumed to be [batch_size, - height, width, depth]. - num_cols: (Optional) If provided, this is the number of columns in the final - output image grid. Otherwise, the number of columns is determined by - the number of images. - - Returns: - A summary image matching the input with automatic tiling if needed. - Output shape is [1, height, width, channels]. 
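A minimal NumPy sketch of the tiling that `image_grid` performs, assuming the batch size exactly fills the grid (no padding); `np_image_grid` is an illustrative name, and the transpose sequence is a slightly shorter equivalent of the one above.

```python
import numpy as np

def np_image_grid(batch, grid_shape):
    # batch: [N, H, W, C] with N == grid_h * grid_w (assumed in this sketch).
    grid_h, grid_w = grid_shape
    n, h, w, c = batch.shape
    assert n == grid_h * grid_w
    x = batch.reshape(grid_h, grid_w, h, w, c)
    x = x.transpose(0, 2, 1, 3, 4)                  # -> [grid_h, H, grid_w, W, C]
    return x.reshape(1, grid_h * h, grid_w * w, c)  # one tiled image

tiles = np.arange(25 * 32 * 32 * 3, dtype=np.float32).reshape(25, 32, 32, 3)
print(np_image_grid(tiles, (5, 5)).shape)  # (1, 160, 160, 3)
```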
- """ - if isinstance(images, ops.Tensor): - images = array_ops.unstack(images) - _validate_images(images) - - num_images = len(images) - num_columns = (num_cols if num_cols else - int(math.ceil(math.sqrt(num_images)))) - num_rows = int(math.ceil(float(num_images) / num_columns)) - rows = [images[x:x+num_columns] for x in range(0, num_images, num_columns)] - - # Add empty image tiles if the last row is incomplete. - num_short = num_rows * num_columns - num_images - assert num_short >= 0 and num_short < num_columns - if num_short > 0: - rows[-1].extend([array_ops.zeros_like(images[-1])] * num_short) - - # Convert each row from a list of tensors to a single tensor. - rows = [array_ops.concat(row, 1) for row in rows] - - # Stack rows vertically. - img = array_ops.concat(rows, 0) - - return array_ops.expand_dims(img, 0) diff --git a/tensorflow/contrib/gan/python/eval/python/eval_utils_test.py b/tensorflow/contrib/gan/python/eval/python/eval_utils_test.py deleted file mode 100644 index cfed4dc513e..00000000000 --- a/tensorflow/contrib/gan/python/eval/python/eval_utils_test.py +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Tests for eval_utils_test.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from tensorflow.contrib.gan.python.eval.python import eval_utils_impl as eval_utils -from tensorflow.python.ops import array_ops -from tensorflow.python.platform import test - - -class UtilsTest(test.TestCase): - - def test_image_grid(self): - eval_utils.image_grid( - input_tensor=array_ops.zeros([25, 32, 32, 3]), - grid_shape=(5, 5)) - - # TODO(joelshor): Add more `image_reshaper` tests. - def test_image_reshaper_image_list(self): - images = eval_utils.image_reshaper( - images=array_ops.unstack(array_ops.zeros([25, 32, 32, 3])), - num_cols=2) - images.shape.assert_is_compatible_with([1, 13 * 32, 2 * 32, 3]) - - def test_image_reshaper_image(self): - images = eval_utils.image_reshaper( - images=array_ops.zeros([25, 32, 32, 3]), - num_cols=2) - images.shape.assert_is_compatible_with([1, 13 * 32, 2 * 32, 3]) - - -if __name__ == '__main__': - test.main() diff --git a/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein.py b/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein.py deleted file mode 100644 index 326fcb3cdbf..00000000000 --- a/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Model evaluation tools for TF-GAN.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from tensorflow.contrib.gan.python.eval.python import sliced_wasserstein_impl -# pylint: disable=wildcard-import -from tensorflow.contrib.gan.python.eval.python.sliced_wasserstein_impl import * -# pylint: enable=wildcard-import -from tensorflow.python.util.all_util import remove_undocumented - -__all__ = sliced_wasserstein_impl.__all__ -remove_undocumented(__name__, __all__) diff --git a/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein_impl.py b/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein_impl.py deleted file mode 100644 index 9657d4e3d0c..00000000000 --- a/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein_impl.py +++ /dev/null @@ -1,283 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Implementation of Sliced Wasserstein Distance. - -Proposed in https://arxiv.org/abs/1710.10196 and the official Theano -implementation that we used as reference can be found here: -https://github.com/tkarras/progressive_growing_of_gans - -Note: this is not an exact distance but an approximation through random -projections. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np -from tensorflow.python.framework import constant_op -from tensorflow.python.framework import dtypes -from tensorflow.python.ops import array_ops -from tensorflow.python.ops import linalg_ops -from tensorflow.python.ops import math_ops -from tensorflow.python.ops import nn -from tensorflow.python.ops import nn_ops -from tensorflow.python.ops import random_ops -from tensorflow.python.ops import script_ops - -__all__ = ['sliced_wasserstein_distance'] -_GAUSSIAN_FILTER = np.float32([[1, 4, 6, 4, 1], [4, 16, 24, 16, 4], [ - 6, 24, 36, 24, 6 -], [4, 16, 24, 16, 4], [1, 4, 6, 4, 1]]).reshape([5, 5, 1, 1]) / 256.0 - - -def _laplacian_pyramid(batch, num_levels): - """Compute a Laplacian pyramid. - - Args: - batch: (tensor) The batch of images (batch, height, width, channels). - num_levels: (int) Desired number of hierarchical levels. - Returns: - List of tensors from the highest to lowest resolution. 
- """ - gaussian_filter = constant_op.constant(_GAUSSIAN_FILTER) - - def spatial_conv(batch, gain): - s = array_ops.shape(batch) - padded = array_ops.pad(batch, [[0, 0], [2, 2], [2, 2], [0, 0]], 'REFLECT') - xt = array_ops.transpose(padded, [0, 3, 1, 2]) - xt = array_ops.reshape(xt, [s[0] * s[3], s[1] + 4, s[2] + 4, 1]) - conv_out = nn_ops.conv2d(xt, gaussian_filter * gain, [1] * 4, 'VALID') - conv_xt = array_ops.reshape(conv_out, [s[0], s[3], s[1], s[2]]) - conv_xt = array_ops.transpose(conv_xt, [0, 2, 3, 1]) - return conv_xt - - def pyr_down(batch): # matches cv2.pyrDown() - return spatial_conv(batch, 1)[:, ::2, ::2] - - def pyr_up(batch): # matches cv2.pyrUp() - s = array_ops.shape(batch) - zeros = array_ops.zeros([3 * s[0], s[1], s[2], s[3]]) - res = array_ops.concat([batch, zeros], 0) - res = array_ops.batch_to_space(res, crops=[[0, 0], [0, 0]], block_size=2) - res = spatial_conv(res, 4) - return res - - pyramid = [math_ops.cast(batch, dtypes.float32)] - for _ in range(1, num_levels): - pyramid.append(pyr_down(pyramid[-1])) - pyramid[-2] -= pyr_up(pyramid[-1]) - return pyramid - - -def _batch_to_patches(batch, patches_per_image, patch_size): - """Extract patches from a batch. - - Args: - batch: (tensor) The batch of images (batch, height, width, channels). - patches_per_image: (int) Number of patches to extract per image. - patch_size: (int) Size of the patches (size, size, channels) to extract. - Returns: - Tensor (batch*patches_per_image, patch_size, patch_size, channels) of - patches. - """ - - def py_func_random_patches(batch): - """Numpy wrapper.""" - batch_size, height, width, channels = batch.shape - patch_count = patches_per_image * batch_size - hs = patch_size // 2 - # Randomly pick patches. - patch_id, y, x, chan = np.ogrid[0:patch_count, -hs:hs + 1, -hs:hs + 1, 0:3] - img_id = patch_id // patches_per_image - # pylint: disable=g-no-augmented-assignment - # Need explicit addition for broadcast to work properly. - y = y + np.random.randint(hs, height - hs, size=(patch_count, 1, 1, 1)) - x = x + np.random.randint(hs, width - hs, size=(patch_count, 1, 1, 1)) - # pylint: enable=g-no-augmented-assignment - idx = ((img_id * height + y) * width + x) * channels + chan - patches = batch.flat[idx] - return patches - - patches = script_ops.py_func( - py_func_random_patches, [batch], batch.dtype, stateful=False) - return patches - - -def _normalize_patches(patches): - """Normalize patches by their mean and standard deviation. - - Args: - patches: (tensor) The batch of patches (batch, size, size, channels). - Returns: - Tensor (batch, size, size, channels) of the normalized patches. - """ - patches = array_ops.concat(patches, 0) - mean, variance = nn.moments(patches, [1, 2, 3], keep_dims=True) - patches = (patches - mean) / math_ops.sqrt(variance) - return array_ops.reshape(patches, [array_ops.shape(patches)[0], -1]) - - -def _sort_rows(matrix, num_rows): - """Sort matrix rows by the last column. - - Args: - matrix: a matrix of values (row,col). - num_rows: (int) number of sorted rows to return from the matrix. - Returns: - Tensor (num_rows, col) of the sorted matrix top K rows. - """ - tmatrix = array_ops.transpose(matrix, [1, 0]) - sorted_tmatrix = nn_ops.top_k(tmatrix, num_rows)[0] - return array_ops.transpose(sorted_tmatrix, [1, 0]) - - -def _sliced_wasserstein(a, b, random_sampling_count, random_projection_dim): - """Compute the approximate sliced Wasserstein distance. - - Args: - a: (matrix) Distribution "a" of samples (row, col). 
- b: (matrix) Distribution "b" of samples (row, col). - random_sampling_count: (int) Number of random projections to average. - random_projection_dim: (int) Dimension of the random projection space. - Returns: - Float containing the approximate distance between "a" and "b". - """ - s = array_ops.shape(a) - means = [] - for _ in range(random_sampling_count): - # Random projection matrix. - proj = random_ops.random_normal( - [array_ops.shape(a)[1], random_projection_dim]) - proj *= math_ops.rsqrt( - math_ops.reduce_sum(math_ops.square(proj), 0, keepdims=True)) - # Project both distributions and sort them. - proj_a = math_ops.matmul(a, proj) - proj_b = math_ops.matmul(b, proj) - proj_a = _sort_rows(proj_a, s[0]) - proj_b = _sort_rows(proj_b, s[0]) - # Pairwise Wasserstein distance. - wdist = math_ops.reduce_mean(math_ops.abs(proj_a - proj_b)) - means.append(wdist) - return math_ops.reduce_mean(means) - - -def _sliced_wasserstein_svd(a, b): - """Compute the approximate sliced Wasserstein distance using an SVD. - - This is not part of the paper, it's a variant with possibly more accurate - measure. - - Args: - a: (matrix) Distribution "a" of samples (row, col). - b: (matrix) Distribution "b" of samples (row, col). - Returns: - Float containing the approximate distance between "a" and "b". - """ - s = array_ops.shape(a) - # Random projection matrix. - sig, u = linalg_ops.svd(array_ops.concat([a, b], 0))[:2] - proj_a, proj_b = array_ops.split(u * sig, 2, axis=0) - proj_a = _sort_rows(proj_a[:, ::-1], s[0]) - proj_b = _sort_rows(proj_b[:, ::-1], s[0]) - # Pairwise Wasserstein distance. - wdist = math_ops.reduce_mean(math_ops.abs(proj_a - proj_b)) - return wdist - - -def sliced_wasserstein_distance(real_images, - fake_images, - resolution_min=16, - patches_per_image=64, - patch_size=7, - random_sampling_count=1, - random_projection_dim=7 * 7 * 3, - use_svd=False): - """Compute the Wasserstein distance between two distributions of images. - - Note that measure vary with the number of images. Use 8192 images to get - numbers comparable to the ones in the original paper. - - Args: - real_images: (tensor) Real images (batch, height, width, channels). - fake_images: (tensor) Fake images (batch, height, width, channels). - resolution_min: (int) Minimum resolution for the Laplacian pyramid. - patches_per_image: (int) Number of patches to extract per image per - Laplacian level. - patch_size: (int) Width of a square patch. - random_sampling_count: (int) Number of random projections to average. - random_projection_dim: (int) Dimension of the random projection space. - use_svd: experimental method to compute a more accurate distance. - Returns: - List of tuples (distance_real, distance_fake) for each level of the - Laplacian pyramid from the highest resolution to the lowest. - distance_real is the Wasserstein distance between real images - distance_fake is the Wasserstein distance between real and fake images. - Raises: - ValueError: If the inputs shapes are incorrect. Input tensor dimensions - (batch, height, width, channels) are expected to be known at graph - construction time. In addition height and width must be the same and the - number of colors should be exactly 3. Real and fake images must have the - same size. - """ - height = real_images.shape[1] - real_images.shape.assert_is_compatible_with([None, None, height, 3]) - fake_images.shape.assert_is_compatible_with(real_images.shape) - - # Select resolutions. 
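A minimal NumPy sketch of the random-projection step described above, assuming both sample sets have the same number of rows; `np_sliced_wasserstein` and its defaults are illustrative only.

```python
import numpy as np

def np_sliced_wasserstein(a, b, random_sampling_count=32,
                          random_projection_dim=8, seed=0):
    # a, b: [n_samples, n_features] with equal sample counts (assumed in this sketch).
    rng = np.random.RandomState(seed)
    means = []
    for _ in range(random_sampling_count):
        proj = rng.normal(size=(a.shape[1], random_projection_dim))
        proj /= np.sqrt(np.square(proj).sum(axis=0, keepdims=True))  # unit-norm directions
        # Project both sample sets, sort each 1-D projection, compare pointwise.
        proj_a = np.sort(a.dot(proj), axis=0)
        proj_b = np.sort(b.dot(proj), axis=0)
        means.append(np.abs(proj_a - proj_b).mean())
    return float(np.mean(means))

a = np.random.RandomState(1).uniform(size=(256, 147))  # 147 = 7 * 7 * 3 patch features
b = np.random.RandomState(2).normal(size=(256, 147))
print(np_sliced_wasserstein(a, b))
```

Sorting each 1-D projection reduces the comparison to the closed-form 1-D Wasserstein distance, which is why only a sort and a mean absolute difference are needed per projection.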
- resolution_full = int(height) - resolution_min = min(resolution_min, resolution_full) - resolution_max = resolution_full - # Base loss of detail. - resolutions = [ - 2**i - for i in range( - int(np.log2(resolution_max)), - int(np.log2(resolution_min)) - 1, -1) - ] - - # Gather patches for each level of the Laplacian pyramids. - patches_real, patches_fake, patches_test = ( - [[] for _ in resolutions] for _ in range(3)) - for lod, level in enumerate( - _laplacian_pyramid(real_images, len(resolutions))): - patches_real[lod].append( - _batch_to_patches(level, patches_per_image, patch_size)) - patches_test[lod].append( - _batch_to_patches(level, patches_per_image, patch_size)) - - for lod, level in enumerate( - _laplacian_pyramid(fake_images, len(resolutions))): - patches_fake[lod].append( - _batch_to_patches(level, patches_per_image, patch_size)) - - for lod in range(len(resolutions)): - for patches in [patches_real, patches_test, patches_fake]: - patches[lod] = _normalize_patches(patches[lod]) - - # Evaluate scores. - scores = [] - for lod in range(len(resolutions)): - if not use_svd: - scores.append( - (_sliced_wasserstein(patches_real[lod], patches_test[lod], - random_sampling_count, random_projection_dim), - _sliced_wasserstein(patches_real[lod], patches_fake[lod], - random_sampling_count, random_projection_dim))) - else: - scores.append( - (_sliced_wasserstein_svd(patches_real[lod], patches_test[lod]), - _sliced_wasserstein_svd(patches_real[lod], patches_fake[lod]))) - return scores diff --git a/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein_test.py b/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein_test.py deleted file mode 100644 index ab909feae37..00000000000 --- a/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein_test.py +++ /dev/null @@ -1,131 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""Tests for Sliced Wasserstein Distance.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np -from scipy import ndimage -from tensorflow.contrib.gan.python.eval.python import sliced_wasserstein_impl as swd -from tensorflow.python.framework import dtypes -from tensorflow.python.ops import array_ops -from tensorflow.python.ops import random_ops -from tensorflow.python.platform import test - - -class ClassifierMetricsTest(test.TestCase): - - def test_laplacian_pyramid(self): - # The numpy/scipy code for reference estimation comes from: - # https://github.com/tkarras/progressive_growing_of_gans - gaussian_filter = np.float32([[1, 4, 6, 4, 1], [4, 16, 24, 16, 4], [ - 6, 24, 36, 24, 6 - ], [4, 16, 24, 16, 4], [1, 4, 6, 4, 1]]) / 256.0 - - def np_pyr_down(minibatch): # matches cv2.pyrDown() - assert minibatch.ndim == 4 - return ndimage.convolve( - minibatch, - gaussian_filter[np.newaxis, np.newaxis, :, :], - mode='mirror')[:, :, ::2, ::2] - - def np_pyr_up(minibatch): # matches cv2.pyrUp() - assert minibatch.ndim == 4 - s = minibatch.shape - res = np.zeros((s[0], s[1], s[2] * 2, s[3] * 2), minibatch.dtype) - res[:, :, ::2, ::2] = minibatch - return ndimage.convolve( - res, - gaussian_filter[np.newaxis, np.newaxis, :, :] * 4.0, - mode='mirror') - - def np_laplacian_pyramid(minibatch, num_levels): - # Note: there's a bug in the original SWD, fixed repeatability. - pyramid = [minibatch.astype('f').copy()] - for _ in range(1, num_levels): - pyramid.append(np_pyr_down(pyramid[-1])) - pyramid[-2] -= np_pyr_up(pyramid[-1]) - return pyramid - - data = np.random.normal(size=[256, 3, 32, 32]).astype('f') - pyramid = np_laplacian_pyramid(data, 3) - data_tf = array_ops.placeholder(dtypes.float32, [256, 32, 32, 3]) - pyramid_tf = swd._laplacian_pyramid(data_tf, 3) - with self.cached_session() as sess: - pyramid_tf = sess.run( - pyramid_tf, feed_dict={ - data_tf: data.transpose(0, 2, 3, 1) - }) - for x in range(3): - self.assertAllClose( - pyramid[x].transpose(0, 2, 3, 1), pyramid_tf[x], atol=1e-6) - - def test_sliced_wasserstein_distance(self): - """Test the distance.""" - d1 = random_ops.random_uniform([256, 32, 32, 3]) - d2 = random_ops.random_normal([256, 32, 32, 3]) - wfunc = swd.sliced_wasserstein_distance(d1, d2) - with self.cached_session() as sess: - wscores = [sess.run(x) for x in wfunc] - self.assertAllClose( - np.array([0.014, 0.014], 'f'), - np.array([x[0] for x in wscores], 'f'), - rtol=0.15) - self.assertAllClose( - np.array([0.014, 0.020], 'f'), - np.array([x[1] for x in wscores], 'f'), - rtol=0.15) - - def test_sliced_wasserstein_distance_svd(self): - """Test the distance.""" - d1 = random_ops.random_uniform([256, 32, 32, 3]) - d2 = random_ops.random_normal([256, 32, 32, 3]) - wfunc = swd.sliced_wasserstein_distance(d1, d2, use_svd=True) - with self.cached_session() as sess: - wscores = [sess.run(x) for x in wfunc] - self.assertAllClose( - np.array([0.013, 0.013], 'f'), - np.array([x[0] for x in wscores], 'f'), - rtol=0.15) - self.assertAllClose( - np.array([0.014, 0.019], 'f'), - np.array([x[1] for x in wscores], 'f'), - rtol=0.15) - - def test_swd_mismatched(self): - """Test the inputs mismatched shapes are detected.""" - d1 = random_ops.random_uniform([256, 32, 32, 3]) - d2 = random_ops.random_normal([256, 32, 31, 3]) - d3 = random_ops.random_normal([256, 31, 32, 3]) - d4 = random_ops.random_normal([255, 32, 32, 3]) - with 
self.assertRaises(ValueError): - swd.sliced_wasserstein_distance(d1, d2) - with self.assertRaises(ValueError): - swd.sliced_wasserstein_distance(d1, d3) - with self.assertRaises(ValueError): - swd.sliced_wasserstein_distance(d1, d4) - - def test_swd_not_rgb(self): - """Test that only RGB is supported.""" - d1 = random_ops.random_uniform([256, 32, 32, 1]) - d2 = random_ops.random_normal([256, 32, 32, 1]) - with self.assertRaises(ValueError): - swd.sliced_wasserstein_distance(d1, d2) - - -if __name__ == '__main__': - test.main() diff --git a/tensorflow/contrib/gan/python/eval/python/summaries_impl.py b/tensorflow/contrib/gan/python/eval/python/summaries_impl.py deleted file mode 100644 index 3eb4f5db0c8..00000000000 --- a/tensorflow/contrib/gan/python/eval/python/summaries_impl.py +++ /dev/null @@ -1,317 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Common TF-GAN summaries.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from tensorflow.contrib.gan.python import namedtuples -from tensorflow.contrib.gan.python.eval.python import eval_utils -from tensorflow.python.framework import dtypes -from tensorflow.python.framework import ops -from tensorflow.python.ops import array_ops -from tensorflow.python.ops import map_fn -from tensorflow.python.ops import math_ops -from tensorflow.python.ops import variable_scope -from tensorflow.python.ops.losses import util as loss_util -from tensorflow.python.summary import summary - -__all__ = [ - 'add_gan_model_image_summaries', - 'add_image_comparison_summaries', - 'add_gan_model_summaries', - 'add_regularization_loss_summaries', - 'add_cyclegan_image_summaries', - 'add_stargan_image_summaries' -] - - -def _assert_is_image(data): - data.shape.assert_has_rank(4) - data.shape[1:].assert_is_fully_defined() - - -def add_gan_model_image_summaries(gan_model, grid_size=4, model_summaries=True): - """Adds image summaries for real and fake images. - - Args: - gan_model: A GANModel tuple. - grid_size: The size of an image grid. - model_summaries: Also add summaries of the model. - - Raises: - ValueError: If real and generated data aren't images. - """ - if isinstance(gan_model, namedtuples.CycleGANModel): - raise ValueError( - '`add_gan_model_image_summaries` does not take CycleGANModels. 
Please ' - 'use `add_cyclegan_image_summaries` instead.') - _assert_is_image(gan_model.real_data) - _assert_is_image(gan_model.generated_data) - - num_images = grid_size ** 2 - real_image_shape = gan_model.real_data.shape.as_list()[1:3] - generated_image_shape = gan_model.generated_data.shape.as_list()[1:3] - real_channels = gan_model.real_data.shape.as_list()[3] - generated_channels = gan_model.generated_data.shape.as_list()[3] - - summary.image( - 'real_data', - eval_utils.image_grid( - gan_model.real_data[:num_images], - grid_shape=(grid_size, grid_size), - image_shape=real_image_shape, - num_channels=real_channels), - max_outputs=1) - summary.image( - 'generated_data', - eval_utils.image_grid( - gan_model.generated_data[:num_images], - grid_shape=(grid_size, grid_size), - image_shape=generated_image_shape, - num_channels=generated_channels), - max_outputs=1) - - if model_summaries: - add_gan_model_summaries(gan_model) - - -def add_cyclegan_image_summaries(cyclegan_model): - """Adds image summaries for CycleGAN. - - There are two summaries, one for each generator. The first image is the - generator input, the second is the generator output, and the third is G(F(x)). - - Args: - cyclegan_model: A CycleGANModel tuple. - - Raises: - ValueError: If `cyclegan_model` isn't a CycleGANModel. - ValueError: If generated data, generator inputs, and reconstructions aren't - images. - ValueError: If the generator input, generated data, and reconstructions - aren't all the same size. - """ - if not isinstance(cyclegan_model, namedtuples.CycleGANModel): - raise ValueError('`cyclegan_model` was not a CycleGANModel. Instead, was ' - '%s' % type(cyclegan_model)) - - _assert_is_image(cyclegan_model.model_x2y.generator_inputs) - _assert_is_image(cyclegan_model.model_x2y.generated_data) - _assert_is_image(cyclegan_model.reconstructed_x) - _assert_is_image(cyclegan_model.model_y2x.generator_inputs) - _assert_is_image(cyclegan_model.model_y2x.generated_data) - _assert_is_image(cyclegan_model.reconstructed_y) - - def _add_comparison_summary(gan_model, reconstructions): - image_list = (array_ops.unstack(gan_model.generator_inputs[:1]) + - array_ops.unstack(gan_model.generated_data[:1]) + - array_ops.unstack(reconstructions[:1])) - summary.image( - 'image_comparison', eval_utils.image_reshaper( - image_list, num_cols=len(image_list)), max_outputs=1) - - with ops.name_scope('x2y_image_comparison_summaries'): - _add_comparison_summary( - cyclegan_model.model_x2y, cyclegan_model.reconstructed_x) - with ops.name_scope('y2x_image_comparison_summaries'): - _add_comparison_summary( - cyclegan_model.model_y2x, cyclegan_model.reconstructed_y) - - -def add_image_comparison_summaries(gan_model, num_comparisons=2, - display_diffs=False): - """Adds image summaries to compare triplets of images. - - The first image is the generator input, the second is the generator output, - and the third is the real data. This style of comparison is useful for - image translation problems, where the generator input is a corrupted image, - the generator output is the reconstruction, and the real data is the target. - - Args: - gan_model: A GANModel tuple. - num_comparisons: The number of image triplets to display. - display_diffs: Also display the difference between generated and target. - - Raises: - ValueError: If real data, generated data, and generator inputs aren't - images. - ValueError: If the generator input, real, and generated data aren't all the - same size. 
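A minimal NumPy sketch of the row layout this summary produces: one grid row per category (inputs, generated, real, and the diff row when `display_diffs=True`), with `num_comparisons` images per row; `comparison_strip` is an illustrative helper, not part of the TF-GAN API.

```python
import numpy as np

def comparison_strip(generator_inputs, generated_data, real_data):
    # All arguments: [N, H, W, C] arrays of identical shape (assumed in this sketch).
    diffs = np.abs(generated_data.astype(np.float32) - real_data.astype(np.float32))
    rows = [generator_inputs, generated_data, real_data, diffs]
    # Tile each category along the width, then stack the categories along the height.
    strip = np.concatenate(
        [np.concatenate(list(row), axis=1) for row in rows], axis=0)
    return strip[np.newaxis]  # [1, 4 * H, N * W, C]

inputs = np.zeros([2, 32, 32, 3], np.float32)
generated = np.ones([2, 32, 32, 3], np.float32)
real = np.full([2, 32, 32, 3], 0.5, np.float32)
print(comparison_strip(inputs, generated, real).shape)  # (1, 128, 64, 3)
```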
- """ - _assert_is_image(gan_model.generator_inputs) - _assert_is_image(gan_model.generated_data) - _assert_is_image(gan_model.real_data) - - gan_model.generated_data.shape.assert_is_compatible_with( - gan_model.generator_inputs.shape) - gan_model.real_data.shape.assert_is_compatible_with( - gan_model.generated_data.shape) - - image_list = [] - image_list.extend( - array_ops.unstack(gan_model.generator_inputs[:num_comparisons])) - image_list.extend( - array_ops.unstack(gan_model.generated_data[:num_comparisons])) - image_list.extend(array_ops.unstack(gan_model.real_data[:num_comparisons])) - if display_diffs: - generated_list = array_ops.unstack( - gan_model.generated_data[:num_comparisons]) - real_list = array_ops.unstack(gan_model.real_data[:num_comparisons]) - diffs = [ - math_ops.abs(math_ops.cast(generated, dtypes.float32) - - math_ops.cast(real, dtypes.float32)) - for generated, real in zip(generated_list, real_list) - ] - image_list.extend(diffs) - - # Reshape image and display. - summary.image( - 'image_comparison', - eval_utils.image_reshaper(image_list, num_cols=num_comparisons), - max_outputs=1) - - -def add_stargan_image_summaries(stargan_model, - num_images=2, - display_diffs=False): - """Adds image summaries to see StarGAN image results. - - If display_diffs is True, each image result has `2` rows and `num_domains + 1` - columns. - The first row looks like: - [original_image, transformed_to_domain_0, transformed_to_domain_1, ...] - The second row looks like: - [no_modification_baseline, transformed_to_domain_0-original_image, ...] - If display_diffs is False, only the first row is shown. - - IMPORTANT: - Since the model originally does not transformed the image to every domains, - we will transform them on-the-fly within this function in parallel. - - Args: - stargan_model: A StarGANModel tuple. - num_images: The number of examples/images to be transformed and shown. - display_diffs: Also display the difference between generated and target. - - Raises: - ValueError: If input_data is not images. - ValueError: If input_data_domain_label is not rank 2. - ValueError: If dimension 2 of input_data_domain_label is not fully defined. - """ - - _assert_is_image(stargan_model.input_data) - stargan_model.input_data_domain_label.shape.assert_has_rank(2) - stargan_model.input_data_domain_label.shape[1:].assert_is_fully_defined() - - num_domains = stargan_model.input_data_domain_label.get_shape().as_list()[-1] - - def _build_image(image): - """Helper function to create a result for each image on the fly.""" - - # Expand the first dimension as batch_size = 1. - images = array_ops.expand_dims(image, axis=0) - - # Tile the image num_domains times, so we can get all transformed together. - images = array_ops.tile(images, [num_domains, 1, 1, 1]) - - # Create the targets to 0, 1, 2, ..., num_domains-1. - targets = array_ops.one_hot(list(range(num_domains)), num_domains) - - with variable_scope.variable_scope( - stargan_model.generator_scope, reuse=True): - - # Add the original image. - output_images_list = [image] - - # Generate the image and add to the list. - gen_images = stargan_model.generator_fn(images, targets) - gen_images_list = array_ops.split(gen_images, num_domains) - gen_images_list = [ - array_ops.squeeze(img, axis=0) for img in gen_images_list - ] - output_images_list.extend(gen_images_list) - - # Display diffs. 
- if display_diffs: - diff_images = gen_images - images - diff_images_list = array_ops.split(diff_images, num_domains) - diff_images_list = [ - array_ops.squeeze(img, axis=0) for img in diff_images_list - ] - output_images_list.append(array_ops.zeros_like(image)) - output_images_list.extend(diff_images_list) - - # Create the final image. - final_image = eval_utils.image_reshaper( - output_images_list, num_cols=num_domains + 1) - - # Reduce the first rank. - return array_ops.squeeze(final_image, axis=0) - - summary.image( - 'stargan_image_generation', - map_fn.map_fn( - _build_image, - stargan_model.input_data[:num_images], - parallel_iterations=num_images, - back_prop=False, - swap_memory=True), - max_outputs=num_images) - - -def add_gan_model_summaries(gan_model): - """Adds typical GANModel summaries. - - Args: - gan_model: A GANModel tuple. - """ - if isinstance(gan_model, namedtuples.CycleGANModel): - with ops.name_scope('cyclegan_x2y_summaries'): - add_gan_model_summaries(gan_model.model_x2y) - with ops.name_scope('cyclegan_y2x_summaries'): - add_gan_model_summaries(gan_model.model_y2x) - return - - with ops.name_scope('generator_variables'): - for var in gan_model.generator_variables: - summary.histogram(var.name, var) - with ops.name_scope('discriminator_variables'): - for var in gan_model.discriminator_variables: - summary.histogram(var.name, var) - - -def add_regularization_loss_summaries(gan_model): - """Adds summaries for a regularization losses.. - - Args: - gan_model: A GANModel tuple. - """ - if isinstance(gan_model, namedtuples.CycleGANModel): - with ops.name_scope('cyclegan_x2y_regularization_loss_summaries'): - add_regularization_loss_summaries(gan_model.model_x2y) - with ops.name_scope('cyclegan_y2x_regularization_loss_summaries'): - add_regularization_loss_summaries(gan_model.model_y2x) - return - - if gan_model.generator_scope: - summary.scalar( - 'generator_regularization_loss', - loss_util.get_regularization_loss(gan_model.generator_scope.name)) - if gan_model.discriminator_scope: - summary.scalar( - 'discriminator_regularization_loss', - loss_util.get_regularization_loss(gan_model.discriminator_scope.name)) diff --git a/tensorflow/contrib/gan/python/eval/python/summaries_test.py b/tensorflow/contrib/gan/python/eval/python/summaries_test.py deleted file mode 100644 index 53fc7cb8ede..00000000000 --- a/tensorflow/contrib/gan/python/eval/python/summaries_test.py +++ /dev/null @@ -1,186 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""Tests for TF-GAN summaries.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from tensorflow.contrib.gan.python import namedtuples -from tensorflow.contrib.gan.python.eval.python import summaries_impl as summaries -from tensorflow.python.framework import ops -from tensorflow.python.ops import array_ops -from tensorflow.python.ops import variable_scope -from tensorflow.python.ops import variables -from tensorflow.python.platform import test -from tensorflow.python.summary import summary - - -def generator_model(inputs): - return variable_scope.get_variable('dummy_g', initializer=2.0) * inputs - - -def discriminator_model(inputs, _): - return variable_scope.get_variable('dummy_d', initializer=2.0) * inputs - - -def stargan_generator_model(inputs, _): - return generator_model(inputs) - - -def get_gan_model(): - # TODO(joelshor): Find a better way of creating a variable scope. - with variable_scope.variable_scope('generator') as gen_scope: - pass - with variable_scope.variable_scope('discriminator') as dis_scope: - pass - return namedtuples.GANModel( - generator_inputs=array_ops.zeros([4, 32, 32, 3]), - generated_data=array_ops.zeros([4, 32, 32, 3]), - generator_variables=[variables.Variable(0), variables.Variable(1)], - generator_scope=gen_scope, - generator_fn=generator_model, - real_data=array_ops.ones([4, 32, 32, 3]), - discriminator_real_outputs=array_ops.ones([1, 2, 3]), - discriminator_gen_outputs=array_ops.ones([1, 2, 3]), - discriminator_variables=[variables.Variable(0)], - discriminator_scope=dis_scope, - discriminator_fn=discriminator_model) - - -def get_stargan_model(): - """Similar to get_gan_model().""" - # TODO(joelshor): Find a better way of creating a variable scope. 
- with variable_scope.variable_scope('discriminator') as dis_scope: - pass - with variable_scope.variable_scope('generator') as gen_scope: - return namedtuples.StarGANModel( - input_data=array_ops.ones([1, 2, 2, 3]), - input_data_domain_label=array_ops.ones([1, 2]), - generated_data=stargan_generator_model( - array_ops.ones([1, 2, 2, 3]), None), - generated_data_domain_target=array_ops.ones([1, 2]), - reconstructed_data=array_ops.ones([1, 2, 2, 3]), - discriminator_input_data_source_predication=array_ops.ones([1]), - discriminator_generated_data_source_predication=array_ops.ones([1]), - discriminator_input_data_domain_predication=array_ops.ones([1, 2]), - discriminator_generated_data_domain_predication=array_ops.ones([1, 2]), - generator_variables=None, - generator_scope=gen_scope, - generator_fn=stargan_generator_model, - discriminator_variables=None, - discriminator_scope=dis_scope, - discriminator_fn=discriminator_model) - - -def get_cyclegan_model(): - with variable_scope.variable_scope('x2y'): - model_x2y = get_gan_model() - with variable_scope.variable_scope('y2x'): - model_y2x = get_gan_model() - return namedtuples.CycleGANModel( - model_x2y=model_x2y, - model_y2x=model_y2x, - reconstructed_x=array_ops.zeros([4, 32, 32, 3]), - reconstructed_y=array_ops.zeros([4, 32, 32, 3])) - - -class SummariesTest(test.TestCase): - - def _test_add_gan_model_image_summaries_impl( - self, get_model_fn, expected_num_summary_ops, model_summaries): - summaries.add_gan_model_image_summaries(get_model_fn(), grid_size=2, - model_summaries=model_summaries) - - self.assertEquals(expected_num_summary_ops, - len(ops.get_collection(ops.GraphKeys.SUMMARIES))) - with self.test_session(use_gpu=True): - variables.global_variables_initializer().run() - summary.merge_all().eval() - - def test_add_gan_model_image_summaries(self): - self._test_add_gan_model_image_summaries_impl(get_gan_model, 5, True) - - def test_add_gan_model_image_summaries_no_model(self): - self._test_add_gan_model_image_summaries_impl(get_gan_model, 2, False) - - def test_cyclegan_image_summaries_dont_work(self): - with self.assertRaises(ValueError): - summaries.add_gan_model_image_summaries(get_cyclegan_model()) - - def _test_add_gan_model_summaries_impl(self, get_model_fn, - expected_num_summary_ops): - summaries.add_gan_model_summaries(get_model_fn()) - - self.assertEquals(expected_num_summary_ops, - len(ops.get_collection(ops.GraphKeys.SUMMARIES))) - with self.test_session(use_gpu=True): - variables.global_variables_initializer().run() - summary.merge_all().eval() - - def test_add_gan_model_summaries(self): - self._test_add_gan_model_summaries_impl(get_gan_model, 3) - - def test_add_gan_model_summaries_for_cyclegan(self): - self._test_add_gan_model_summaries_impl(get_cyclegan_model, 6) - - def _test_add_regularization_loss_summaries_impl(self, get_model_fn, - expected_num_summary_ops): - summaries.add_regularization_loss_summaries(get_model_fn()) - - self.assertEquals(expected_num_summary_ops, - len(ops.get_collection(ops.GraphKeys.SUMMARIES))) - with self.test_session(use_gpu=True): - summary.merge_all().eval() - - def test_add_regularization_loss_summaries(self): - self._test_add_regularization_loss_summaries_impl(get_gan_model, 2) - - def test_add_regularization_loss_summaries_for_cyclegan(self): - self._test_add_regularization_loss_summaries_impl(get_cyclegan_model, 4) - - # TODO(joelshor): Add correctness test. 
- def _test_add_image_comparison_summaries_impl(self, get_model_fn, - expected_num_summary_ops): - summaries.add_image_comparison_summaries(get_model_fn(), display_diffs=True) - - self.assertEquals(expected_num_summary_ops, - len(ops.get_collection(ops.GraphKeys.SUMMARIES))) - with self.test_session(use_gpu=True): - summary.merge_all().eval() - - def test_add_image_comparison_summaries(self): - self._test_add_image_comparison_summaries_impl(get_gan_model, 1) - - def test_add_image_comparison_summaries_for_cyclegan(self): - summaries.add_cyclegan_image_summaries(get_cyclegan_model()) - - self.assertEquals(2, len(ops.get_collection(ops.GraphKeys.SUMMARIES))) - with self.test_session(use_gpu=True): - summary.merge_all().eval() - - def test_add_image_comparison_summaries_for_stargan(self): - - summaries.add_stargan_image_summaries(get_stargan_model()) - - self.assertEquals(1, len(ops.get_collection(ops.GraphKeys.SUMMARIES))) - - with self.test_session(use_gpu=True) as sess: - sess.run(variables.global_variables_initializer()) - summary.merge_all().eval() - - -if __name__ == '__main__': - test.main() diff --git a/tensorflow/contrib/gan/python/features/__init__.py b/tensorflow/contrib/gan/python/features/__init__.py deleted file mode 100644 index 410c3a02052..00000000000 --- a/tensorflow/contrib/gan/python/features/__init__.py +++ /dev/null @@ -1,47 +0,0 @@ -# Copyright 2017 Google Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""TFGAN features module. - -This module includes support for virtual batch normalization, buffer replay, -conditioning, etc. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -# Collapse features into a single namespace. 
-# pylint: disable=unused-import,wildcard-import -from tensorflow.contrib.gan.python.features.python import clip_weights -from tensorflow.contrib.gan.python.features.python import conditioning_utils -from tensorflow.contrib.gan.python.features.python import random_tensor_pool -from tensorflow.contrib.gan.python.features.python import spectral_normalization -from tensorflow.contrib.gan.python.features.python import virtual_batchnorm - -from tensorflow.contrib.gan.python.features.python.clip_weights import * -from tensorflow.contrib.gan.python.features.python.conditioning_utils import * -from tensorflow.contrib.gan.python.features.python.random_tensor_pool import * -from tensorflow.contrib.gan.python.features.python.spectral_normalization import * -from tensorflow.contrib.gan.python.features.python.virtual_batchnorm import * -# pylint: enable=unused-import,wildcard-import - -from tensorflow.python.util.all_util import remove_undocumented - -_allowed_symbols = clip_weights.__all__ -_allowed_symbols += conditioning_utils.__all__ -_allowed_symbols += random_tensor_pool.__all__ -_allowed_symbols += spectral_normalization.__all__ -_allowed_symbols += virtual_batchnorm.__all__ -remove_undocumented(__name__, _allowed_symbols) diff --git a/tensorflow/contrib/gan/python/features/python/clip_weights.py b/tensorflow/contrib/gan/python/features/python/clip_weights.py deleted file mode 100644 index fa76fd7928f..00000000000 --- a/tensorflow/contrib/gan/python/features/python/clip_weights.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Utilities to clip weights.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from tensorflow.contrib.gan.python.features.python import clip_weights_impl -# pylint: disable=wildcard-import -from tensorflow.contrib.gan.python.features.python.clip_weights_impl import * -# pylint: enable=wildcard-import -from tensorflow.python.util.all_util import remove_undocumented - -__all__ = clip_weights_impl.__all__ -remove_undocumented(__name__, __all__) diff --git a/tensorflow/contrib/gan/python/features/python/clip_weights_impl.py b/tensorflow/contrib/gan/python/features/python/clip_weights_impl.py deleted file mode 100644 index 96fbb8186d7..00000000000 --- a/tensorflow/contrib/gan/python/features/python/clip_weights_impl.py +++ /dev/null @@ -1,80 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Utilities to clip weights. - -This is useful in the original formulation of the Wasserstein loss, which -requires that the discriminator be K-Lipschitz. See -https://arxiv.org/pdf/1701.07875 for more details. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from tensorflow.contrib.opt.python.training import variable_clipping_optimizer - - -__all__ = [ - 'clip_variables', - 'clip_discriminator_weights', -] - - -def clip_discriminator_weights(optimizer, model, weight_clip): - """Modifies an optimizer so it clips weights to a certain value. - - Args: - optimizer: An optimizer to perform variable weight clipping. - model: A GANModel namedtuple. - weight_clip: Positive python float to clip discriminator weights. Used to - enforce a K-lipschitz condition, which is useful for some GAN training - schemes (ex WGAN: https://arxiv.org/pdf/1701.07875). - - Returns: - An optimizer to perform weight clipping after updates. - - Raises: - ValueError: If `weight_clip` is less than 0. - """ - return clip_variables(optimizer, model.discriminator_variables, weight_clip) - - -def clip_variables(optimizer, variables, weight_clip): - """Modifies an optimizer so it clips weights to a certain value. - - Args: - optimizer: An optimizer to perform variable weight clipping. - variables: A list of TensorFlow variables. - weight_clip: Positive python float to clip discriminator weights. Used to - enforce a K-lipschitz condition, which is useful for some GAN training - schemes (ex WGAN: https://arxiv.org/pdf/1701.07875). - - Returns: - An optimizer to perform weight clipping after updates. - - Raises: - ValueError: If `weight_clip` is less than 0. - """ - if weight_clip < 0: - raise ValueError( - '`discriminator_weight_clip` must be positive. Instead, was %s', - weight_clip) - return variable_clipping_optimizer.VariableClippingOptimizer( - opt=optimizer, - # Do no reduction, so clipping happens per-value. - vars_to_clip_dims={var: [] for var in variables}, - max_norm=weight_clip, - use_locking=True, - colocate_clip_ops_with_vars=True) diff --git a/tensorflow/contrib/gan/python/features/python/clip_weights_test.py b/tensorflow/contrib/gan/python/features/python/clip_weights_test.py deleted file mode 100644 index e4fac1976d6..00000000000 --- a/tensorflow/contrib/gan/python/features/python/clip_weights_test.py +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
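# A minimal stand-alone sketch of the weight-clipping idea wrapped by
# clip_variables above, assuming TF 1.x graph mode. `discriminator_vars` is a
# hypothetical list of discriminator variables; the contrib implementation
# uses VariableClippingOptimizer instead, but the effect is the same: keep
# every discriminator weight inside [-weight_clip, weight_clip] after each
# update, as the original WGAN formulation requires.
import tensorflow as tf  # assumed TF 1.x

def clipped_minimize(loss, discriminator_vars, learning_rate=1e-4,
                     weight_clip=0.01):
  """Runs one optimizer step, then clips each variable to [-c, c]."""
  if weight_clip <= 0:
    raise ValueError('`weight_clip` must be positive.')
  opt = tf.train.GradientDescentOptimizer(learning_rate)
  update_op = opt.minimize(loss, var_list=discriminator_vars)
  with tf.control_dependencies([update_op]):
    clip_ops = [
        var.assign(tf.clip_by_value(var, -weight_clip, weight_clip))
        for var in discriminator_vars
    ]
  return tf.group(*clip_ops)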
-# ============================================================================== -"""Tests for features.clip_weights.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import collections - -from tensorflow.contrib.gan.python.features.python import clip_weights_impl as clip_weights - -from tensorflow.python.ops import variables -from tensorflow.python.platform import test -from tensorflow.python.training import training - - -class ClipWeightsTest(test.TestCase): - """Tests for `discriminator_weight_clip`.""" - - def setUp(self): - super(ClipWeightsTest, self).setUp() - self.variables = [variables.Variable(2.0)] - self.tuple = collections.namedtuple( - 'VarTuple', ['discriminator_variables'])(self.variables) - - def _test_weight_clipping_helper(self, use_tuple): - loss = self.variables[0] - opt = training.GradientDescentOptimizer(1.0) - if use_tuple: - opt_clip = clip_weights.clip_variables(opt, self.variables, 0.1) - else: - opt_clip = clip_weights.clip_discriminator_weights(opt, self.tuple, 0.1) - - train_op1 = opt.minimize(loss, var_list=self.variables) - train_op2 = opt_clip.minimize(loss, var_list=self.variables) - - with self.cached_session(use_gpu=True) as sess: - sess.run(variables.global_variables_initializer()) - self.assertEqual(2.0, self.variables[0].eval()) - sess.run(train_op1) - self.assertLess(0.1, self.variables[0].eval()) - - with self.cached_session(use_gpu=True) as sess: - sess.run(variables.global_variables_initializer()) - self.assertEqual(2.0, self.variables[0].eval()) - sess.run(train_op2) - self.assertNear(0.1, self.variables[0].eval(), 1e-7) - - def test_weight_clipping_argsonly(self): - self._test_weight_clipping_helper(False) - - def test_weight_clipping_ganmodel(self): - self._test_weight_clipping_helper(True) - - def _test_incorrect_weight_clip_value_helper(self, use_tuple): - opt = training.GradientDescentOptimizer(1.0) - - if use_tuple: - with self.assertRaisesRegexp(ValueError, 'must be positive'): - clip_weights.clip_discriminator_weights(opt, self.tuple, weight_clip=-1) - else: - with self.assertRaisesRegexp(ValueError, 'must be positive'): - clip_weights.clip_variables(opt, self.variables, weight_clip=-1) - - def test_incorrect_weight_clip_value_argsonly(self): - self._test_incorrect_weight_clip_value_helper(False) - - def test_incorrect_weight_clip_value_tuple(self): - self._test_incorrect_weight_clip_value_helper(True) - - -if __name__ == '__main__': - test.main() diff --git a/tensorflow/contrib/gan/python/features/python/conditioning_utils.py b/tensorflow/contrib/gan/python/features/python/conditioning_utils.py deleted file mode 100644 index a9b8faa7126..00000000000 --- a/tensorflow/contrib/gan/python/features/python/conditioning_utils.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""Miscellaneous utilities for TFGAN code and examples.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from tensorflow.contrib.gan.python.features.python import conditioning_utils_impl -# pylint: disable=wildcard-import -from tensorflow.contrib.gan.python.features.python.conditioning_utils_impl import * -# pylint: enable=wildcard-import -from tensorflow.python.util.all_util import remove_undocumented - -__all__ = conditioning_utils_impl.__all__ -remove_undocumented(__name__, __all__) diff --git a/tensorflow/contrib/gan/python/features/python/conditioning_utils_impl.py b/tensorflow/contrib/gan/python/features/python/conditioning_utils_impl.py deleted file mode 100644 index 364fa4eb461..00000000000 --- a/tensorflow/contrib/gan/python/features/python/conditioning_utils_impl.py +++ /dev/null @@ -1,115 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Miscellaneous utilities for TFGAN code and examples. - -Includes: -1) Conditioning the value of a Tensor, based on techniques from - https://arxiv.org/abs/1609.03499. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from tensorflow.contrib.layers.python.layers import layers -from tensorflow.python.framework import tensor_util -from tensorflow.python.ops import array_ops -from tensorflow.python.ops import embedding_ops -from tensorflow.python.ops import math_ops -from tensorflow.python.ops import variable_scope - - -__all__ = [ - 'condition_tensor', - 'condition_tensor_from_onehot', -] - - -def _get_shape(tensor): - tensor_shape = array_ops.shape(tensor) - static_tensor_shape = tensor_util.constant_value(tensor_shape) - return (static_tensor_shape if static_tensor_shape is not None else - tensor_shape) - - -def condition_tensor(tensor, conditioning): - """Condition the value of a tensor. - - Conditioning scheme based on https://arxiv.org/abs/1609.03499. - - Args: - tensor: A minibatch tensor to be conditioned. - conditioning: A minibatch Tensor of to condition on. Must be 2D, with first - dimension the same as `tensor`. - - Returns: - `tensor` conditioned on `conditioning`. - - Raises: - ValueError: If the non-batch dimensions of `tensor` aren't fully defined. - ValueError: If `conditioning` isn't at least 2D. - ValueError: If the batch dimension for the input Tensors don't match. 
- """ - tensor.shape[1:].assert_is_fully_defined() - num_features = tensor.shape[1:].num_elements() - if conditioning.shape.ndims < 2: - raise ValueError('conditioning must be at least 2D, but saw shape: %s' - % conditioning.shape) - - mapped_conditioning = layers.linear( - layers.flatten(conditioning), num_features) - if not mapped_conditioning.shape.is_compatible_with(tensor.shape): - mapped_conditioning = array_ops.reshape( - mapped_conditioning, _get_shape(tensor)) - return tensor + mapped_conditioning - - -def _one_hot_to_embedding(one_hot, embedding_size): - """Get a dense embedding vector from a one-hot encoding.""" - num_tokens = one_hot.shape[1] - label_id = math_ops.argmax(one_hot, axis=1) - embedding = variable_scope.get_variable( - 'embedding', [num_tokens, embedding_size]) - return embedding_ops.embedding_lookup( - embedding, label_id, name='token_to_embedding') - - -def _validate_onehot(one_hot_labels): - one_hot_labels.shape.assert_has_rank(2) - one_hot_labels.shape[1:].assert_is_fully_defined() - - -def condition_tensor_from_onehot(tensor, one_hot_labels, embedding_size=256): - """Condition a tensor based on a one-hot tensor. - - Conditioning scheme based on https://arxiv.org/abs/1609.03499. - - Args: - tensor: Tensor to be conditioned. - one_hot_labels: A Tensor of one-hot labels. Shape is - [batch_size, num_classes]. - embedding_size: The size of the class embedding. - - Returns: - `tensor` conditioned on `one_hot_labels`. - - Raises: - ValueError: `one_hot_labels` isn't 2D, if non-batch dimensions aren't - fully defined, or if batch sizes don't match. - """ - _validate_onehot(one_hot_labels) - - conditioning = _one_hot_to_embedding(one_hot_labels, embedding_size) - return condition_tensor(tensor, conditioning) diff --git a/tensorflow/contrib/gan/python/features/python/conditioning_utils_test.py b/tensorflow/contrib/gan/python/features/python/conditioning_utils_test.py deleted file mode 100644 index f5c7d53cf2c..00000000000 --- a/tensorflow/contrib/gan/python/features/python/conditioning_utils_test.py +++ /dev/null @@ -1,76 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""Tests for tfgan.python.features.conditioning_utils.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from tensorflow.contrib.gan.python.features.python import conditioning_utils_impl as conditioning_utils - -from tensorflow.python.framework import dtypes -from tensorflow.python.ops import array_ops -from tensorflow.python.platform import test - - -class ConditioningUtilsTest(test.TestCase): - - def test_condition_tensor_multiple_shapes(self): - for tensor_shape in [(4, 1), (4, 2), (4, 2, 6), (None, 5, 3)]: - for conditioning_shape in [(4, 1), (4, 8), (4, 5, 3)]: - conditioning_utils.condition_tensor( - array_ops.placeholder(dtypes.float32, tensor_shape), - array_ops.placeholder(dtypes.float32, conditioning_shape)) - - def test_condition_tensor_asserts(self): - with self.assertRaisesRegexp(ValueError, 'Cannot reshape'): - conditioning_utils.condition_tensor( - array_ops.placeholder(dtypes.float32, (4, 1)), - array_ops.placeholder(dtypes.float32, (5, 1))) - - with self.assertRaisesRegexp(ValueError, 'Shape .* is not fully defined'): - conditioning_utils.condition_tensor( - array_ops.placeholder(dtypes.float32, (5, None)), - array_ops.placeholder(dtypes.float32, (5, 1))) - - with self.assertRaisesRegexp(ValueError, 'at least 2D'): - conditioning_utils.condition_tensor( - array_ops.placeholder(dtypes.float32, (5, 2)), - array_ops.placeholder(dtypes.float32, (5))) - - def test_condition_tensor_from_onehot(self): - conditioning_utils.condition_tensor_from_onehot( - array_ops.placeholder(dtypes.float32, (5, 4, 1)), - array_ops.placeholder(dtypes.float32, (5, 10))) - - def test_condition_tensor_from_onehot_asserts(self): - with self.assertRaisesRegexp(ValueError, 'Shape .* must have rank 2'): - conditioning_utils.condition_tensor_from_onehot( - array_ops.placeholder(dtypes.float32, (5, 1)), - array_ops.placeholder(dtypes.float32, (5))) - - with self.assertRaisesRegexp(ValueError, 'Shape .* is not fully defined'): - conditioning_utils.condition_tensor_from_onehot( - array_ops.placeholder(dtypes.float32, (5, 1)), - array_ops.placeholder(dtypes.float32, (5, None))) - - with self.assertRaisesRegexp(ValueError, 'Cannot reshape a tensor'): - conditioning_utils.condition_tensor_from_onehot( - array_ops.placeholder(dtypes.float32, (5, 1)), - array_ops.placeholder(dtypes.float32, (4, 6))) - - -if __name__ == '__main__': - test.main() diff --git a/tensorflow/contrib/gan/python/features/python/random_tensor_pool.py b/tensorflow/contrib/gan/python/features/python/random_tensor_pool.py deleted file mode 100644 index ca904971fa8..00000000000 --- a/tensorflow/contrib/gan/python/features/python/random_tensor_pool.py +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""A tensor pool stores values from an input tensor and returns a stored one. - -See the following papers for more details. -1) `Learning from simulated and unsupervised images through adversarial - training` (https://arxiv.org/abs/1612.07828). -2) `Unpaired Image-to-Image Translation using Cycle-Consistent Adversarial - Networks` (https://arxiv.org/abs/1703.10593). -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from tensorflow.contrib.gan.python.features.python import random_tensor_pool_impl -# pylint: disable=wildcard-import -from tensorflow.contrib.gan.python.features.python.random_tensor_pool_impl import * -# pylint: enable=wildcard-import -from tensorflow.python.util.all_util import remove_undocumented - -__all__ = random_tensor_pool_impl.__all__ -remove_undocumented(__name__, __all__) diff --git a/tensorflow/contrib/gan/python/features/python/random_tensor_pool_impl.py b/tensorflow/contrib/gan/python/features/python/random_tensor_pool_impl.py deleted file mode 100644 index ca2d724b49d..00000000000 --- a/tensorflow/contrib/gan/python/features/python/random_tensor_pool_impl.py +++ /dev/null @@ -1,134 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""A tensor pool stores values from an input tensor and returns a stored one. - -We use this to keep a history of values created by a generator, such that -a discriminator can randomly be trained on some older samples, not just the -current one. This can help to not let the discriminator get too far ahead of the -generator and also to keep the system from oscillating, if the discriminator -forgets too fast what past samples from the generator looked like. - -See the following papers for more details. -1) `Learning from simulated and unsupervised images through adversarial - training` (https://arxiv.org/abs/1612.07828). -2) `Unpaired Image-to-Image Translation using Cycle-Consistent Adversarial - Networks` (https://arxiv.org/abs/1703.10593). -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from tensorflow.python.framework import dtypes -from tensorflow.python.framework import ops -from tensorflow.python.ops import array_ops -from tensorflow.python.ops import control_flow_ops -from tensorflow.python.ops import data_flow_ops -from tensorflow.python.ops import random_ops -from tensorflow.python.util import nest - -__all__ = [ - 'tensor_pool', -] - - -def _to_list(x): - return [x] if isinstance(x, ops.Tensor) else list(x) - - -def tensor_pool(input_values, - pool_size=50, - pooling_probability=0.5, - name='tensor_pool'): - """Queue storing input values and returning random previously stored ones. 
- - Every time the returned `output_value` is evaluated, `input_value` is - evaluated and its value either directly returned (with - `1-pooling_probability`) or stored in the pool and a random one of the samples - currently in the pool is popped and returned. As long as the pool in not fully - filled, the input_value is always directly returned, as well as stored in the - pool. Note during inference / testing, it may be appropriate to set - `pool_size` = 0 or `pooling_probability` = 0. - - Args: - input_values: An arbitrarily nested structure of `tf.Tensors`, from which to - read values to be pooled. - pool_size: An integer specifying the maximum size of the pool. Defaults to - 50. - pooling_probability: A float `Tensor` specifying the probability of getting - a value from the pool, as opposed to just the current input. - name: A string prefix for the name scope for all tensorflow ops. - - Returns: - A nested structure of `Tensor` objects with the same structure as - `input_values`. With the given probability, the Tensor values are either the - same as in `input_values` or a randomly chosen sample that was previously - inserted in the pool. - - Raises: - ValueError: If `pool_size` is negative. - """ - pool_size = int(pool_size) - if pool_size < 0: - raise ValueError('`pool_size` is negative.') - elif pool_size == 0: - return input_values - - original_input_values = input_values - input_values = nest.flatten(input_values) - - with ops.name_scope('{}_pool_queue'.format(name), - values=input_values + [pooling_probability]): - pool_queue = data_flow_ops.RandomShuffleQueue( - capacity=pool_size, - min_after_dequeue=0, - dtypes=[v.dtype for v in input_values], - shapes=None) - - # In pseudo code this code does the following: - # if not pool_full: - # enqueue(input_values) - # return input_values - # else - # dequeue_values = dequeue_random_sample() - # enqueue(input_values) - # if rand() < pooling_probability: - # return dequeue_values - # else - # return input_values - - def _get_input_value_pooled(): - enqueue_op = pool_queue.enqueue(input_values) - with ops.control_dependencies([enqueue_op]): - return [array_ops.identity(v) for v in input_values] - - def _get_random_pool_value_and_enqueue_input(): - dequeue_values = _to_list(pool_queue.dequeue()) - with ops.control_dependencies(dequeue_values): - enqueue_op = pool_queue.enqueue(input_values) - with ops.control_dependencies([enqueue_op]): - prob = random_ops.random_uniform( - (), dtype=dtypes.float32) < pooling_probability - return control_flow_ops.cond(prob, lambda: dequeue_values, - lambda: input_values) - - output_values = _to_list(control_flow_ops.cond( - pool_queue.size() < pool_size, _get_input_value_pooled, - _get_random_pool_value_and_enqueue_input)) - - # Make sure that the shape of `output_value` is set. - for input_value, output_value in zip(input_values, output_values): - output_value.set_shape(input_value.shape) - - return nest.pack_sequence_as(original_input_values, output_values) diff --git a/tensorflow/contrib/gan/python/features/python/random_tensor_pool_test.py b/tensorflow/contrib/gan/python/features/python/random_tensor_pool_test.py deleted file mode 100644 index 3c9dfd6de02..00000000000 --- a/tensorflow/contrib/gan/python/features/python/random_tensor_pool_test.py +++ /dev/null @@ -1,135 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
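# A brief usage sketch for the `tensor_pool` function above, assuming TF 1.x
# with the tf.contrib.gan package still available. `generator_fn` and
# `discriminator_fn` are hypothetical model functions; the point is that the
# discriminator sees a mix of fresh and historical generator samples.
import tensorflow as tf  # assumed TF 1.x

def discriminator_on_pooled_samples(noise, generator_fn, discriminator_fn,
                                    pool_size=50):
  generated = generator_fn(noise)
  # With probability 0.5 (once the pool has filled), the current batch is
  # swapped for one previously stored in the pool; either way it is enqueued.
  pooled = tf.contrib.gan.features.tensor_pool(
      generated, pool_size=pool_size, pooling_probability=0.5)
  return discriminator_fn(pooled)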
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Tests for tf.contrib.gan.python.features.random_tensor_pool.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np - -from tensorflow.contrib.gan.python.features.python.random_tensor_pool_impl import tensor_pool -from tensorflow.python.framework import constant_op -from tensorflow.python.framework import dtypes -from tensorflow.python.framework import ops -from tensorflow.python.ops import array_ops -from tensorflow.python.platform import test - - -class TensorPoolTest(test.TestCase): - - def test_pool_unknown_input_shape(self): - """Checks that `input_value` can have unknown shape.""" - input_value = array_ops.placeholder( - dtype=dtypes.int32, shape=[None, None, 3]) - output_value = tensor_pool(input_value, pool_size=10) - self.assertEqual(output_value.shape.as_list(), [None, None, 3]) - - with self.session(use_gpu=True) as session: - for i in range(10): - session.run(output_value, {input_value: [[[i] * 3]]}) - session.run(output_value, {input_value: [[[i] * 3] * 2]}) - session.run(output_value, {input_value: [[[i] * 3] * 5] * 2}) - - def test_pool_sequence(self): - """Checks that values are pooled and returned maximally twice.""" - input_value = array_ops.placeholder(dtype=dtypes.int32, shape=[]) - output_value = tensor_pool(input_value, pool_size=10) - self.assertEqual(output_value.shape.as_list(), []) - - with self.session(use_gpu=True) as session: - outs = [] - for i in range(50): - out = session.run(output_value, {input_value: i}) - outs.append(out) - self.assertLessEqual(out, i) - - _, counts = np.unique(outs, return_counts=True) - # Check that each value is returned maximally twice. 
- self.assertTrue((counts <= 2).all()) - - def test_never_pool(self): - """Checks that setting `pooling_probability` to zero works.""" - input_value = array_ops.placeholder(dtype=dtypes.int32, shape=[]) - output_value = tensor_pool( - input_value, pool_size=10, pooling_probability=0.0) - self.assertEqual(output_value.shape.as_list(), []) - - with self.session(use_gpu=True) as session: - for i in range(50): - out = session.run(output_value, {input_value: i}) - self.assertEqual(out, i) - - def test_pooling_probability(self): - """Checks that `pooling_probability` works.""" - input_value = array_ops.placeholder(dtype=dtypes.int32, shape=[]) - pool_size = 10 - pooling_probability = 0.2 - output_value = tensor_pool( - input_value, - pool_size=pool_size, - pooling_probability=pooling_probability) - self.assertEqual(output_value.shape.as_list(), []) - - with self.session(use_gpu=True) as session: - not_pooled = 0 - total = 1000 - for i in range(total): - out = session.run(output_value, {input_value: i}) - if out == i: - not_pooled += 1 - self.assertAllClose( - (not_pooled - pool_size) / (total - pool_size), - 1 - pooling_probability, - atol=0.03) - - def test_input_values_tuple(self): - """Checks that `input_values` can be a tuple.""" - input_values = (array_ops.placeholder(dtype=dtypes.int32, shape=[]), - array_ops.placeholder(dtype=dtypes.int32, shape=[])) - output_values = tensor_pool(input_values, pool_size=3) - self.assertEqual(len(output_values), len(input_values)) - for output_value in output_values: - self.assertEqual(output_value.shape.as_list(), []) - - with self.session(use_gpu=True) as session: - for i in range(10): - outs = session.run(output_values, { - input_values[0]: i, - input_values[1]: i + 1 - }) - self.assertEqual(len(outs), len(input_values)) - self.assertEqual(outs[1] - outs[0], 1) - - def test_pool_preserves_shape(self): - t = constant_op.constant(1) - input_values = [[t, t, t], (t, t), t] - output_values = tensor_pool(input_values, pool_size=5) - print('stuff: ', output_values) - # Overall shape. - self.assertIsInstance(output_values, list) - self.assertEqual(3, len(output_values)) - # Shape of first element. - self.assertIsInstance(output_values[0], list) - self.assertEqual(3, len(output_values[0])) - # Shape of second element. - self.assertIsInstance(output_values[1], tuple) - self.assertEqual(2, len(output_values[1])) - # Shape of third element. - self.assertIsInstance(output_values[2], ops.Tensor) - - -if __name__ == '__main__': - test.main() diff --git a/tensorflow/contrib/gan/python/features/python/spectral_normalization.py b/tensorflow/contrib/gan/python/features/python/spectral_normalization.py deleted file mode 100644 index 54d3d0a218d..00000000000 --- a/tensorflow/contrib/gan/python/features/python/spectral_normalization.py +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright 2018 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""Keras-like layers and utilities that implement Spectral Normalization. - -Based on "Spectral Normalization for Generative Adversarial Networks" by Miyato, -et al in ICLR 2018. https://openreview.net/pdf?id=B1QRgziT- -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from tensorflow.contrib.gan.python.features.python import spectral_normalization_impl -# pylint: disable=wildcard-import -from tensorflow.contrib.gan.python.features.python.spectral_normalization_impl import * -# pylint: enable=wildcard-import -from tensorflow.python.util.all_util import remove_undocumented - -__all__ = spectral_normalization_impl.__all__ -remove_undocumented(__name__, __all__) diff --git a/tensorflow/contrib/gan/python/features/python/spectral_normalization_impl.py b/tensorflow/contrib/gan/python/features/python/spectral_normalization_impl.py deleted file mode 100644 index 9004be6229f..00000000000 --- a/tensorflow/contrib/gan/python/features/python/spectral_normalization_impl.py +++ /dev/null @@ -1,315 +0,0 @@ -# Copyright 2018 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Keras-like layers and utilities that implement Spectral Normalization. - -Based on "Spectral Normalization for Generative Adversarial Networks" by Miyato, -et al in ICLR 2018. https://openreview.net/pdf?id=B1QRgziT- -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import contextlib -import numbers -import re - -from tensorflow.python.framework import dtypes -from tensorflow.python.framework import ops -from tensorflow.python.keras.engine import base_layer_utils as keras_base_layer_utils -from tensorflow.python.ops import array_ops -from tensorflow.python.ops import init_ops -from tensorflow.python.ops import math_ops -from tensorflow.python.ops import nn -from tensorflow.python.ops import variable_scope -from tensorflow.python.platform import tf_logging as logging - -__all__ = [ - 'compute_spectral_norm', 'spectral_normalize', 'spectral_norm_regularizer', - 'spectral_normalization_custom_getter', 'keras_spectral_normalization' -] - -# tf.bfloat16 should work, but tf.matmul converts those to tf.float32 which then -# can't directly be assigned back to the tf.bfloat16 variable. -_OK_DTYPES_FOR_SPECTRAL_NORM = (dtypes.float16, dtypes.float32, dtypes.float64) -_PERSISTED_U_VARIABLE_SUFFIX = 'spectral_norm_u' - - -def compute_spectral_norm(w_tensor, power_iteration_rounds=1, name=None): - """Estimates the largest singular value in the weight tensor. - - Args: - w_tensor: The weight matrix whose spectral norm should be computed. - power_iteration_rounds: The number of iterations of the power method to - perform. A higher number yields a better approximation. - name: An optional scope name. 
- - Returns: - The largest singular value (the spectral norm) of w. - """ - with variable_scope.variable_scope(name, 'spectral_norm'): - # The paper says to flatten convnet kernel weights from - # (C_out, C_in, KH, KW) to (C_out, C_in * KH * KW). But TensorFlow's Conv2D - # kernel weight shape is (KH, KW, C_in, C_out), so it should be reshaped to - # (KH * KW * C_in, C_out), and similarly for other layers that put output - # channels as last dimension. - # n.b. this means that w here is equivalent to w.T in the paper. - w = array_ops.reshape(w_tensor, (-1, w_tensor.get_shape()[-1])) - - # Persisted approximation of first left singular vector of matrix `w`. - u_var = variable_scope.get_variable( - _PERSISTED_U_VARIABLE_SUFFIX, - shape=(w.shape[0], 1), - dtype=w.dtype, - initializer=init_ops.random_normal_initializer(), - trainable=False) - u = u_var - - # Use power iteration method to approximate spectral norm. - for _ in range(power_iteration_rounds): - # `v` approximates the first right singular vector of matrix `w`. - v = nn.l2_normalize(math_ops.matmul(array_ops.transpose(w), u)) - u = nn.l2_normalize(math_ops.matmul(w, v)) - - # Update persisted approximation. - with ops.control_dependencies([u_var.assign(u, name='update_u')]): - u = array_ops.identity(u) - - u = array_ops.stop_gradient(u) - v = array_ops.stop_gradient(v) - - # Largest singular value of `w`. - spectral_norm = math_ops.matmul( - math_ops.matmul(array_ops.transpose(u), w), v) - spectral_norm.shape.assert_is_fully_defined() - spectral_norm.shape.assert_is_compatible_with([1, 1]) - - return spectral_norm[0][0] - - -def spectral_normalize(w, power_iteration_rounds=1, name=None): - """Normalizes a weight matrix by its spectral norm. - - Args: - w: The weight matrix to be normalized. - power_iteration_rounds: The number of iterations of the power method to - perform. A higher number yields a better approximation. - name: An optional scope name. - - Returns: - A normalized weight matrix tensor. - """ - with variable_scope.variable_scope(name, 'spectral_normalize'): - w_normalized = w / compute_spectral_norm( - w, power_iteration_rounds=power_iteration_rounds) - return array_ops.reshape(w_normalized, w.get_shape()) - - -def spectral_norm_regularizer(scale, power_iteration_rounds=1, scope=None): - """Returns a functions that can be used to apply spectral norm regularization. - - Small spectral norms enforce a small Lipschitz constant, which is necessary - for Wasserstein GANs. - - Args: - scale: A scalar multiplier. 0.0 disables the regularizer. - power_iteration_rounds: The number of iterations of the power method to - perform. A higher number yields a better approximation. - scope: An optional scope name. - - Returns: - A function with the signature `sn(weights)` that applies spectral norm - regularization. - - Raises: - ValueError: If scale is negative or if scale is not a float. 
- """ - if isinstance(scale, numbers.Integral): - raise ValueError('scale cannot be an integer: %s' % scale) - if isinstance(scale, numbers.Real): - if scale < 0.0: - raise ValueError( - 'Setting a scale less than 0 on a regularizer: %g' % scale) - if scale == 0.0: - logging.info('Scale of 0 disables regularizer.') - return lambda _: None - - def sn(weights, name=None): - """Applies spectral norm regularization to weights.""" - with ops.name_scope(scope, 'SpectralNormRegularizer', [weights]) as name: - scale_t = ops.convert_to_tensor( - scale, dtype=weights.dtype.base_dtype, name='scale') - return math_ops.multiply( - scale_t, - compute_spectral_norm( - weights, power_iteration_rounds=power_iteration_rounds), - name=name) - - return sn - - -def _default_name_filter(name): - """A filter function to identify common names of weight variables. - - Args: - name: The variable name. - - Returns: - Whether `name` is a standard name for a weight/kernel variables used in the - Keras, tf.layers, tf.contrib.layers or tf.contrib.slim libraries. - """ - match = re.match(r'(.*\/)?(depthwise_|pointwise_)?(weights|kernel)$', name) - return match is not None - - -def spectral_normalization_custom_getter(name_filter=_default_name_filter, - power_iteration_rounds=1): - """Custom getter that performs Spectral Normalization on a weight tensor. - - Specifically it divides the weight tensor by its largest singular value. This - is intended to stabilize GAN training, by making the discriminator satisfy a - local 1-Lipschitz constraint. - - Based on [Spectral Normalization for Generative Adversarial Networks][sn-gan]. - - [sn-gan]: https://openreview.net/forum?id=B1QRgziT- - - To reproduce an SN-GAN, apply this custom_getter to every weight tensor of - your discriminator. The last dimension of the weight tensor must be the number - of output channels. - - Apply this to layers by supplying this as the `custom_getter` of a - `tf.compat.v1.variable_scope`. For example: - - with tf.compat.v1.variable_scope('discriminator', - custom_getter=spectral_norm_getter()): - net = discriminator_fn(net) - - IMPORTANT: Keras does not respect the custom_getter supplied by the - VariableScope, so Keras users should use `keras_spectral_normalization` - instead of (or in addition to) this approach. - - It is important to carefully select to which weights you want to apply - Spectral Normalization. In general you want to normalize the kernels of - convolution and dense layers, but you do not want to normalize biases. You - also want to avoid normalizing batch normalization (and similar) variables, - but in general such layers play poorly with Spectral Normalization, since the - gamma can cancel out the normalization in other layers. By default we supply a - filter that matches the kernel variable names of the dense and convolution - layers of the tf.layers, tf.contrib.layers, tf.keras and tf.contrib.slim - libraries. If you are using anything else you'll need a custom `name_filter`. - - This custom getter internally creates a variable used to compute the spectral - norm by power iteration. It will update every time the variable is accessed, - which means the normalized discriminator weights may change slightly whilst - training the generator. Whilst unusual, this matches how the paper's authors - implement it, and in general additional rounds of power iteration can't hurt. - - Args: - name_filter: Optionally, a method that takes a Variable name as input and - returns whether this Variable should be normalized. 
- power_iteration_rounds: The number of iterations of the power method to - perform per step. A higher number yields a better approximation of the - true spectral norm. - - Returns: - A custom getter function that applies Spectral Normalization to all - Variables whose names match `name_filter`. - - Raises: - ValueError: If name_filter is not callable. - """ - if not callable(name_filter): - raise ValueError('name_filter must be callable') - - def _internal_getter(getter, name, *args, **kwargs): - """A custom getter function that applies Spectral Normalization. - - Args: - getter: The true getter to call. - name: Name of new/existing variable, in the same format as - tf.get_variable. - *args: Other positional arguments, in the same format as tf.get_variable. - **kwargs: Keyword arguments, in the same format as tf.get_variable. - - Returns: - The return value of `getter(name, *args, **kwargs)`, spectrally - normalized. - - Raises: - ValueError: If used incorrectly, or if `dtype` is not supported. - """ - if not name_filter(name): - return getter(name, *args, **kwargs) - - if name.endswith(_PERSISTED_U_VARIABLE_SUFFIX): - raise ValueError( - 'Cannot apply Spectral Normalization to internal variables created ' - 'for Spectral Normalization. Tried to normalized variable [%s]' % - name) - - if kwargs['dtype'] not in _OK_DTYPES_FOR_SPECTRAL_NORM: - raise ValueError('Disallowed data type {}'.format(kwargs['dtype'])) - - # This layer's weight Variable/PartitionedVariable. - w_tensor = getter(name, *args, **kwargs) - - if len(w_tensor.get_shape()) < 2: - raise ValueError( - 'Spectral norm can only be applied to multi-dimensional tensors') - - return spectral_normalize( - w_tensor, - power_iteration_rounds=power_iteration_rounds, - name=(name + '/spectral_normalize')) - - return _internal_getter - - -@contextlib.contextmanager -def keras_spectral_normalization(name_filter=_default_name_filter, - power_iteration_rounds=1): - """A context manager that enables Spectral Normalization for Keras. - - Keras doesn't respect the `custom_getter` in the VariableScope, so this is a - bit of a hack to make things work. - - Usage: - with keras_spectral_normalization(): - net = discriminator_fn(net) - - Args: - name_filter: Optionally, a method that takes a Variable name as input and - returns whether this Variable should be normalized. - power_iteration_rounds: The number of iterations of the power method to - perform per step. A higher number yields a better approximation of the - true spectral norm. - - Yields: - A context manager that wraps the standard Keras variable creation method - with the `spectral_normalization_custom_getter`. - """ - original_make_variable = keras_base_layer_utils.make_variable - sn_getter = spectral_normalization_custom_getter( - name_filter=name_filter, power_iteration_rounds=power_iteration_rounds) - - def make_variable_wrapper(name, *args, **kwargs): - return sn_getter(original_make_variable, name, *args, **kwargs) - - keras_base_layer_utils.make_variable = make_variable_wrapper - - yield - - keras_base_layer_utils.make_variable = original_make_variable diff --git a/tensorflow/contrib/gan/python/features/python/spectral_normalization_test.py b/tensorflow/contrib/gan/python/features/python/spectral_normalization_test.py deleted file mode 100644 index 4ea21f70ec0..00000000000 --- a/tensorflow/contrib/gan/python/features/python/spectral_normalization_test.py +++ /dev/null @@ -1,354 +0,0 @@ -# Copyright 2018 The TensorFlow Authors. All Rights Reserved. 
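# A minimal NumPy sketch of the power-iteration estimate used by
# compute_spectral_norm above; `w` is a 2-D weight matrix reshaped so the
# output channels are the last dimension. This illustrates the algorithm
# only, not the TF implementation or its persisted `u` variable.
import numpy as np

def estimate_spectral_norm(w, power_iteration_rounds=1):
  # u approximates the first left singular vector of w; each round of power
  # iteration refines u and its right counterpart v.
  u = np.random.normal(size=(w.shape[0], 1))
  for _ in range(power_iteration_rounds):
    v = w.T.dot(u)
    v /= np.linalg.norm(v)
    u = w.dot(v)
    u /= np.linalg.norm(u)
  # The largest singular value of w is approximately u^T w v.
  return (u.T.dot(w).dot(v)).item()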
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Tests for features.spectral_normalization.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np - -from tensorflow.contrib import slim -from tensorflow.contrib.gan.python.features.python import spectral_normalization_impl as spectral_normalization -from tensorflow.contrib.layers.python.layers import layers as contrib_layers -from tensorflow.python.framework import dtypes -from tensorflow.python.framework import ops -from tensorflow.python.keras.layers import convolutional as keras_convolutional -from tensorflow.python.keras.layers import core as keras_core -from tensorflow.python.layers import convolutional as layers_convolutional -from tensorflow.python.layers import core as layers_core -from tensorflow.python.ops import array_ops -from tensorflow.python.ops import init_ops -from tensorflow.python.ops import linalg_ops -from tensorflow.python.ops import math_ops -from tensorflow.python.ops import variable_scope -from tensorflow.python.ops import variables -from tensorflow.python.platform import test - - -class SpectralNormalizationTest(test.TestCase): - - def testComputeSpectralNorm(self): - weights = variable_scope.get_variable( - 'w', dtype=dtypes.float32, shape=[2, 3, 50, 100]) - weights = math_ops.multiply(weights, 10.0) - s = linalg_ops.svd( - array_ops.reshape(weights, [-1, weights.shape[-1]]), compute_uv=False) - true_sn = s[..., 0] - estimated_sn = spectral_normalization.compute_spectral_norm(weights) - - with self.cached_session() as sess: - sess.run(variables.global_variables_initializer()) - np_true_sn = sess.run(true_sn) - for i in range(50): - est = sess.run(estimated_sn) - if i < 1: - np_est_1 = est - if i < 4: - np_est_5 = est - if i < 9: - np_est_10 = est - np_est_50 = est - - # Check that the estimate improves with more iterations. 
- self.assertAlmostEqual(np_true_sn, np_est_50, 0) - self.assertGreater( - abs(np_true_sn - np_est_10), abs(np_true_sn - np_est_50)) - self.assertGreater( - abs(np_true_sn - np_est_5), abs(np_true_sn - np_est_10)) - self.assertGreater(abs(np_true_sn - np_est_1), abs(np_true_sn - np_est_5)) - - def testSpectralNormalize(self): - weights = variable_scope.get_variable( - 'w', dtype=dtypes.float32, shape=[2, 3, 50, 100]) - weights = math_ops.multiply(weights, 10.0) - normalized_weights = spectral_normalization.spectral_normalize( - weights, power_iteration_rounds=1) - - unnormalized_sigma = linalg_ops.svd( - array_ops.reshape(weights, [-1, weights.shape[-1]]), - compute_uv=False)[..., 0] - normalized_sigma = linalg_ops.svd( - array_ops.reshape(normalized_weights, [-1, weights.shape[-1]]), - compute_uv=False)[..., 0] - - with self.cached_session() as sess: - sess.run(variables.global_variables_initializer()) - s0 = sess.run(unnormalized_sigma) - - for i in range(50): - sigma = sess.run(normalized_sigma) - if i < 1: - s1 = sigma - if i < 5: - s5 = sigma - if i < 10: - s10 = sigma - s50 = sigma - - self.assertAlmostEqual(1., s50, 0) - self.assertGreater(abs(s10 - 1.), abs(s50 - 1.)) - self.assertGreater(abs(s5 - 1.), abs(s10 - 1.)) - self.assertGreater(abs(s1 - 1.), abs(s5 - 1.)) - self.assertGreater(abs(s0 - 1.), abs(s1 - 1.)) - - def _testLayerHelper(self, build_layer_fn, w_shape, b_shape, is_keras=False): - x = array_ops.placeholder(dtypes.float32, shape=[2, 10, 10, 3]) - - w_initial = np.random.randn(*w_shape) * 10 - w_initializer = init_ops.constant_initializer(w_initial) - b_initial = np.random.randn(*b_shape) - b_initializer = init_ops.constant_initializer(b_initial) - - if is_keras: - context_manager = spectral_normalization.keras_spectral_normalization() - else: - getter = spectral_normalization.spectral_normalization_custom_getter() - context_manager = variable_scope.variable_scope('', custom_getter=getter) - - with context_manager: - (net, - expected_normalized_vars, expected_not_normalized_vars) = build_layer_fn( - x, w_initializer, b_initializer) - - x_data = np.random.rand(*x.shape) - - with self.cached_session() as sess: - sess.run(variables.global_variables_initializer()) - - # Before running a forward pass we still expect the variables values to - # differ from the initial value because of the normalizer. - w_befores = [] - for name, var in expected_normalized_vars.items(): - w_before = sess.run(var) - w_befores.append(w_before) - self.assertFalse( - np.allclose(w_initial, w_before), - msg=('%s appears not to be normalized. Before: %s After: %s' % - (name, w_initial, w_before))) - - # Not true for the unnormalized variables. - for name, var in expected_not_normalized_vars.items(): - b_before = sess.run(var) - self.assertTrue( - np.allclose(b_initial, b_before), - msg=('%s appears to be unexpectedly normalized. ' - 'Before: %s After: %s' % (name, b_initial, b_before))) - - # Run a bunch of forward passes. - for _ in range(1000): - _ = sess.run(net, feed_dict={x: x_data}) - - # We expect this to have improved the estimate of the spectral norm, - # which should have changed the variable values and brought them close - # to the true Spectral Normalized values. - _, s, _ = np.linalg.svd(w_initial.reshape([-1, 3])) - exactly_normalized = w_initial / s[0] - for w_before, (name, var) in zip(w_befores, - expected_normalized_vars.items()): - w_after = sess.run(var) - self.assertFalse( - np.allclose(w_before, w_after, rtol=1e-8, atol=1e-8), - msg=('%s did not improve over many iterations. 
' - 'Before: %s After: %s' % (name, w_before, w_after))) - self.assertAllClose( - exactly_normalized, - w_after, - rtol=1e-4, - atol=1e-4, - msg=('Estimate of spectral norm for %s was innacurate. ' - 'Normalized matrices do not match.' - 'Estimate: %s Actual: %s' % (name, w_after, - exactly_normalized))) - - def testConv2D_Layers(self): - - def build_layer_fn(x, w_initializer, b_initializer): - layer = layers_convolutional.Conv2D( - filters=3, - kernel_size=3, - padding='same', - kernel_initializer=w_initializer, - bias_initializer=b_initializer) - net = layer.apply(x) - expected_normalized_vars = {'tf.layers.Conv2d.kernel': layer.kernel} - expected_not_normalized_vars = {'tf.layers.Conv2d.bias': layer.bias} - - return net, expected_normalized_vars, expected_not_normalized_vars - - self._testLayerHelper(build_layer_fn, (3, 3, 3, 3), (3,)) - - def testConv2D_ContribLayers(self): - - def build_layer_fn(x, w_initializer, b_initializer): - var_collection = { - 'weights': ['CONTRIB_LAYERS_CONV2D_WEIGHTS'], - 'biases': ['CONTRIB_LAYERS_CONV2D_BIASES'] - } - net = contrib_layers.conv2d( - x, - 3, - 3, - weights_initializer=w_initializer, - biases_initializer=b_initializer, - variables_collections=var_collection) - weight_vars = ops.get_collection('CONTRIB_LAYERS_CONV2D_WEIGHTS') - self.assertEquals(1, len(weight_vars)) - bias_vars = ops.get_collection('CONTRIB_LAYERS_CONV2D_BIASES') - self.assertEquals(1, len(bias_vars)) - expected_normalized_vars = { - 'contrib.layers.conv2d.weights': weight_vars[0] - } - expected_not_normalized_vars = { - 'contrib.layers.conv2d.bias': bias_vars[0] - } - - return net, expected_normalized_vars, expected_not_normalized_vars - - self._testLayerHelper(build_layer_fn, (3, 3, 3, 3), (3,)) - - def testConv2D_Slim(self): - - def build_layer_fn(x, w_initializer, b_initializer): - var_collection = { - 'weights': ['SLIM_CONV2D_WEIGHTS'], - 'biases': ['SLIM_CONV2D_BIASES'] - } - net = slim.conv2d( - x, - 3, - 3, - weights_initializer=w_initializer, - biases_initializer=b_initializer, - variables_collections=var_collection) - weight_vars = ops.get_collection('SLIM_CONV2D_WEIGHTS') - self.assertEquals(1, len(weight_vars)) - bias_vars = ops.get_collection('SLIM_CONV2D_BIASES') - self.assertEquals(1, len(bias_vars)) - expected_normalized_vars = {'slim.conv2d.weights': weight_vars[0]} - expected_not_normalized_vars = {'slim.conv2d.bias': bias_vars[0]} - - return net, expected_normalized_vars, expected_not_normalized_vars - - self._testLayerHelper(build_layer_fn, (3, 3, 3, 3), (3,)) - - def testConv2D_Keras(self): - - def build_layer_fn(x, w_initializer, b_initializer): - layer = keras_convolutional.Conv2D( - filters=3, - kernel_size=3, - padding='same', - kernel_initializer=w_initializer, - bias_initializer=b_initializer) - net = layer.apply(x) - expected_normalized_vars = {'keras.layers.Conv2d.kernel': layer.kernel} - expected_not_normalized_vars = {'keras.layers.Conv2d.bias': layer.bias} - - return net, expected_normalized_vars, expected_not_normalized_vars - - self._testLayerHelper(build_layer_fn, (3, 3, 3, 3), (3,), is_keras=True) - - def testFC_Layers(self): - - def build_layer_fn(x, w_initializer, b_initializer): - x = layers_core.Flatten()(x) - layer = layers_core.Dense( - units=3, - kernel_initializer=w_initializer, - bias_initializer=b_initializer) - net = layer.apply(x) - expected_normalized_vars = {'tf.layers.Dense.kernel': layer.kernel} - expected_not_normalized_vars = {'tf.layers.Dense.bias': layer.bias} - - return net, expected_normalized_vars, 
expected_not_normalized_vars - - self._testLayerHelper(build_layer_fn, (300, 3), (3,)) - - def testFC_ContribLayers(self): - - def build_layer_fn(x, w_initializer, b_initializer): - var_collection = { - 'weights': ['CONTRIB_LAYERS_FC_WEIGHTS'], - 'biases': ['CONTRIB_LAYERS_FC_BIASES'] - } - x = contrib_layers.flatten(x) - net = contrib_layers.fully_connected( - x, - 3, - weights_initializer=w_initializer, - biases_initializer=b_initializer, - variables_collections=var_collection) - weight_vars = ops.get_collection('CONTRIB_LAYERS_FC_WEIGHTS') - self.assertEquals(1, len(weight_vars)) - bias_vars = ops.get_collection('CONTRIB_LAYERS_FC_BIASES') - self.assertEquals(1, len(bias_vars)) - expected_normalized_vars = { - 'contrib.layers.fully_connected.weights': weight_vars[0] - } - expected_not_normalized_vars = { - 'contrib.layers.fully_connected.bias': bias_vars[0] - } - - return net, expected_normalized_vars, expected_not_normalized_vars - - self._testLayerHelper(build_layer_fn, (300, 3), (3,)) - - def testFC_Slim(self): - - def build_layer_fn(x, w_initializer, b_initializer): - var_collection = { - 'weights': ['SLIM_FC_WEIGHTS'], - 'biases': ['SLIM_FC_BIASES'] - } - x = slim.flatten(x) - net = slim.fully_connected( - x, - 3, - weights_initializer=w_initializer, - biases_initializer=b_initializer, - variables_collections=var_collection) - weight_vars = ops.get_collection('SLIM_FC_WEIGHTS') - self.assertEquals(1, len(weight_vars)) - bias_vars = ops.get_collection('SLIM_FC_BIASES') - self.assertEquals(1, len(bias_vars)) - expected_normalized_vars = { - 'slim.fully_connected.weights': weight_vars[0] - } - expected_not_normalized_vars = {'slim.fully_connected.bias': bias_vars[0]} - - return net, expected_normalized_vars, expected_not_normalized_vars - - self._testLayerHelper(build_layer_fn, (300, 3), (3,)) - - def testFC_Keras(self): - - def build_layer_fn(x, w_initializer, b_initializer): - x = keras_core.Flatten()(x) - layer = keras_core.Dense( - units=3, - kernel_initializer=w_initializer, - bias_initializer=b_initializer) - net = layer.apply(x) - expected_normalized_vars = {'keras.layers.Dense.kernel': layer.kernel} - expected_not_normalized_vars = {'keras.layers.Dense.bias': layer.bias} - - return net, expected_normalized_vars, expected_not_normalized_vars - - self._testLayerHelper(build_layer_fn, (300, 3), (3,), is_keras=True) - - -if __name__ == '__main__': - test.main() diff --git a/tensorflow/contrib/gan/python/features/python/virtual_batchnorm.py b/tensorflow/contrib/gan/python/features/python/virtual_batchnorm.py deleted file mode 100644 index ea54ac01cee..00000000000 --- a/tensorflow/contrib/gan/python/features/python/virtual_batchnorm.py +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""Virtual batch normalization.""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from tensorflow.contrib.gan.python.features.python import virtual_batchnorm_impl -# pylint: disable=wildcard-import -from tensorflow.contrib.gan.python.features.python.virtual_batchnorm_impl import * -# pylint: enable=wildcard-import -from tensorflow.python.util.all_util import remove_undocumented - -__all__ = virtual_batchnorm_impl.__all__ -remove_undocumented(__name__, __all__) diff --git a/tensorflow/contrib/gan/python/features/python/virtual_batchnorm_impl.py b/tensorflow/contrib/gan/python/features/python/virtual_batchnorm_impl.py deleted file mode 100644 index 030ce942607..00000000000 --- a/tensorflow/contrib/gan/python/features/python/virtual_batchnorm_impl.py +++ /dev/null @@ -1,307 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Virtual batch normalization. - -This technique was first introduced in `Improved Techniques for Training GANs` -(Salimans et al, https://arxiv.org/abs/1606.03498). Instead of using batch -normalization on a minibatch, it fixes a reference subset of the data to use for -calculating normalization statistics. -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from tensorflow.python.framework import dtypes -from tensorflow.python.framework import ops -from tensorflow.python.framework import tensor_shape -from tensorflow.python.framework import tensor_util -from tensorflow.python.ops import array_ops -from tensorflow.python.ops import init_ops -from tensorflow.python.ops import math_ops -from tensorflow.python.ops import nn -from tensorflow.python.ops import variable_scope - -__all__ = [ - 'VBN', -] - - -def _static_or_dynamic_batch_size(tensor, batch_axis): - """Returns the static or dynamic batch size.""" - batch_size = array_ops.shape(tensor)[batch_axis] - static_batch_size = tensor_util.constant_value(batch_size) - return static_batch_size or batch_size - - -def _statistics(x, axes): - """Calculate the mean and mean square of `x`. - - Modified from the implementation of `tf.nn.moments`. - - Args: - x: A `Tensor`. - axes: Array of ints. Axes along which to compute mean and variance. - - Returns: - Two `Tensor` objects: `mean` and `square mean`. - """ - # The dynamic range of fp16 is too limited to support the collection of - # sufficient statistics. As a workaround we simply perform the operations - # on 32-bit floats before converting the mean and variance back to fp16 - y = math_ops.cast(x, dtypes.float32) if x.dtype == dtypes.float16 else x - - # Compute true mean while keeping the dims for proper broadcasting. 
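-  # (Subtracting a stop-gradient `shift` before averaging is the usual
-  # shifted-mean trick: mean(y) == mean(y - shift) + shift exactly, while the
-  # intermediate sums stay small, which helps floating-point precision.)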
- shift = array_ops.stop_gradient(math_ops.reduce_mean(y, axes, keepdims=True)) - - shifted_mean = math_ops.reduce_mean(y - shift, axes, keepdims=True) - mean = shifted_mean + shift - mean_squared = math_ops.reduce_mean(math_ops.square(y), axes, keepdims=True) - - mean = array_ops.squeeze(mean, axes) - mean_squared = array_ops.squeeze(mean_squared, axes) - if x.dtype == dtypes.float16: - return (math_ops.cast(mean, dtypes.float16), - math_ops.cast(mean_squared, dtypes.float16)) - else: - return (mean, mean_squared) - - -def _validate_init_input_and_get_axis(reference_batch, axis): - """Validate input and return the used axis value.""" - if reference_batch.shape.ndims is None: - raise ValueError('`reference_batch` has unknown dimensions.') - - ndims = reference_batch.shape.ndims - if axis < 0: - used_axis = ndims + axis - else: - used_axis = axis - if used_axis < 0 or used_axis >= ndims: - raise ValueError('Value of `axis` argument ' + str(used_axis) + - ' is out of range for input with rank ' + str(ndims)) - return used_axis - - -def _validate_call_input(tensor_list, batch_dim): - """Verifies that tensor shapes are compatible, except for `batch_dim`.""" - - def _get_shape(tensor): - shape = tensor.shape.as_list() - del shape[batch_dim] - return shape - - base_shape = tensor_shape.TensorShape(_get_shape(tensor_list[0])) - for tensor in tensor_list: - base_shape.assert_is_compatible_with(_get_shape(tensor)) - - -class VBN(object): - """A class to perform virtual batch normalization. - - This technique was first introduced in `Improved Techniques for Training GANs` - (Salimans et al, https://arxiv.org/abs/1606.03498). Instead of using batch - normalization on a minibatch, it fixes a reference subset of the data to use - for calculating normalization statistics. - - To do this, we calculate the reference batch mean and mean square, and modify - those statistics for each example. We use mean square instead of variance, - since it is linear. - - Note that if `center` or `scale` variables are created, they are shared - between all calls to this object. - - The `__init__` API is intended to mimic - `tf.compat.v1.layers.batch_normalization` as - closely as possible. - """ - - def __init__(self, - reference_batch, - axis=-1, - epsilon=1e-3, - center=True, - scale=True, - beta_initializer=init_ops.zeros_initializer(), - gamma_initializer=init_ops.ones_initializer(), - beta_regularizer=None, - gamma_regularizer=None, - trainable=True, - name=None, - batch_axis=0): - """Initialize virtual batch normalization object. - - We precompute the 'mean' and 'mean squared' of the reference batch, so that - `__call__` is efficient. This means that the axis must be supplied when the - object is created, not when it is called. - - We precompute 'square mean' instead of 'variance', because the square mean - can be easily adjusted on a per-example basis. - - Args: - reference_batch: A minibatch tensors. This will form the reference data - from which the normalization statistics are calculated. See - https://arxiv.org/abs/1606.03498 for more details. - axis: Integer, the axis that should be normalized (typically the features - axis). For instance, after a `Convolution2D` layer with - `data_format="channels_first"`, set `axis=1` in `BatchNormalization`. - epsilon: Small float added to variance to avoid dividing by zero. - center: If True, add offset of `beta` to normalized tensor. If False, - `beta` is ignored. - scale: If True, multiply by `gamma`. If False, `gamma` is not used. 
When - the next layer is linear (also e.g. `nn.relu`), this can be disabled - since the scaling can be done by the next layer. - beta_initializer: Initializer for the beta weight. - gamma_initializer: Initializer for the gamma weight. - beta_regularizer: Optional regularizer for the beta weight. - gamma_regularizer: Optional regularizer for the gamma weight. - trainable: Boolean, if `True` also add variables to the graph collection - `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable). - name: String, the name of the ops. - batch_axis: The axis of the batch dimension. This dimension is treated - differently in `virtual batch normalization` vs `batch normalization`. - - Raises: - ValueError: If `reference_batch` has unknown dimensions at graph - construction. - ValueError: If `batch_axis` is the same as `axis`. - """ - axis = _validate_init_input_and_get_axis(reference_batch, axis) - self._epsilon = epsilon - self._beta = 0 - self._gamma = 1 - self._batch_axis = _validate_init_input_and_get_axis( - reference_batch, batch_axis) - - if axis == self._batch_axis: - raise ValueError('`axis` and `batch_axis` cannot be the same.') - - with variable_scope.variable_scope( - name, 'VBN', values=[reference_batch]) as self._vs: - self._reference_batch = reference_batch - - # Calculate important shapes: - # 1) Reduction axes for the reference batch - # 2) Broadcast shape, if necessary - # 3) Reduction axes for the virtual batchnormed batch - # 4) Shape for optional parameters - input_shape = self._reference_batch.shape - ndims = input_shape.ndims - reduction_axes = list(range(ndims)) - del reduction_axes[axis] - - self._broadcast_shape = [1] * len(input_shape) - self._broadcast_shape[axis] = input_shape.dims[axis] - - self._example_reduction_axes = list(range(ndims)) - del self._example_reduction_axes[max(axis, self._batch_axis)] - del self._example_reduction_axes[min(axis, self._batch_axis)] - - params_shape = self._reference_batch.shape[axis] - - # Determines whether broadcasting is needed. This is slightly different - # than in the `nn.batch_normalization` case, due to `batch_dim`. - self._needs_broadcasting = ( - sorted(self._example_reduction_axes) != list(range(ndims))[:-2]) - - # Calculate the sufficient statistics for the reference batch in a way - # that can be easily modified by additional examples. - self._ref_mean, self._ref_mean_squares = _statistics( - self._reference_batch, reduction_axes) - self._ref_variance = ( - self._ref_mean_squares - math_ops.square(self._ref_mean)) - - # Virtual batch normalization uses a weighted average between example - # statistics and the reference batch statistics. - ref_batch_size = _static_or_dynamic_batch_size(self._reference_batch, - self._batch_axis) - self._example_weight = 1. / ( - math_ops.cast(ref_batch_size, dtypes.float32) + 1.) - self._ref_weight = 1. - self._example_weight - - # Make the variables, if necessary. 
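-      # (When created, `beta` and `gamma` live in this variable scope and are
-      # reused by every subsequent `__call__`, which is what makes the
-      # center/scale parameters shared across all calls to this object.)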
- if center: - self._beta = variable_scope.get_variable( - name='beta', - shape=(params_shape,), - initializer=beta_initializer, - regularizer=beta_regularizer, - trainable=trainable) - if scale: - self._gamma = variable_scope.get_variable( - name='gamma', - shape=(params_shape,), - initializer=gamma_initializer, - regularizer=gamma_regularizer, - trainable=trainable) - - def _virtual_statistics(self, inputs, reduction_axes): - """Compute the statistics needed for virtual batch normalization.""" - cur_mean, cur_mean_sq = _statistics(inputs, reduction_axes) - vb_mean = ( - self._example_weight * cur_mean + self._ref_weight * self._ref_mean) - vb_mean_sq = ( - self._example_weight * cur_mean_sq + - self._ref_weight * self._ref_mean_squares) - return (vb_mean, vb_mean_sq) - - def _broadcast(self, v, broadcast_shape=None): - # The exact broadcast shape depends on the current batch, not the reference - # batch, unless we're calculating the batch normalization of the reference - # batch. - b_shape = broadcast_shape or self._broadcast_shape - if self._needs_broadcasting and v is not None: - return array_ops.reshape(v, b_shape) - return v - - def reference_batch_normalization(self): - """Return the reference batch, but batch normalized.""" - with ops.name_scope(self._vs.name): - return nn.batch_normalization(self._reference_batch, - self._broadcast(self._ref_mean), - self._broadcast(self._ref_variance), - self._broadcast(self._beta), - self._broadcast(self._gamma), self._epsilon) - - def __call__(self, inputs): - """Run virtual batch normalization on inputs. - - Args: - inputs: Tensor input. - - Returns: - A virtual batch normalized version of `inputs`. - - Raises: - ValueError: If `inputs` shape isn't compatible with the reference batch. - """ - _validate_call_input([inputs, self._reference_batch], self._batch_axis) - - with ops.name_scope(self._vs.name, values=[inputs, self._reference_batch]): - # Calculate the statistics on the current input on a per-example basis. - vb_mean, vb_mean_sq = self._virtual_statistics( - inputs, self._example_reduction_axes) - vb_variance = vb_mean_sq - math_ops.square(vb_mean) - - # The exact broadcast shape of the input statistic Tensors depends on the - # current batch, not the reference batch. The parameter broadcast shape - # is independent of the shape of the input statistic Tensor dimensions. - b_shape = self._broadcast_shape[:] # deep copy - b_shape[self._batch_axis] = _static_or_dynamic_batch_size( - inputs, self._batch_axis) - return nn.batch_normalization( - inputs, self._broadcast(vb_mean, b_shape), - self._broadcast(vb_variance, b_shape), - self._broadcast(self._beta, self._broadcast_shape), - self._broadcast(self._gamma, self._broadcast_shape), self._epsilon) diff --git a/tensorflow/contrib/gan/python/features/python/virtual_batchnorm_test.py b/tensorflow/contrib/gan/python/features/python/virtual_batchnorm_test.py deleted file mode 100644 index 9848f654bad..00000000000 --- a/tensorflow/contrib/gan/python/features/python/virtual_batchnorm_test.py +++ /dev/null @@ -1,267 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Tests for tfgan.python.features.virtual_batchnorm.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np - -from tensorflow.contrib.framework.python.ops import variables as contrib_variables_lib -from tensorflow.contrib.gan.python.features.python import virtual_batchnorm_impl as virtual_batchnorm -from tensorflow.python.framework import constant_op -from tensorflow.python.framework import dtypes -from tensorflow.python.framework import random_seed -from tensorflow.python.layers import normalization -from tensorflow.python.ops import array_ops -from tensorflow.python.ops import math_ops -from tensorflow.python.ops import nn -from tensorflow.python.ops import random_ops -from tensorflow.python.ops import variable_scope -from tensorflow.python.ops import variables as variables_lib -from tensorflow.python.platform import test - - -class VirtualBatchnormTest(test.TestCase): - - def test_syntax(self): - reference_batch = array_ops.zeros([5, 3, 16, 9, 15]) - vbn = virtual_batchnorm.VBN(reference_batch, batch_axis=1) - vbn(array_ops.ones([5, 7, 16, 9, 15])) - - def test_no_broadcast_needed(self): - """When `axis` and `batch_axis` are at the end, no broadcast is needed.""" - reference_batch = array_ops.zeros([5, 3, 16, 9, 15]) - minibatch = array_ops.zeros([5, 3, 16, 3, 15]) - vbn = virtual_batchnorm.VBN(reference_batch, axis=-1, batch_axis=-2) - vbn(minibatch) - - def test_statistics(self): - """Check that `_statistics` gives the same result as `nn.moments`.""" - random_seed.set_random_seed(1234) - - tensors = random_ops.random_normal([4, 5, 7, 3]) - for axes in [(3), (0, 2), (1, 2, 3)]: - vb_mean, mean_sq = virtual_batchnorm._statistics(tensors, axes) - mom_mean, mom_var = nn.moments(tensors, axes) - vb_var = mean_sq - math_ops.square(vb_mean) - - with self.cached_session(use_gpu=True) as sess: - vb_mean_np, vb_var_np, mom_mean_np, mom_var_np = sess.run([ - vb_mean, vb_var, mom_mean, mom_var]) - - self.assertAllClose(mom_mean_np, vb_mean_np) - self.assertAllClose(mom_var_np, vb_var_np) - - def test_virtual_statistics(self): - """Check that `_virtual_statistics` gives same result as `nn.moments`.""" - random_seed.set_random_seed(1234) - - batch_axis = 0 - partial_batch = random_ops.random_normal([4, 5, 7, 3]) - single_example = random_ops.random_normal([1, 5, 7, 3]) - full_batch = array_ops.concat([partial_batch, single_example], axis=0) - - for reduction_axis in range(1, 4): - # Get `nn.moments` on the full batch. - reduction_axes = list(range(4)) - del reduction_axes[reduction_axis] - mom_mean, mom_variance = nn.moments(full_batch, reduction_axes) - - # Get virtual batch statistics. 
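-      # (With a single extra example, example_weight is 1 / (ref_batch + 1),
-      # so the weighted combination of example and reference statistics is
-      # exactly the full-batch mean / mean-square; that is why nn.moments on
-      # the concatenated batch serves as the reference value here.)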
- vb_reduction_axes = list(range(4)) - del vb_reduction_axes[reduction_axis] - del vb_reduction_axes[batch_axis] - vbn = virtual_batchnorm.VBN(partial_batch, reduction_axis) - vb_mean, mean_sq = vbn._virtual_statistics( - single_example, vb_reduction_axes) - vb_variance = mean_sq - math_ops.square(vb_mean) - # Remove singleton batch dim for easy comparisons. - vb_mean = array_ops.squeeze(vb_mean, batch_axis) - vb_variance = array_ops.squeeze(vb_variance, batch_axis) - - with self.cached_session(use_gpu=True) as sess: - vb_mean_np, vb_var_np, mom_mean_np, mom_var_np = sess.run([ - vb_mean, vb_variance, mom_mean, mom_variance]) - - self.assertAllClose(mom_mean_np, vb_mean_np) - self.assertAllClose(mom_var_np, vb_var_np) - - def test_reference_batch_normalization(self): - """Check that batch norm from VBN agrees with opensource implementation.""" - random_seed.set_random_seed(1234) - - batch = random_ops.random_normal([6, 5, 7, 3, 3]) - - for axis in range(5): - # Get `layers` batchnorm result. - bn_normalized = normalization.batch_normalization( - batch, axis, training=True) - - # Get VBN's batch normalization on reference batch. - batch_axis = 0 if axis != 0 else 1 # axis and batch_axis can't same - vbn = virtual_batchnorm.VBN(batch, axis, batch_axis=batch_axis) - vbn_normalized = vbn.reference_batch_normalization() - - with self.cached_session(use_gpu=True) as sess: - variables_lib.global_variables_initializer().run() - - bn_normalized_np, vbn_normalized_np = sess.run( - [bn_normalized, vbn_normalized]) - self.assertAllClose(bn_normalized_np, vbn_normalized_np) - - def test_same_as_batchnorm(self): - """Check that batch norm on set X is the same as ref of X / y on `y`.""" - random_seed.set_random_seed(1234) - - num_examples = 4 - examples = [random_ops.random_normal([5, 7, 3]) for _ in - range(num_examples)] - - # Get the result of the opensource batch normalization. - batch_normalized = normalization.batch_normalization( - array_ops.stack(examples), training=True) - - for i in range(num_examples): - examples_except_i = array_ops.stack(examples[:i] + examples[i+1:]) - # Get the result of VBN's batch normalization. - vbn = virtual_batchnorm.VBN(examples_except_i) - vb_normed = array_ops.squeeze( - vbn(array_ops.expand_dims(examples[i], [0])), [0]) - - with self.cached_session(use_gpu=True) as sess: - variables_lib.global_variables_initializer().run() - bn_np, vb_np = sess.run([batch_normalized, vb_normed]) - self.assertAllClose(bn_np[i, ...], vb_np) - - def test_minibatch_independent(self): - """Test that virtual batch normalized examples are independent. - - Unlike batch normalization, virtual batch normalization has the property - that the virtual batch normalized value of an example is independent of the - other examples in the minibatch. In this test, we verify this property. - """ - random_seed.set_random_seed(1234) - - # These can be random, but must be the same for all session calls. - reference_batch = constant_op.constant( - np.random.normal(size=[4, 7, 3]), dtype=dtypes.float32) - fixed_example = constant_op.constant(np.random.normal(size=[7, 3]), - dtype=dtypes.float32) - - # Get the VBN object and the virtual batch normalized value for - # `fixed_example`. 
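-    # (Independence is expected because each example is normalized with
-    # statistics computed from the reference batch plus that example alone;
-    # the other minibatch members never enter the calculation.)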
- vbn = virtual_batchnorm.VBN(reference_batch) - vbn_fixed_example = array_ops.squeeze( - vbn(array_ops.expand_dims(fixed_example, 0)), 0) - with self.session(use_gpu=True): - variables_lib.global_variables_initializer().run() - vbn_fixed_example_np = vbn_fixed_example.eval() - - # Check that the value is the same for different minibatches, and different - # sized minibatches. - for minibatch_size in range(1, 6): - examples = [random_ops.random_normal([7, 3]) for _ in - range(minibatch_size)] - - minibatch = array_ops.stack([fixed_example] + examples) - vbn_minibatch = vbn(minibatch) - cur_vbn_fixed_example = vbn_minibatch[0, ...] - with self.cached_session(use_gpu=True): - variables_lib.global_variables_initializer().run() - cur_vbn_fixed_example_np = cur_vbn_fixed_example.eval() - self.assertAllClose(vbn_fixed_example_np, cur_vbn_fixed_example_np) - - def test_variable_reuse(self): - """Test that variable scopes work and inference on a real-ish case.""" - tensor1_ref = array_ops.zeros([6, 5, 7, 3, 3]) - tensor1_examples = array_ops.zeros([4, 5, 7, 3, 3]) - tensor2_ref = array_ops.zeros([4, 2, 3]) - tensor2_examples = array_ops.zeros([2, 2, 3]) - - with variable_scope.variable_scope('dummy_scope', reuse=True): - with self.assertRaisesRegexp( - ValueError, 'does not exist, or was not created with ' - 'tf.get_variable()'): - virtual_batchnorm.VBN(tensor1_ref) - - vbn1 = virtual_batchnorm.VBN(tensor1_ref, name='vbn1') - vbn2 = virtual_batchnorm.VBN(tensor2_ref, name='vbn2') - - # Fetch reference and examples after virtual batch normalization. Also - # fetch in variable reuse case. - to_fetch = [] - - to_fetch.append(vbn1.reference_batch_normalization()) - to_fetch.append(vbn2.reference_batch_normalization()) - to_fetch.append(vbn1(tensor1_examples)) - to_fetch.append(vbn2(tensor2_examples)) - - variable_scope.get_variable_scope().reuse_variables() - - to_fetch.append(vbn1.reference_batch_normalization()) - to_fetch.append(vbn2.reference_batch_normalization()) - to_fetch.append(vbn1(tensor1_examples)) - to_fetch.append(vbn2(tensor2_examples)) - - self.assertEqual(4, len(contrib_variables_lib.get_variables())) - - with self.session(use_gpu=True) as sess: - variables_lib.global_variables_initializer().run() - sess.run(to_fetch) - - def test_invalid_input(self): - # Reference batch has unknown dimensions. - with self.assertRaisesRegexp( - ValueError, '`reference_batch` has unknown dimensions.'): - virtual_batchnorm.VBN(array_ops.placeholder(dtypes.float32), name='vbn1') - - # Axis too negative. - with self.assertRaisesRegexp( - ValueError, 'Value of `axis` argument .* is out of range'): - virtual_batchnorm.VBN(array_ops.zeros([1, 2]), axis=-3, name='vbn2') - - # Axis too large. - with self.assertRaisesRegexp( - ValueError, 'Value of `axis` argument .* is out of range'): - virtual_batchnorm.VBN(array_ops.zeros([1, 2]), axis=2, name='vbn3') - - # Batch axis too negative. - with self.assertRaisesRegexp( - ValueError, 'Value of `axis` argument .* is out of range'): - virtual_batchnorm.VBN(array_ops.zeros([1, 2]), name='vbn4', batch_axis=-3) - - # Batch axis too large. - with self.assertRaisesRegexp( - ValueError, 'Value of `axis` argument .* is out of range'): - virtual_batchnorm.VBN(array_ops.zeros([1, 2]), name='vbn5', batch_axis=2) - - # Axis and batch axis are the same. 
- with self.assertRaisesRegexp( - ValueError, '`axis` and `batch_axis` cannot be the same.'): - virtual_batchnorm.VBN(array_ops.zeros( - [1, 2]), axis=1, name='vbn6', batch_axis=1) - - # Reference Tensor and example Tensor have incompatible shapes. - tensor_ref = array_ops.zeros([5, 2, 3]) - tensor_examples = array_ops.zeros([3, 2, 3]) - vbn = virtual_batchnorm.VBN(tensor_ref, name='vbn7', batch_axis=1) - with self.assertRaisesRegexp(ValueError, 'Shapes .* are incompatible'): - vbn(tensor_examples) - - -if __name__ == '__main__': - test.main() diff --git a/tensorflow/contrib/gan/python/losses/__init__.py b/tensorflow/contrib/gan/python/losses/__init__.py deleted file mode 100644 index d9bf8ebfdf6..00000000000 --- a/tensorflow/contrib/gan/python/losses/__init__.py +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright 2017 Google Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""TFGAN losses and penalties. - -Losses can be used with individual arguments or with GANModel tuples. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -# Collapse losses into a single namespace. -from tensorflow.contrib.gan.python.losses.python import losses_wargs as wargs -from tensorflow.contrib.gan.python.losses.python import tuple_losses - -# pylint: disable=wildcard-import -from tensorflow.contrib.gan.python.losses.python.tuple_losses import * -# pylint: enable=wildcard-import - -from tensorflow.python.util.all_util import remove_undocumented - -_allowed_symbols = ['wargs'] + tuple_losses.__all__ -remove_undocumented(__name__, _allowed_symbols) diff --git a/tensorflow/contrib/gan/python/losses/python/losses_impl.py b/tensorflow/contrib/gan/python/losses/python/losses_impl.py deleted file mode 100644 index 99bdf5b20d3..00000000000 --- a/tensorflow/contrib/gan/python/losses/python/losses_impl.py +++ /dev/null @@ -1,1030 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Losses that are useful for training GANs. 
- -The losses belong to two main groups, but there are others that do not: -1) xxxxx_generator_loss -2) xxxxx_discriminator_loss - -Example: -1) wasserstein_generator_loss -2) wasserstein_discriminator_loss - -Other example: -wasserstein_gradient_penalty - -All losses must be able to accept 1D or 2D Tensors, so as to be compatible with -patchGAN style losses (https://arxiv.org/abs/1611.07004). - -To make these losses usable in the TF-GAN framework, please create a tuple -version of the losses with `losses_utils.py`. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from tensorflow.contrib.framework.python.ops import variables as contrib_variables_lib -from tensorflow.python.framework import dtypes -from tensorflow.python.framework import ops -from tensorflow.python.framework import tensor_util -from tensorflow.python.ops import array_ops -from tensorflow.python.ops import clip_ops -from tensorflow.python.ops import gradients_impl -from tensorflow.python.ops import math_ops -from tensorflow.python.ops import random_ops -from tensorflow.python.ops import variable_scope -from tensorflow.python.ops.losses import losses -from tensorflow.python.ops.losses import util -from tensorflow.python.summary import summary - -__all__ = [ - 'acgan_discriminator_loss', - 'acgan_generator_loss', - 'least_squares_discriminator_loss', - 'least_squares_generator_loss', - 'modified_discriminator_loss', - 'modified_generator_loss', - 'minimax_discriminator_loss', - 'minimax_generator_loss', - 'wasserstein_discriminator_loss', - 'wasserstein_generator_loss', - 'wasserstein_gradient_penalty', - 'mutual_information_penalty', - 'combine_adversarial_loss', - 'cycle_consistency_loss', -] - - -def _to_float(tensor): - return math_ops.cast(tensor, dtypes.float32) - - -# Wasserstein losses from `Wasserstein GAN` (https://arxiv.org/abs/1701.07875). -def wasserstein_generator_loss( - discriminator_gen_outputs, - weights=1.0, - scope=None, - loss_collection=ops.GraphKeys.LOSSES, - reduction=losses.Reduction.SUM_BY_NONZERO_WEIGHTS, - add_summaries=False): - """Wasserstein generator loss for GANs. - - See `Wasserstein GAN` (https://arxiv.org/abs/1701.07875) for more details. - - Args: - discriminator_gen_outputs: Discriminator output on generated data. Expected - to be in the range of (-inf, inf). - weights: Optional `Tensor` whose rank is either 0, or the same rank as - `discriminator_gen_outputs`, and must be broadcastable to - `discriminator_gen_outputs` (i.e., all dimensions must be either `1`, or - the same as the corresponding dimension). - scope: The scope for the operations performed in computing the loss. - loss_collection: collection to which this loss will be added. - reduction: A `tf.compat.v1.losses.Reduction` to apply to loss. - add_summaries: Whether or not to add detailed summaries for the loss. - - Returns: - A loss Tensor. The shape depends on `reduction`. 
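-
-  A minimal illustrative call (tensor values are made up):
-
-    d_on_fake = tf.constant([2.0, -1.0])
-    loss = wasserstein_generator_loss(d_on_fake)
-    # With the default reduction this is -mean(d_on_fake) == -0.5.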
- """ - with ops.name_scope(scope, 'generator_wasserstein_loss', - (discriminator_gen_outputs, weights)) as scope: - discriminator_gen_outputs = _to_float(discriminator_gen_outputs) - - loss = -discriminator_gen_outputs - loss = losses.compute_weighted_loss(loss, weights, scope, loss_collection, - reduction) - - if add_summaries: - summary.scalar('generator_wass_loss', loss) - - return loss - - -def wasserstein_discriminator_loss( - discriminator_real_outputs, - discriminator_gen_outputs, - real_weights=1.0, - generated_weights=1.0, - scope=None, - loss_collection=ops.GraphKeys.LOSSES, - reduction=losses.Reduction.SUM_BY_NONZERO_WEIGHTS, - add_summaries=False): - """Wasserstein discriminator loss for GANs. - - See `Wasserstein GAN` (https://arxiv.org/abs/1701.07875) for more details. - - Args: - discriminator_real_outputs: Discriminator output on real data. - discriminator_gen_outputs: Discriminator output on generated data. Expected - to be in the range of (-inf, inf). - real_weights: Optional `Tensor` whose rank is either 0, or the same rank as - `discriminator_real_outputs`, and must be broadcastable to - `discriminator_real_outputs` (i.e., all dimensions must be either `1`, or - the same as the corresponding dimension). - generated_weights: Same as `real_weights`, but for - `discriminator_gen_outputs`. - scope: The scope for the operations performed in computing the loss. - loss_collection: collection to which this loss will be added. - reduction: A `tf.compat.v1.losses.Reduction` to apply to loss. - add_summaries: Whether or not to add summaries for the loss. - - Returns: - A loss Tensor. The shape depends on `reduction`. - """ - with ops.name_scope(scope, 'discriminator_wasserstein_loss', - (discriminator_real_outputs, discriminator_gen_outputs, - real_weights, generated_weights)) as scope: - discriminator_real_outputs = _to_float(discriminator_real_outputs) - discriminator_gen_outputs = _to_float(discriminator_gen_outputs) - discriminator_real_outputs.shape.assert_is_compatible_with( - discriminator_gen_outputs.shape) - - loss_on_generated = losses.compute_weighted_loss( - discriminator_gen_outputs, - generated_weights, - scope, - loss_collection=None, - reduction=reduction) - loss_on_real = losses.compute_weighted_loss( - discriminator_real_outputs, - real_weights, - scope, - loss_collection=None, - reduction=reduction) - loss = loss_on_generated - loss_on_real - util.add_loss(loss, loss_collection) - - if add_summaries: - summary.scalar('discriminator_gen_wass_loss', loss_on_generated) - summary.scalar('discriminator_real_wass_loss', loss_on_real) - summary.scalar('discriminator_wass_loss', loss) - - return loss - - -# ACGAN losses from `Conditional Image Synthesis With Auxiliary Classifier GANs` -# (https://arxiv.org/abs/1610.09585). -def acgan_discriminator_loss(discriminator_real_classification_logits, - discriminator_gen_classification_logits, - one_hot_labels, - label_smoothing=0.0, - real_weights=1.0, - generated_weights=1.0, - scope=None, - loss_collection=ops.GraphKeys.LOSSES, - reduction=losses.Reduction.SUM_BY_NONZERO_WEIGHTS, - add_summaries=False): - """ACGAN loss for the discriminator. - - The ACGAN loss adds a classification loss to the conditional discriminator. - Therefore, the discriminator must output a tuple consisting of - (1) the real/fake prediction and - (2) the logits for the classification (usually the last conv layer, - flattened). 
- - For more details: - ACGAN: https://arxiv.org/abs/1610.09585 - - Args: - discriminator_real_classification_logits: Classification logits for real - data. - discriminator_gen_classification_logits: Classification logits for generated - data. - one_hot_labels: A Tensor holding one-hot labels for the batch. - label_smoothing: A float in [0, 1]. If greater than 0, smooth the labels for - "discriminator on real data" as suggested in - https://arxiv.org/pdf/1701.00160 - real_weights: Optional `Tensor` whose rank is either 0, or the same rank as - `discriminator_real_outputs`, and must be broadcastable to - `discriminator_real_outputs` (i.e., all dimensions must be either `1`, or - the same as the corresponding dimension). - generated_weights: Same as `real_weights`, but for - `discriminator_gen_classification_logits`. - scope: The scope for the operations performed in computing the loss. - loss_collection: collection to which this loss will be added. - reduction: A `tf.compat.v1.losses.Reduction` to apply to loss. - add_summaries: Whether or not to add summaries for the loss. - - Returns: - A loss Tensor. Shape depends on `reduction`. - - Raises: - TypeError: If the discriminator does not output a tuple. - """ - with ops.name_scope( - scope, 'acgan_discriminator_loss', - (discriminator_real_classification_logits, - discriminator_gen_classification_logits, one_hot_labels)) as scope: - loss_on_generated = losses.softmax_cross_entropy( - one_hot_labels, - discriminator_gen_classification_logits, - weights=generated_weights, - scope=scope, - loss_collection=None, - reduction=reduction) - loss_on_real = losses.softmax_cross_entropy( - one_hot_labels, - discriminator_real_classification_logits, - weights=real_weights, - label_smoothing=label_smoothing, - scope=scope, - loss_collection=None, - reduction=reduction) - loss = loss_on_generated + loss_on_real - util.add_loss(loss, loss_collection) - - if add_summaries: - summary.scalar('discriminator_gen_ac_loss', loss_on_generated) - summary.scalar('discriminator_real_ac_loss', loss_on_real) - summary.scalar('discriminator_ac_loss', loss) - - return loss - - -def acgan_generator_loss(discriminator_gen_classification_logits, - one_hot_labels, - weights=1.0, - scope=None, - loss_collection=ops.GraphKeys.LOSSES, - reduction=losses.Reduction.SUM_BY_NONZERO_WEIGHTS, - add_summaries=False): - """ACGAN loss for the generator. - - The ACGAN loss adds a classification loss to the conditional discriminator. - Therefore, the discriminator must output a tuple consisting of - (1) the real/fake prediction and - (2) the logits for the classification (usually the last conv layer, - flattened). - - For more details: - ACGAN: https://arxiv.org/abs/1610.09585 - - Args: - discriminator_gen_classification_logits: Classification logits for generated - data. - one_hot_labels: A Tensor holding one-hot labels for the batch. - weights: Optional `Tensor` whose rank is either 0, or the same rank as - `discriminator_gen_classification_logits`, and must be broadcastable to - `discriminator_gen_classification_logits` (i.e., all dimensions must be - either `1`, or the same as the corresponding dimension). - scope: The scope for the operations performed in computing the loss. - loss_collection: collection to which this loss will be added. - reduction: A `tf.compat.v1.losses.Reduction` to apply to loss. - add_summaries: Whether or not to add summaries for the loss. - - Returns: - A loss Tensor. Shape depends on `reduction`. 
- - Raises: - ValueError: if arg module not either `generator` or `discriminator` - TypeError: if the discriminator does not output a tuple. - """ - with ops.name_scope( - scope, 'acgan_generator_loss', - (discriminator_gen_classification_logits, one_hot_labels)) as scope: - loss = losses.softmax_cross_entropy( - one_hot_labels, - discriminator_gen_classification_logits, - weights=weights, - scope=scope, - loss_collection=loss_collection, - reduction=reduction) - - if add_summaries: - summary.scalar('generator_ac_loss', loss) - - return loss - - -# Wasserstein Gradient Penalty losses from `Improved Training of Wasserstein -# GANs` (https://arxiv.org/abs/1704.00028). - - -def wasserstein_gradient_penalty( - real_data, - generated_data, - generator_inputs, - discriminator_fn, - discriminator_scope, - epsilon=1e-10, - target=1.0, - one_sided=False, - weights=1.0, - scope=None, - loss_collection=ops.GraphKeys.LOSSES, - reduction=losses.Reduction.SUM_BY_NONZERO_WEIGHTS, - add_summaries=False): - """The gradient penalty for the Wasserstein discriminator loss. - - See `Improved Training of Wasserstein GANs` - (https://arxiv.org/abs/1704.00028) for more details. - - Args: - real_data: Real data. - generated_data: Output of the generator. - generator_inputs: Exact argument to pass to the generator, which is used as - optional conditioning to the discriminator. - discriminator_fn: A discriminator function that conforms to TF-GAN API. - discriminator_scope: If not `None`, reuse discriminators from this scope. - epsilon: A small positive number added for numerical stability when - computing the gradient norm. - target: Optional Python number or `Tensor` indicating the target value of - gradient norm. Defaults to 1.0. - one_sided: If `True`, penalty proposed in https://arxiv.org/abs/1709.08894 - is used. Defaults to `False`. - weights: Optional `Tensor` whose rank is either 0, or the same rank as - `real_data` and `generated_data`, and must be broadcastable to them (i.e., - all dimensions must be either `1`, or the same as the corresponding - dimension). - scope: The scope for the operations performed in computing the loss. - loss_collection: collection to which this loss will be added. - reduction: A `tf.compat.v1.losses.Reduction` to apply to loss. - add_summaries: Whether or not to add summaries for the loss. - - Returns: - A loss Tensor. The shape depends on `reduction`. - - Raises: - ValueError: If the rank of data Tensors is unknown. - """ - with ops.name_scope(scope, 'wasserstein_gradient_penalty', - (real_data, generated_data)) as scope: - real_data = ops.convert_to_tensor(real_data) - generated_data = ops.convert_to_tensor(generated_data) - if real_data.shape.ndims is None: - raise ValueError('`real_data` can\'t have unknown rank.') - if generated_data.shape.ndims is None: - raise ValueError('`generated_data` can\'t have unknown rank.') - - differences = generated_data - real_data - batch_size = differences.shape.dims[0].value or array_ops.shape( - differences)[0] - alpha_shape = [batch_size] + [1] * (differences.shape.ndims - 1) - alpha = random_ops.random_uniform(shape=alpha_shape) - interpolates = real_data + (alpha * differences) - - with ops.name_scope(None): # Clear scope so update ops are added properly. - # Reuse variables if variables already exists. 
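-      # (The code below follows the WGAN-GP recipe: interpolate between real
-      # and generated points, differentiate the discriminator output with
-      # respect to the interpolates, and penalize the squared deviation of
-      # the gradient's L2 norm from `target`.)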
- with variable_scope.variable_scope( - discriminator_scope, - 'gpenalty_dscope', - reuse=variable_scope.AUTO_REUSE): - disc_interpolates = discriminator_fn(interpolates, generator_inputs) - - if isinstance(disc_interpolates, tuple): - # ACGAN case: disc outputs more than one tensor - disc_interpolates = disc_interpolates[0] - - gradients = gradients_impl.gradients(disc_interpolates, interpolates)[0] - gradient_squares = math_ops.reduce_sum( - math_ops.square(gradients), axis=list(range(1, gradients.shape.ndims))) - # Propagate shape information, if possible. - if isinstance(batch_size, int): - gradient_squares.set_shape([batch_size] + - gradient_squares.shape.as_list()[1:]) - # For numerical stability, add epsilon to the sum before taking the square - # root. Note tf.norm does not add epsilon. - slopes = math_ops.sqrt(gradient_squares + epsilon) - penalties = slopes / target - 1.0 - if one_sided: - penalties = math_ops.maximum(0., penalties) - penalties_squared = math_ops.square(penalties) - penalty = losses.compute_weighted_loss( - penalties_squared, - weights, - scope=scope, - loss_collection=loss_collection, - reduction=reduction) - - if add_summaries: - summary.scalar('gradient_penalty_loss', penalty) - - return penalty - - -# Original losses from `Generative Adversarial Nets` -# (https://arxiv.org/abs/1406.2661). - - -def minimax_discriminator_loss( - discriminator_real_outputs, - discriminator_gen_outputs, - label_smoothing=0.25, - real_weights=1.0, - generated_weights=1.0, - scope=None, - loss_collection=ops.GraphKeys.LOSSES, - reduction=losses.Reduction.SUM_BY_NONZERO_WEIGHTS, - add_summaries=False): - """Original minimax discriminator loss for GANs, with label smoothing. - - Note that the authors don't recommend using this loss. A more practically - useful loss is `modified_discriminator_loss`. - - L = - real_weights * log(sigmoid(D(x))) - - generated_weights * log(1 - sigmoid(D(G(z)))) - - See `Generative Adversarial Nets` (https://arxiv.org/abs/1406.2661) for more - details. - - Args: - discriminator_real_outputs: Discriminator output on real data. - discriminator_gen_outputs: Discriminator output on generated data. Expected - to be in the range of (-inf, inf). - label_smoothing: The amount of smoothing for positive labels. This technique - is taken from `Improved Techniques for Training GANs` - (https://arxiv.org/abs/1606.03498). `0.0` means no smoothing. - real_weights: Optional `Tensor` whose rank is either 0, or the same rank as - `real_data`, and must be broadcastable to `real_data` (i.e., all - dimensions must be either `1`, or the same as the corresponding - dimension). - generated_weights: Same as `real_weights`, but for `generated_data`. - scope: The scope for the operations performed in computing the loss. - loss_collection: collection to which this loss will be added. - reduction: A `tf.compat.v1.losses.Reduction` to apply to loss. - add_summaries: Whether or not to add summaries for the loss. - - Returns: - A loss Tensor. The shape depends on `reduction`. 
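-
-  With the default `label_smoothing=0.25`, the effective target for real data
-  is 0.875 rather than 1.0, since `tf.compat.v1.losses.sigmoid_cross_entropy`
-  smooths labels as `labels * (1 - label_smoothing) + 0.5 * label_smoothing`.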
- """ - with ops.name_scope( - scope, 'discriminator_minimax_loss', - (discriminator_real_outputs, discriminator_gen_outputs, real_weights, - generated_weights, label_smoothing)) as scope: - - # -log((1 - label_smoothing) - sigmoid(D(x))) - loss_on_real = losses.sigmoid_cross_entropy( - array_ops.ones_like(discriminator_real_outputs), - discriminator_real_outputs, - real_weights, - label_smoothing, - scope, - loss_collection=None, - reduction=reduction) - # -log(- sigmoid(D(G(x)))) - loss_on_generated = losses.sigmoid_cross_entropy( - array_ops.zeros_like(discriminator_gen_outputs), - discriminator_gen_outputs, - generated_weights, - scope=scope, - loss_collection=None, - reduction=reduction) - - loss = loss_on_real + loss_on_generated - util.add_loss(loss, loss_collection) - - if add_summaries: - summary.scalar('discriminator_gen_minimax_loss', loss_on_generated) - summary.scalar('discriminator_real_minimax_loss', loss_on_real) - summary.scalar('discriminator_minimax_loss', loss) - - return loss - - -def minimax_generator_loss(discriminator_gen_outputs, - label_smoothing=0.0, - weights=1.0, - scope=None, - loss_collection=ops.GraphKeys.LOSSES, - reduction=losses.Reduction.SUM_BY_NONZERO_WEIGHTS, - add_summaries=False): - """Original minimax generator loss for GANs. - - Note that the authors don't recommend using this loss. A more practically - useful loss is `modified_generator_loss`. - - L = log(sigmoid(D(x))) + log(1 - sigmoid(D(G(z)))) - - See `Generative Adversarial Nets` (https://arxiv.org/abs/1406.2661) for more - details. - - Args: - discriminator_gen_outputs: Discriminator output on generated data. Expected - to be in the range of (-inf, inf). - label_smoothing: The amount of smoothing for positive labels. This technique - is taken from `Improved Techniques for Training GANs` - (https://arxiv.org/abs/1606.03498). `0.0` means no smoothing. - weights: Optional `Tensor` whose rank is either 0, or the same rank as - `discriminator_gen_outputs`, and must be broadcastable to - `discriminator_gen_outputs` (i.e., all dimensions must be either `1`, or - the same as the corresponding dimension). - scope: The scope for the operations performed in computing the loss. - loss_collection: collection to which this loss will be added. - reduction: A `tf.compat.v1.losses.Reduction` to apply to loss. - add_summaries: Whether or not to add summaries for the loss. - - Returns: - A loss Tensor. The shape depends on `reduction`. - """ - with ops.name_scope(scope, 'generator_minimax_loss') as scope: - loss = -minimax_discriminator_loss( - array_ops.ones_like(discriminator_gen_outputs), - discriminator_gen_outputs, - label_smoothing, - weights, - weights, - scope, - loss_collection, - reduction, - add_summaries=False) - - if add_summaries: - summary.scalar('generator_minimax_loss', loss) - - return loss - - -def modified_discriminator_loss( - discriminator_real_outputs, - discriminator_gen_outputs, - label_smoothing=0.25, - real_weights=1.0, - generated_weights=1.0, - scope=None, - loss_collection=ops.GraphKeys.LOSSES, - reduction=losses.Reduction.SUM_BY_NONZERO_WEIGHTS, - add_summaries=False): - """Same as minimax discriminator loss. - - See `Generative Adversarial Nets` (https://arxiv.org/abs/1406.2661) for more - details. - - Args: - discriminator_real_outputs: Discriminator output on real data. - discriminator_gen_outputs: Discriminator output on generated data. Expected - to be in the range of (-inf, inf). - label_smoothing: The amount of smoothing for positive labels. 
This technique - is taken from `Improved Techniques for Training GANs` - (https://arxiv.org/abs/1606.03498). `0.0` means no smoothing. - real_weights: Optional `Tensor` whose rank is either 0, or the same rank as - `discriminator_gen_outputs`, and must be broadcastable to - `discriminator_gen_outputs` (i.e., all dimensions must be either `1`, or - the same as the corresponding dimension). - generated_weights: Same as `real_weights`, but for - `discriminator_gen_outputs`. - scope: The scope for the operations performed in computing the loss. - loss_collection: collection to which this loss will be added. - reduction: A `tf.compat.v1.losses.Reduction` to apply to loss. - add_summaries: Whether or not to add summaries for the loss. - - Returns: - A loss Tensor. The shape depends on `reduction`. - """ - return minimax_discriminator_loss(discriminator_real_outputs, - discriminator_gen_outputs, label_smoothing, - real_weights, generated_weights, scope or - 'discriminator_modified_loss', - loss_collection, reduction, add_summaries) - - -def modified_generator_loss(discriminator_gen_outputs, - label_smoothing=0.0, - weights=1.0, - scope=None, - loss_collection=ops.GraphKeys.LOSSES, - reduction=losses.Reduction.SUM_BY_NONZERO_WEIGHTS, - add_summaries=False): - """Modified generator loss for GANs. - - L = -log(sigmoid(D(G(z)))) - - This is the trick used in the original paper to avoid vanishing gradients - early in training. See `Generative Adversarial Nets` - (https://arxiv.org/abs/1406.2661) for more details. - - Args: - discriminator_gen_outputs: Discriminator output on generated data. Expected - to be in the range of (-inf, inf). - label_smoothing: The amount of smoothing for positive labels. This technique - is taken from `Improved Techniques for Training GANs` - (https://arxiv.org/abs/1606.03498). `0.0` means no smoothing. - weights: Optional `Tensor` whose rank is either 0, or the same rank as - `discriminator_gen_outputs`, and must be broadcastable to `labels` (i.e., - all dimensions must be either `1`, or the same as the corresponding - dimension). - scope: The scope for the operations performed in computing the loss. - loss_collection: collection to which this loss will be added. - reduction: A `tf.compat.v1.losses.Reduction` to apply to loss. - add_summaries: Whether or not to add summaries for the loss. - - Returns: - A loss Tensor. The shape depends on `reduction`. - """ - with ops.name_scope(scope, 'generator_modified_loss', - [discriminator_gen_outputs]) as scope: - loss = losses.sigmoid_cross_entropy( - array_ops.ones_like(discriminator_gen_outputs), - discriminator_gen_outputs, weights, label_smoothing, scope, - loss_collection, reduction) - - if add_summaries: - summary.scalar('generator_modified_loss', loss) - - return loss - - -# Least Squares loss from `Least Squares Generative Adversarial Networks` -# (https://arxiv.org/abs/1611.04076). - - -def least_squares_generator_loss( - discriminator_gen_outputs, - real_label=1, - weights=1.0, - scope=None, - loss_collection=ops.GraphKeys.LOSSES, - reduction=losses.Reduction.SUM_BY_NONZERO_WEIGHTS, - add_summaries=False): - """Least squares generator loss. - - This loss comes from `Least Squares Generative Adversarial Networks` - (https://arxiv.org/abs/1611.04076). - - L = 1/2 * (D(G(z)) - `real_label`) ** 2 - - where D(y) are discriminator logits. - - Args: - discriminator_gen_outputs: Discriminator output on generated data. Expected - to be in the range of (-inf, inf). 
- real_label: The value that the generator is trying to get the discriminator - to output on generated data. - weights: Optional `Tensor` whose rank is either 0, or the same rank as - `discriminator_gen_outputs`, and must be broadcastable to - `discriminator_gen_outputs` (i.e., all dimensions must be either `1`, or - the same as the corresponding dimension). - scope: The scope for the operations performed in computing the loss. - loss_collection: collection to which this loss will be added. - reduction: A `tf.compat.v1.losses.Reduction` to apply to loss. - add_summaries: Whether or not to add summaries for the loss. - - Returns: - A loss Tensor. The shape depends on `reduction`. - """ - with ops.name_scope(scope, 'lsq_generator_loss', - (discriminator_gen_outputs, real_label)) as scope: - discriminator_gen_outputs = _to_float(discriminator_gen_outputs) - loss = math_ops.squared_difference(discriminator_gen_outputs, - real_label) / 2.0 - loss = losses.compute_weighted_loss(loss, weights, scope, loss_collection, - reduction) - - if add_summaries: - summary.scalar('generator_lsq_loss', loss) - - return loss - - -def least_squares_discriminator_loss( - discriminator_real_outputs, - discriminator_gen_outputs, - real_label=1, - fake_label=0, - real_weights=1.0, - generated_weights=1.0, - scope=None, - loss_collection=ops.GraphKeys.LOSSES, - reduction=losses.Reduction.SUM_BY_NONZERO_WEIGHTS, - add_summaries=False): - """Least squares discriminator loss. - - This loss comes from `Least Squares Generative Adversarial Networks` - (https://arxiv.org/abs/1611.04076). - - L = 1/2 * (D(x) - `real`) ** 2 + - 1/2 * (D(G(z)) - `fake_label`) ** 2 - - where D(y) are discriminator logits. - - Args: - discriminator_real_outputs: Discriminator output on real data. - discriminator_gen_outputs: Discriminator output on generated data. Expected - to be in the range of (-inf, inf). - real_label: The value that the discriminator tries to output for real data. - fake_label: The value that the discriminator tries to output for fake data. - real_weights: Optional `Tensor` whose rank is either 0, or the same rank as - `discriminator_real_outputs`, and must be broadcastable to - `discriminator_real_outputs` (i.e., all dimensions must be either `1`, or - the same as the corresponding dimension). - generated_weights: Same as `real_weights`, but for - `discriminator_gen_outputs`. - scope: The scope for the operations performed in computing the loss. - loss_collection: collection to which this loss will be added. - reduction: A `tf.compat.v1.losses.Reduction` to apply to loss. - add_summaries: Whether or not to add summaries for the loss. - - Returns: - A loss Tensor. The shape depends on `reduction`. 
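-
-  Illustrative example (scalar logits, values made up): with the default
-  labels, D(x) = 0.8 and D(G(z)) = 0.3 give
-  L = 0.5 * (0.8 - 1)**2 + 0.5 * (0.3 - 0)**2 = 0.065.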
- """ - with ops.name_scope(scope, 'lsq_discriminator_loss', - (discriminator_gen_outputs, real_label)) as scope: - discriminator_real_outputs = _to_float(discriminator_real_outputs) - discriminator_gen_outputs = _to_float(discriminator_gen_outputs) - discriminator_real_outputs.shape.assert_is_compatible_with( - discriminator_gen_outputs.shape) - - real_losses = math_ops.squared_difference(discriminator_real_outputs, - real_label) / 2.0 - fake_losses = math_ops.squared_difference(discriminator_gen_outputs, - fake_label) / 2.0 - - loss_on_real = losses.compute_weighted_loss( - real_losses, - real_weights, - scope, - loss_collection=None, - reduction=reduction) - loss_on_generated = losses.compute_weighted_loss( - fake_losses, - generated_weights, - scope, - loss_collection=None, - reduction=reduction) - - loss = loss_on_real + loss_on_generated - util.add_loss(loss, loss_collection) - - if add_summaries: - summary.scalar('discriminator_gen_lsq_loss', loss_on_generated) - summary.scalar('discriminator_real_lsq_loss', loss_on_real) - summary.scalar('discriminator_lsq_loss', loss) - - return loss - - -# InfoGAN loss from `InfoGAN: Interpretable Representation Learning by -# `Information Maximizing Generative Adversarial Nets` -# https://arxiv.org/abs/1606.03657 - - -def _validate_distributions(distributions): - if not isinstance(distributions, (list, tuple)): - raise ValueError('`distributions` must be a list or tuple. Instead, ' - 'found %s.' % type(distributions)) - for x in distributions: - # We used to check with `isinstance(x, tf.compat.v1.distributions.Distribution)`. - # However, distributions have migrated to `tfp.distributions.Distribution`, - # which is a new code repo, so we can't check this way anymore until - # TF-GAN is migrated to a new repo as well. - # This new check is not sufficient, but is a useful heuristic for now. - if not callable(getattr(x, 'log_prob', None)): - raise ValueError('`distributions` must be a list of `Distributions`. ' - 'Instead, found %s.' % type(x)) - - -def _validate_information_penalty_inputs(structured_generator_inputs, - predicted_distributions): - """Validate input to `mutual_information_penalty`.""" - _validate_distributions(predicted_distributions) - if len(structured_generator_inputs) != len(predicted_distributions): - raise ValueError( - '`structured_generator_inputs` length %i must be the same ' - 'as `predicted_distributions` length %i.' % - (len(structured_generator_inputs), len(predicted_distributions))) - - -def mutual_information_penalty( - structured_generator_inputs, - predicted_distributions, - weights=1.0, - scope=None, - loss_collection=ops.GraphKeys.LOSSES, - reduction=losses.Reduction.SUM_BY_NONZERO_WEIGHTS, - add_summaries=False): - """Returns a penalty on the mutual information in an InfoGAN model. - - This loss comes from an InfoGAN paper https://arxiv.org/abs/1606.03657. - - Args: - structured_generator_inputs: A list of Tensors representing the random noise - that must have high mutual information with the generator output. List - length should match `predicted_distributions`. - predicted_distributions: A list of `tfp.distributions.Distribution`s. - Predicted by the recognizer, and used to evaluate the likelihood of the - structured noise. List length should match `structured_generator_inputs`. - weights: Optional `Tensor` whose rank is either 0, or the same dimensions as - `structured_generator_inputs`. - scope: The scope for the operations performed in computing the loss. 
- loss_collection: collection to which this loss will be added. - reduction: A `tf.compat.v1.losses.Reduction` to apply to loss. - add_summaries: Whether or not to add summaries for the loss. - - Returns: - A scalar Tensor representing the mutual information loss. - """ - _validate_information_penalty_inputs(structured_generator_inputs, - predicted_distributions) - - with ops.name_scope(scope, 'mutual_information_loss') as scope: - # Calculate the negative log-likelihood of the reconstructed noise. - log_probs = [ - math_ops.reduce_mean(dist.log_prob(noise)) for dist, noise in zip( - predicted_distributions, structured_generator_inputs) - ] - loss = -1 * losses.compute_weighted_loss( - log_probs, - weights, - scope, - loss_collection=loss_collection, - reduction=reduction) - - if add_summaries: - summary.scalar('mutual_information_penalty', loss) - - return loss - - -def _numerically_stable_global_norm(tensor_list): - """Compute the global norm of a list of Tensors, with improved stability. - - The global norm computation sometimes overflows due to the intermediate L2 - step. To avoid this, we divide by a cheap-to-compute max over the - matrix elements. - - Args: - tensor_list: A list of tensors, or `None`. - - Returns: - A scalar tensor with the global norm. - """ - if all(x is None for x in tensor_list): - return 0.0 - - list_max = math_ops.reduce_max([ - math_ops.reduce_max(math_ops.abs(x)) for x in tensor_list if x is not None - ]) - return list_max * clip_ops.global_norm( - [x / list_max for x in tensor_list if x is not None]) - - -def _used_weight(weights_list): - for weight in weights_list: - if weight is not None: - return tensor_util.constant_value(ops.convert_to_tensor(weight)) - - -def _validate_args(losses_list, weight_factor, gradient_ratio): - for loss in losses_list: - loss.shape.assert_is_compatible_with([]) - if weight_factor is None and gradient_ratio is None: - raise ValueError( - '`weight_factor` and `gradient_ratio` cannot both be `None.`') - if weight_factor is not None and gradient_ratio is not None: - raise ValueError( - '`weight_factor` and `gradient_ratio` cannot both be specified.') - - -# TODO(joelshor): Add ability to pass in gradients, to avoid recomputing. -def combine_adversarial_loss(main_loss, - adversarial_loss, - weight_factor=None, - gradient_ratio=None, - gradient_ratio_epsilon=1e-6, - variables=None, - scalar_summaries=True, - gradient_summaries=True, - scope=None): - """Utility to combine main and adversarial losses. - - This utility combines the main and adversarial losses in one of two ways. - 1) Fixed coefficient on adversarial loss. Use `weight_factor` in this case. - 2) Fixed ratio of gradients. Use `gradient_ratio` in this case. This is often - used to make sure both losses affect weights roughly equally, as in - https://arxiv.org/pdf/1705.05823. - - One can optionally also visualize the scalar and gradient behavior of the - losses. - - Args: - main_loss: A floating scalar Tensor indicating the main loss. - adversarial_loss: A floating scalar Tensor indication the adversarial loss. - weight_factor: If not `None`, the coefficient by which to multiply the - adversarial loss. Exactly one of this and `gradient_ratio` must be - non-None. - gradient_ratio: If not `None`, the ratio of the magnitude of the gradients. - Specifically, gradient_ratio = grad_mag(main_loss) / - grad_mag(adversarial_loss) Exactly one of this and `weight_factor` must be - non-None. 
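# [Editorial sketch] What the `gradient_ratio` option above does, using plain
# scalar stand-ins for the two gradient norms (values mirror
# CombineAdversarialLossTest later in this change): the adversarial coefficient is
# chosen so that, after scaling, grad_mag(main) / grad_mag(scaled adversarial)
# equals the requested ratio.
main_grad_mag = 2.0      # stand-in for ||d main_loss / d variables||
adv_grad_mag = 3.0       # stand-in for ||d adversarial_loss / d variables||
gradient_ratio = 0.5     # requested grad_mag(main) / grad_mag(adversarial)
epsilon = 1e-6           # gradient_ratio_epsilon

adv_coeff = (main_grad_mag / (adv_grad_mag + epsilon)) / gradient_ratio
scaled_adv_grad_mag = adv_coeff * adv_grad_mag
print(adv_coeff)                            # ~1.3333
print(main_grad_mag / scaled_adv_grad_mag)  # ~0.5, the requested ratio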
- gradient_ratio_epsilon: An epsilon to add to the adversarial loss - coefficient denominator, to avoid division-by-zero. - variables: List of variables to calculate gradients with respect to. If not - present, defaults to all trainable variables. - scalar_summaries: Create scalar summaries of losses. - gradient_summaries: Create gradient summaries of losses. - scope: Optional name scope. - - Returns: - A floating scalar Tensor indicating the desired combined loss. - - Raises: - ValueError: Malformed input. - """ - _validate_args([main_loss, adversarial_loss], weight_factor, gradient_ratio) - if variables is None: - variables = contrib_variables_lib.get_trainable_variables() - - with ops.name_scope( - scope, 'adversarial_loss', values=[main_loss, adversarial_loss]): - # Compute gradients if we will need them. - if gradient_summaries or gradient_ratio is not None: - main_loss_grad_mag = _numerically_stable_global_norm( - gradients_impl.gradients(main_loss, variables)) - adv_loss_grad_mag = _numerically_stable_global_norm( - gradients_impl.gradients(adversarial_loss, variables)) - - # Add summaries, if applicable. - if scalar_summaries: - summary.scalar('main_loss', main_loss) - summary.scalar('adversarial_loss', adversarial_loss) - if gradient_summaries: - summary.scalar('main_loss_gradients', main_loss_grad_mag) - summary.scalar('adversarial_loss_gradients', adv_loss_grad_mag) - - # Combine losses in the appropriate way. - # If `weight_factor` is always `0`, avoid computing the adversarial loss - # tensor entirely. - if _used_weight((weight_factor, gradient_ratio)) == 0: - final_loss = main_loss - elif weight_factor is not None: - final_loss = ( - main_loss + array_ops.stop_gradient(weight_factor) * adversarial_loss) - elif gradient_ratio is not None: - grad_mag_ratio = main_loss_grad_mag / ( - adv_loss_grad_mag + gradient_ratio_epsilon) - adv_coeff = grad_mag_ratio / gradient_ratio - summary.scalar('adversarial_coefficient', adv_coeff) - final_loss = ( - main_loss + array_ops.stop_gradient(adv_coeff) * adversarial_loss) - - return final_loss - - -def cycle_consistency_loss(data_x, - reconstructed_data_x, - data_y, - reconstructed_data_y, - scope=None, - add_summaries=False): - """Defines the cycle consistency loss. - - The cyclegan model has two partial models where `model_x2y` generator F maps - data set X to Y, `model_y2x` generator G maps data set Y to X. For a `data_x` - in data set X, we could reconstruct it by - * reconstructed_data_x = G(F(data_x)) - Similarly - * reconstructed_data_y = F(G(data_y)) - - The cycle consistency loss is about the difference between data and - reconstructed data, namely - * loss_x2x = |data_x - G(F(data_x))| (L1-norm) - * loss_y2y = |data_y - F(G(data_y))| (L1-norm) - * loss = (loss_x2x + loss_y2y) / 2 - where `loss` is the final result. - - For the L1-norm, we follow the original implementation: - https://github.com/junyanz/CycleGAN/blob/master/models/cycle_gan_model.lua - we use L1-norm of pixel-wise error normalized by data size such that - `cycle_loss_weight` can be specified independent of image size. - - See https://arxiv.org/abs/1703.10593 for more details. - - Args: - data_x: A `Tensor` of data X. - reconstructed_data_x: A `Tensor` of reconstructed data X. - data_y: A `Tensor` of data Y. - reconstructed_data_y: A `Tensor` of reconstructed data Y. - scope: The scope for the operations performed in computing the loss. - Defaults to None. - add_summaries: Whether or not to add detailed summaries for the loss. - Defaults to False. 
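# [Editorial sketch] The cycle consistency loss described above, reproduced in
# NumPy. The numbers are the ones checked by CycleConsistencyLossTest later in
# this change (expected value 5.25); each direction is a mean absolute
# reconstruction error, and the two directions are averaged.
import numpy as np

data_x = np.array([[1.0, 2, 3], [4, 5, 6]])
reconstructed_data_x = np.array([[7.0, 8, 9], [10, 11, 12]])
data_y = np.array([1.0, 9])
reconstructed_data_y = np.array([-2.0, 3])

loss_x2x = np.mean(np.abs(data_x - reconstructed_data_x))  # 6.0
loss_y2y = np.mean(np.abs(data_y - reconstructed_data_y))  # 4.5
print((loss_x2x + loss_y2y) / 2.0)                         # 5.25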
- - Returns: - A scalar `Tensor` of cycle consistency loss. - """ - - with ops.name_scope( - scope, - 'cycle_consistency_loss', - values=[data_x, reconstructed_data_x, data_y, reconstructed_data_y]): - loss_x2x = losses.absolute_difference(data_x, reconstructed_data_x) - loss_y2y = losses.absolute_difference(data_y, reconstructed_data_y) - loss = (loss_x2x + loss_y2y) / 2.0 - if add_summaries: - summary.scalar('cycle_consistency_loss_x2x', loss_x2x) - summary.scalar('cycle_consistency_loss_y2y', loss_y2y) - summary.scalar('cycle_consistency_loss', loss) - - return loss diff --git a/tensorflow/contrib/gan/python/losses/python/losses_impl_test.py b/tensorflow/contrib/gan/python/losses/python/losses_impl_test.py deleted file mode 100644 index 44ee0f52696..00000000000 --- a/tensorflow/contrib/gan/python/losses/python/losses_impl_test.py +++ /dev/null @@ -1,701 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Tests for TFGAN losses.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from tensorflow.contrib.gan.python.losses.python import losses_impl as tfgan_losses -from tensorflow.python.framework import constant_op -from tensorflow.python.framework import dtypes -from tensorflow.python.framework import ops -from tensorflow.python.framework import random_seed -from tensorflow.python.ops import array_ops -from tensorflow.python.ops import clip_ops -from tensorflow.python.ops import math_ops -from tensorflow.python.ops import random_ops -from tensorflow.python.ops import variable_scope -from tensorflow.python.ops import variables -from tensorflow.python.ops.distributions import categorical -from tensorflow.python.ops.distributions import normal -from tensorflow.python.ops.losses import losses as tf_losses -from tensorflow.python.platform import test - - -# TODO(joelshor): Use `parameterized` tests when opensourced. 
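# [Editorial sketch] The hard-coded expected values in the loss tests below can be
# re-derived by hand. A NumPy check of the least-squares losses
# (https://arxiv.org/abs/1611.04076) using the same logits as `init_constants`,
# assuming the default SUM_BY_NONZERO_WEIGHTS reduction (a mean over the four
# equally weighted entries); the helper names are illustrative, not TF-GAN APIs.
import numpy as np

real_logits = np.array([-5.0, 1.4, 12.5, 2.7])   # discriminator on real data
gen_logits = np.array([10.0, 4.4, -5.5, 3.6])    # discriminator on generated data

def lsq_generator_loss(d_gen, real_label=1.0):
  # L = 1/2 * (D(G(z)) - real_label) ** 2, averaged over entries.
  return np.mean((d_gen - real_label) ** 2 / 2.0)

def lsq_discriminator_loss(d_real, d_gen, real_label=1.0, fake_label=0.0):
  # Real and generated terms are reduced separately, then summed.
  return (np.mean((d_real - real_label) ** 2 / 2.0) +
          np.mean((d_gen - fake_label) ** 2 / 2.0))

print(lsq_generator_loss(gen_logits))                    # 17.69625
print(lsq_discriminator_loss(real_logits, gen_logits))   # 41.73375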
-class _LossesTest(object): - - def init_constants(self): - self._discriminator_real_outputs_np = [-5.0, 1.4, 12.5, 2.7] - self._discriminator_gen_outputs_np = [10.0, 4.4, -5.5, 3.6] - self._weights = 2.3 - self._discriminator_real_outputs = constant_op.constant( - self._discriminator_real_outputs_np, dtype=dtypes.float32) - self._discriminator_gen_outputs = constant_op.constant( - self._discriminator_gen_outputs_np, dtype=dtypes.float32) - - def test_generator_all_correct(self): - loss = self._g_loss_fn(self._discriminator_gen_outputs) - self.assertEqual(self._discriminator_gen_outputs.dtype, loss.dtype) - self.assertEqual(self._generator_loss_name, loss.op.name) - with self.cached_session(): - self.assertAlmostEqual(self._expected_g_loss, loss.eval(), 5) - - def test_discriminator_all_correct(self): - loss = self._d_loss_fn( - self._discriminator_real_outputs, self._discriminator_gen_outputs) - self.assertEqual(self._discriminator_gen_outputs.dtype, loss.dtype) - self.assertEqual(self._discriminator_loss_name, loss.op.name) - with self.cached_session(): - self.assertAlmostEqual(self._expected_d_loss, loss.eval(), 5) - - def test_generator_loss_collection(self): - self.assertEqual(0, len(ops.get_collection('collection'))) - self._g_loss_fn( - self._discriminator_gen_outputs, loss_collection='collection') - self.assertEqual(1, len(ops.get_collection('collection'))) - - def test_discriminator_loss_collection(self): - self.assertEqual(0, len(ops.get_collection('collection'))) - self._d_loss_fn( - self._discriminator_real_outputs, self._discriminator_gen_outputs, - loss_collection='collection') - self.assertEqual(1, len(ops.get_collection('collection'))) - - def test_generator_no_reduction(self): - loss = self._g_loss_fn( - self._discriminator_gen_outputs, reduction=tf_losses.Reduction.NONE) - self.assertAllEqual([4], loss.shape) - - def test_discriminator_no_reduction(self): - loss = self._d_loss_fn( - self._discriminator_real_outputs, self._discriminator_gen_outputs, - reduction=tf_losses.Reduction.NONE) - self.assertAllEqual([4], loss.shape) - - def test_generator_patch(self): - loss = self._g_loss_fn( - array_ops.reshape(self._discriminator_gen_outputs, [2, 2])) - self.assertEqual(self._discriminator_gen_outputs.dtype, loss.dtype) - with self.cached_session(): - self.assertAlmostEqual(self._expected_g_loss, loss.eval(), 5) - - def test_discriminator_patch(self): - loss = self._d_loss_fn( - array_ops.reshape(self._discriminator_real_outputs, [2, 2]), - array_ops.reshape(self._discriminator_gen_outputs, [2, 2])) - self.assertEqual(self._discriminator_gen_outputs.dtype, loss.dtype) - with self.cached_session(): - self.assertAlmostEqual(self._expected_d_loss, loss.eval(), 5) - - def test_generator_loss_with_placeholder_for_logits(self): - logits = array_ops.placeholder(dtypes.float32, shape=(None, 4)) - weights = array_ops.ones_like(logits, dtype=dtypes.float32) - - loss = self._g_loss_fn(logits, weights=weights) - self.assertEqual(logits.dtype, loss.dtype) - - with self.cached_session() as sess: - loss = sess.run(loss, - feed_dict={ - logits: [[10.0, 4.4, -5.5, 3.6]], - }) - self.assertAlmostEqual(self._expected_g_loss, loss, 5) - - def test_discriminator_loss_with_placeholder_for_logits(self): - logits = array_ops.placeholder(dtypes.float32, shape=(None, 4)) - logits2 = array_ops.placeholder(dtypes.float32, shape=(None, 4)) - real_weights = array_ops.ones_like(logits, dtype=dtypes.float32) - generated_weights = array_ops.ones_like(logits, dtype=dtypes.float32) - - loss = self._d_loss_fn( - 
logits, logits2, real_weights=real_weights, - generated_weights=generated_weights) - - with self.cached_session() as sess: - loss = sess.run(loss, - feed_dict={ - logits: [self._discriminator_real_outputs_np], - logits2: [self._discriminator_gen_outputs_np], - }) - self.assertAlmostEqual(self._expected_d_loss, loss, 5) - - def test_generator_with_python_scalar_weight(self): - loss = self._g_loss_fn( - self._discriminator_gen_outputs, weights=self._weights) - with self.cached_session(): - self.assertAlmostEqual(self._expected_g_loss * self._weights, - loss.eval(), 4) - - def test_discriminator_with_python_scalar_weight(self): - loss = self._d_loss_fn( - self._discriminator_real_outputs, self._discriminator_gen_outputs, - real_weights=self._weights, generated_weights=self._weights) - with self.cached_session(): - self.assertAlmostEqual(self._expected_d_loss * self._weights, - loss.eval(), 4) - - def test_generator_with_scalar_tensor_weight(self): - loss = self._g_loss_fn(self._discriminator_gen_outputs, - weights=constant_op.constant(self._weights)) - with self.cached_session(): - self.assertAlmostEqual(self._expected_g_loss * self._weights, - loss.eval(), 4) - - def test_discriminator_with_scalar_tensor_weight(self): - weights = constant_op.constant(self._weights) - loss = self._d_loss_fn( - self._discriminator_real_outputs, self._discriminator_gen_outputs, - real_weights=weights, generated_weights=weights) - with self.cached_session(): - self.assertAlmostEqual(self._expected_d_loss * self._weights, - loss.eval(), 4) - - def test_generator_add_summaries(self): - self.assertEqual(0, len(ops.get_collection(ops.GraphKeys.SUMMARIES))) - self._g_loss_fn(self._discriminator_gen_outputs, add_summaries=True) - self.assertLess(0, len(ops.get_collection(ops.GraphKeys.SUMMARIES))) - - def test_discriminator_add_summaries(self): - self.assertEqual(0, len(ops.get_collection(ops.GraphKeys.SUMMARIES))) - self._d_loss_fn( - self._discriminator_real_outputs, self._discriminator_gen_outputs, - add_summaries=True) - self.assertLess(0, len(ops.get_collection(ops.GraphKeys.SUMMARIES))) - - -class LeastSquaresLossTest(test.TestCase, _LossesTest): - """Tests for least_squares_xxx_loss.""" - - def setUp(self): - super(LeastSquaresLossTest, self).setUp() - self.init_constants() - self._expected_g_loss = 17.69625 - self._expected_d_loss = 41.73375 - self._generator_loss_name = 'lsq_generator_loss/value' - self._discriminator_loss_name = 'lsq_discriminator_loss/add' - self._g_loss_fn = tfgan_losses.least_squares_generator_loss - self._d_loss_fn = tfgan_losses.least_squares_discriminator_loss - - -class ModifiedLossTest(test.TestCase, _LossesTest): - """Tests for modified_xxx_loss.""" - - def setUp(self): - super(ModifiedLossTest, self).setUp() - self.init_constants() - self._expected_g_loss = 1.38582 - self._expected_d_loss = 6.19637 - self._generator_loss_name = 'generator_modified_loss/value' - self._discriminator_loss_name = 'discriminator_modified_loss/add_1' - self._g_loss_fn = tfgan_losses.modified_generator_loss - self._d_loss_fn = tfgan_losses.modified_discriminator_loss - - -class MinimaxLossTest(test.TestCase, _LossesTest): - """Tests for minimax_xxx_loss.""" - - def setUp(self): - super(MinimaxLossTest, self).setUp() - self.init_constants() - self._expected_g_loss = -4.82408 - self._expected_d_loss = 6.19637 - self._generator_loss_name = 'generator_minimax_loss/Neg' - self._discriminator_loss_name = 'discriminator_minimax_loss/add_1' - self._g_loss_fn = tfgan_losses.minimax_generator_loss - self._d_loss_fn 
= tfgan_losses.minimax_discriminator_loss - - -class WassersteinLossTest(test.TestCase, _LossesTest): - """Tests for wasserstein_xxx_loss.""" - - def setUp(self): - super(WassersteinLossTest, self).setUp() - self.init_constants() - self._expected_g_loss = -3.12500 - self._expected_d_loss = 0.22500 - self._generator_loss_name = 'generator_wasserstein_loss/value' - self._discriminator_loss_name = 'discriminator_wasserstein_loss/sub' - self._g_loss_fn = tfgan_losses.wasserstein_generator_loss - self._d_loss_fn = tfgan_losses.wasserstein_discriminator_loss - - -# TODO(joelshor): Use `parameterized` tests when opensourced. -# TODO(joelshor): Refactor this test to use the same code as the other losses. -class ACGANLossTest(test.TestCase): - """Tests for wasserstein_xxx_loss.""" - - def setUp(self): - super(ACGANLossTest, self).setUp() - self._g_loss_fn = tfgan_losses.acgan_generator_loss - self._d_loss_fn = tfgan_losses.acgan_discriminator_loss - self._discriminator_gen_classification_logits_np = [[10.0, 4.4, -5.5, 3.6], - [-4.0, 4.4, 5.2, 4.6], - [1.1, 2.4, -3.5, 5.6], - [1.1, 2.4, -3.5, 5.6]] - self._discriminator_real_classification_logits_np = [[-2.0, 0.4, 12.5, 2.7], - [-1.2, 1.9, 12.3, 2.6], - [-2.4, -1.7, 2.5, 2.7], - [1.1, 2.4, -3.5, 5.6]] - self._one_hot_labels_np = [[0, 1, 0, 0], - [0, 0, 1, 0], - [1, 0, 0, 0], - [1, 0, 0, 0]] - self._weights = 2.3 - - self._discriminator_gen_classification_logits = constant_op.constant( - self._discriminator_gen_classification_logits_np, dtype=dtypes.float32) - self._discriminator_real_classification_logits = constant_op.constant( - self._discriminator_real_classification_logits_np, dtype=dtypes.float32) - self._one_hot_labels = constant_op.constant( - self._one_hot_labels_np, dtype=dtypes.float32) - self._generator_kwargs = { - 'discriminator_gen_classification_logits': - self._discriminator_gen_classification_logits, - 'one_hot_labels': self._one_hot_labels, - } - self._discriminator_kwargs = { - 'discriminator_gen_classification_logits': - self._discriminator_gen_classification_logits, - 'discriminator_real_classification_logits': - self._discriminator_real_classification_logits, - 'one_hot_labels': self._one_hot_labels, - } - self._generator_loss_name = 'acgan_generator_loss/value' - self._discriminator_loss_name = 'acgan_discriminator_loss/add' - self._expected_g_loss = 3.84974 - self._expected_d_loss = 9.43950 - - def test_generator_all_correct(self): - loss = self._g_loss_fn(**self._generator_kwargs) - self.assertEqual( - self._discriminator_gen_classification_logits.dtype, loss.dtype) - self.assertEqual(self._generator_loss_name, loss.op.name) - with self.cached_session(): - self.assertAlmostEqual(self._expected_g_loss, loss.eval(), 5) - - def test_discriminator_all_correct(self): - loss = self._d_loss_fn(**self._discriminator_kwargs) - self.assertEqual( - self._discriminator_gen_classification_logits.dtype, loss.dtype) - self.assertEqual(self._discriminator_loss_name, loss.op.name) - with self.cached_session(): - self.assertAlmostEqual(self._expected_d_loss, loss.eval(), 5) - - def test_generator_loss_collection(self): - self.assertEqual(0, len(ops.get_collection('collection'))) - self._g_loss_fn(loss_collection='collection', **self._generator_kwargs) - self.assertEqual(1, len(ops.get_collection('collection'))) - - def test_discriminator_loss_collection(self): - self.assertEqual(0, len(ops.get_collection('collection'))) - self._d_loss_fn(loss_collection='collection', **self._discriminator_kwargs) - self.assertEqual(1, 
len(ops.get_collection('collection'))) - - def test_generator_no_reduction(self): - loss = self._g_loss_fn( - reduction=tf_losses.Reduction.NONE, **self._generator_kwargs) - self.assertAllEqual([4], loss.shape) - - def test_discriminator_no_reduction(self): - loss = self._d_loss_fn( - reduction=tf_losses.Reduction.NONE, **self._discriminator_kwargs) - self.assertAllEqual([4], loss.shape) - - def test_generator_patch(self): - patch_args = {x: array_ops.reshape(y, [2, 2, 4]) for x, y in - self._generator_kwargs.items()} - loss = self._g_loss_fn(**patch_args) - with self.cached_session(): - self.assertAlmostEqual(self._expected_g_loss, loss.eval(), 5) - - def test_discriminator_patch(self): - patch_args = {x: array_ops.reshape(y, [2, 2, 4]) for x, y in - self._discriminator_kwargs.items()} - loss = self._d_loss_fn(**patch_args) - with self.cached_session(): - self.assertAlmostEqual(self._expected_d_loss, loss.eval(), 5) - - def test_generator_loss_with_placeholder_for_logits(self): - gen_logits = array_ops.placeholder(dtypes.float32, shape=(None, 4)) - one_hot_labels = array_ops.placeholder(dtypes.int32, shape=(None, 4)) - - loss = self._g_loss_fn(gen_logits, one_hot_labels) - with self.cached_session() as sess: - loss = sess.run( - loss, feed_dict={ - gen_logits: self._discriminator_gen_classification_logits_np, - one_hot_labels: self._one_hot_labels_np, - }) - self.assertAlmostEqual(self._expected_g_loss, loss, 5) - - def test_discriminator_loss_with_placeholder_for_logits_and_weights(self): - gen_logits = array_ops.placeholder(dtypes.float32, shape=(None, 4)) - real_logits = array_ops.placeholder(dtypes.float32, shape=(None, 4)) - one_hot_labels = array_ops.placeholder(dtypes.int32, shape=(None, 4)) - - loss = self._d_loss_fn(gen_logits, real_logits, one_hot_labels) - - with self.cached_session() as sess: - loss = sess.run( - loss, feed_dict={ - gen_logits: self._discriminator_gen_classification_logits_np, - real_logits: self._discriminator_real_classification_logits_np, - one_hot_labels: self._one_hot_labels_np, - }) - self.assertAlmostEqual(self._expected_d_loss, loss, 5) - - def test_generator_with_python_scalar_weight(self): - loss = self._g_loss_fn(weights=self._weights, **self._generator_kwargs) - with self.cached_session(): - self.assertAlmostEqual(self._expected_g_loss * self._weights, - loss.eval(), 4) - - def test_discriminator_with_python_scalar_weight(self): - loss = self._d_loss_fn( - real_weights=self._weights, generated_weights=self._weights, - **self._discriminator_kwargs) - with self.cached_session(): - self.assertAlmostEqual(self._expected_d_loss * self._weights, - loss.eval(), 4) - - def test_generator_with_scalar_tensor_weight(self): - loss = self._g_loss_fn( - weights=constant_op.constant(self._weights), **self._generator_kwargs) - with self.cached_session(): - self.assertAlmostEqual(self._expected_g_loss * self._weights, - loss.eval(), 4) - - def test_discriminator_with_scalar_tensor_weight(self): - weights = constant_op.constant(self._weights) - loss = self._d_loss_fn(real_weights=weights, generated_weights=weights, - **self._discriminator_kwargs) - with self.cached_session(): - self.assertAlmostEqual(self._expected_d_loss * self._weights, - loss.eval(), 4) - - def test_generator_add_summaries(self): - self.assertEqual(0, len(ops.get_collection(ops.GraphKeys.SUMMARIES))) - self._g_loss_fn(add_summaries=True, **self._generator_kwargs) - self.assertLess(0, len(ops.get_collection(ops.GraphKeys.SUMMARIES))) - - def test_discriminator_add_summaries(self): - 
self.assertEqual(0, len(ops.get_collection(ops.GraphKeys.SUMMARIES))) - self._d_loss_fn(add_summaries=True, **self._discriminator_kwargs) - self.assertLess(0, len(ops.get_collection(ops.GraphKeys.SUMMARIES))) - - -class _PenaltyTest(object): - - def test_all_correct(self): - loss = self._penalty_fn(**self._kwargs) - self.assertEqual(self._expected_dtype, loss.dtype) - # NOTE: Op names will change, it is inappropriate to include them in tests. - # See go/tf-breaking-change. - # self.assertEqual(self._expected_op_name, loss.op.name) - with self.cached_session(): - variables.global_variables_initializer().run() - self.assertAlmostEqual(self._expected_loss, loss.eval(), 6) - - def test_loss_collection(self): - self.assertEqual(0, len(ops.get_collection('collection'))) - self._penalty_fn(loss_collection='collection', **self._kwargs) - self.assertEqual(1, len(ops.get_collection('collection'))) - - def test_no_reduction(self): - loss = self._penalty_fn(reduction=tf_losses.Reduction.NONE, **self._kwargs) - self.assertAllEqual([self._batch_size], loss.shape) - - def test_python_scalar_weight(self): - loss = self._penalty_fn(weights=2.3, **self._kwargs) - with self.cached_session(): - variables.global_variables_initializer().run() - self.assertAlmostEqual(self._expected_loss * 2.3, loss.eval(), 3) - - def test_scalar_tensor_weight(self): - loss = self._penalty_fn(weights=constant_op.constant(2.3), **self._kwargs) - with self.cached_session(): - variables.global_variables_initializer().run() - self.assertAlmostEqual(self._expected_loss * 2.3, loss.eval(), 3) - - -class GradientPenaltyTest(test.TestCase, _PenaltyTest): - """Tests for wasserstein_gradient_penalty.""" - - def setUp(self): - super(GradientPenaltyTest, self).setUp() - self._penalty_fn = tfgan_losses.wasserstein_gradient_penalty - self._generated_data_np = [[3.1, 2.3, -12.3, 32.1]] - self._real_data_np = [[-12.3, 23.2, 16.3, -43.2]] - self._expected_dtype = dtypes.float32 - - with variable_scope.variable_scope('fake_scope') as self._scope: - self._discriminator_fn(0.0, 0.0) - - self._kwargs = { - 'generated_data': constant_op.constant( - self._generated_data_np, dtype=self._expected_dtype), - 'real_data': constant_op.constant( - self._real_data_np, dtype=self._expected_dtype), - 'generator_inputs': None, - 'discriminator_fn': self._discriminator_fn, - 'discriminator_scope': self._scope, - } - self._expected_loss = 9.00000 - self._expected_op_name = 'wasserstein_gradient_penalty/value' - self._batch_size = 1 - - def _discriminator_fn(self, inputs, _): - ops.add_to_collection('fake_update_ops', constant_op.constant(1.0)) - return variable_scope.get_variable('dummy_d', initializer=2.0) * inputs - - def test_loss_with_placeholder(self): - generated_data = array_ops.placeholder(dtypes.float32, shape=(None, None)) - real_data = array_ops.placeholder(dtypes.float32, shape=(None, None)) - - loss = tfgan_losses.wasserstein_gradient_penalty( - generated_data, - real_data, - self._kwargs['generator_inputs'], - self._kwargs['discriminator_fn'], - self._kwargs['discriminator_scope']) - self.assertEqual(generated_data.dtype, loss.dtype) - - with self.cached_session() as sess: - variables.global_variables_initializer().run() - loss = sess.run(loss, - feed_dict={ - generated_data: self._generated_data_np, - real_data: self._real_data_np, - }) - self.assertAlmostEqual(self._expected_loss, loss, 5) - - def test_loss_using_one_sided_mode(self): - generated_data = array_ops.placeholder(dtypes.float32, shape=(None, None)) - real_data = 
array_ops.placeholder(dtypes.float32, shape=(None, None)) - - loss = tfgan_losses.wasserstein_gradient_penalty( - generated_data, - real_data, - self._kwargs['generator_inputs'], - self._kwargs['discriminator_fn'], - self._kwargs['discriminator_scope'], - one_sided=True) - self.assertEqual(generated_data.dtype, loss.dtype) - - with self.cached_session() as sess: - variables.global_variables_initializer().run() - loss = sess.run(loss, - feed_dict={ - generated_data: self._generated_data_np, - real_data: self._real_data_np, - }) - self.assertAlmostEqual(self._expected_loss, loss, 5) - - def test_loss_with_gradient_norm_target(self): - """Test loss value with non default gradient norm target.""" - generated_data = array_ops.placeholder(dtypes.float32, shape=(None, None)) - real_data = array_ops.placeholder(dtypes.float32, shape=(None, None)) - - loss = tfgan_losses.wasserstein_gradient_penalty( - generated_data, - real_data, - self._kwargs['generator_inputs'], - self._kwargs['discriminator_fn'], - self._kwargs['discriminator_scope'], - target=2.0) - - with self.cached_session() as sess: - variables.global_variables_initializer().run() - loss = sess.run( - loss, - feed_dict={ - generated_data: self._generated_data_np, - real_data: self._real_data_np, - }) - self.assertAlmostEqual(1.0, loss, 5) - - def test_reuses_scope(self): - """Test that gradient penalty reuses discriminator scope.""" - num_vars = len(ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)) - tfgan_losses.wasserstein_gradient_penalty(**self._kwargs) - self.assertEqual( - num_vars, len(ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES))) - - def test_works_with_get_collection(self): - """Tests that gradient penalty works inside other scopes.""" - # We ran the discriminator once in the setup, so there should be an op - # already in the collection. - self.assertEqual(1, len(ops.get_collection( - 'fake_update_ops', self._kwargs['discriminator_scope'].name))) - - # Make sure the op is added to the collection even if it's in a name scope. - with ops.name_scope('loss'): - tfgan_losses.wasserstein_gradient_penalty(**self._kwargs) - self.assertEqual(2, len(ops.get_collection( - 'fake_update_ops', self._kwargs['discriminator_scope'].name))) - - # Make sure the op is added to the collection even if it's in a variable - # scope. 
- with variable_scope.variable_scope('loss_vscope'): - tfgan_losses.wasserstein_gradient_penalty(**self._kwargs) - self.assertEqual(3, len(ops.get_collection( - 'fake_update_ops', self._kwargs['discriminator_scope'].name))) - - -class MutualInformationPenaltyTest(test.TestCase, _PenaltyTest): - """Tests for mutual_information_penalty.""" - - def setUp(self): - super(MutualInformationPenaltyTest, self).setUp() - self._penalty_fn = tfgan_losses.mutual_information_penalty - self._structured_generator_inputs = [1.0, 2.0] - self._predicted_distributions = [categorical.Categorical(logits=[1.0, 2.0]), - normal.Normal([0.0], [1.0])] - self._expected_dtype = dtypes.float32 - - self._kwargs = { - 'structured_generator_inputs': self._structured_generator_inputs, - 'predicted_distributions': self._predicted_distributions, - } - self._expected_loss = 1.61610 - self._expected_op_name = 'mutual_information_loss/mul_1' - self._batch_size = 2 - - -class CombineAdversarialLossTest(test.TestCase): - """Tests for combine_adversarial_loss.""" - - def setUp(self): - super(CombineAdversarialLossTest, self).setUp() - self._generated_data_np = [[3.1, 2.3, -12.3, 32.1]] - self._real_data_np = [[-12.3, 23.2, 16.3, -43.2]] - self._generated_data = constant_op.constant( - self._generated_data_np, dtype=dtypes.float32) - self._real_data = constant_op.constant( - self._real_data_np, dtype=dtypes.float32) - self._generated_inputs = None - self._expected_loss = 9.00000 - - def _test_correct_helper(self, use_weight_factor): - variable_list = [variables.Variable(1.0)] - main_loss = variable_list[0] * 2 - adversarial_loss = variable_list[0] * 3 - gradient_ratio_epsilon = 1e-6 - if use_weight_factor: - weight_factor = constant_op.constant(2.0) - gradient_ratio = None - adv_coeff = 2.0 - expected_loss = 1.0 * 2 + adv_coeff * 1.0 * 3 - else: - weight_factor = None - gradient_ratio = constant_op.constant(0.5) - adv_coeff = 2.0 / (3 * 0.5 + gradient_ratio_epsilon) - expected_loss = 1.0 * 2 + adv_coeff * 1.0 * 3 - combined_loss = tfgan_losses.combine_adversarial_loss( - main_loss, - adversarial_loss, - weight_factor=weight_factor, - gradient_ratio=gradient_ratio, - gradient_ratio_epsilon=gradient_ratio_epsilon, - variables=variable_list) - - with self.test_session(use_gpu=True): - variables.global_variables_initializer().run() - self.assertNear(expected_loss, combined_loss.eval(), 1e-5) - - def test_correct_useweightfactor(self): - self._test_correct_helper(True) - - def test_correct_nouseweightfactor(self): - self._test_correct_helper(False) - - def _test_no_weight_skips_adversarial_loss_helper(self, use_weight_factor): - """Test the 0 adversarial weight or grad ratio skips adversarial loss.""" - main_loss = constant_op.constant(1.0) - adversarial_loss = constant_op.constant(1.0) - - weight_factor = 0.0 if use_weight_factor else None - gradient_ratio = None if use_weight_factor else 0.0 - - combined_loss = tfgan_losses.combine_adversarial_loss( - main_loss, - adversarial_loss, - weight_factor=weight_factor, - gradient_ratio=gradient_ratio, - gradient_summaries=False) - - with self.test_session(use_gpu=True): - self.assertEqual(1.0, combined_loss.eval()) - - def test_no_weight_skips_adversarial_loss_useweightfactor(self): - self._test_no_weight_skips_adversarial_loss_helper(True) - - def test_no_weight_skips_adversarial_loss_nouseweightfactor(self): - self._test_no_weight_skips_adversarial_loss_helper(False) - - def test_stable_global_norm_avoids_overflow(self): - tensors = [array_ops.ones([4]), array_ops.ones([4, 4]) * 1e19, 
None] - gnorm_is_inf = math_ops.is_inf(clip_ops.global_norm(tensors)) - stable_gnorm_is_inf = math_ops.is_inf( - tfgan_losses._numerically_stable_global_norm(tensors)) - - with self.test_session(use_gpu=True): - self.assertTrue(gnorm_is_inf.eval()) - self.assertFalse(stable_gnorm_is_inf.eval()) - - def test_stable_global_norm_unchanged(self): - """Test that preconditioning doesn't change global norm value.""" - random_seed.set_random_seed(1234) - tensors = [random_ops.random_uniform([3]*i, -10.0, 10.0) for i in range(6)] - gnorm = clip_ops.global_norm(tensors) - precond_gnorm = tfgan_losses._numerically_stable_global_norm(tensors) - - with self.test_session(use_gpu=True) as sess: - for _ in range(10): # spot check closeness on more than one sample. - gnorm_np, precond_gnorm_np = sess.run([gnorm, precond_gnorm]) - self.assertNear(gnorm_np, precond_gnorm_np, 1e-4) - - -class CycleConsistencyLossTest(test.TestCase): - """Tests for cycle_consistency_loss.""" - - def setUp(self): - super(CycleConsistencyLossTest, self).setUp() - - self._data_x_np = [[1.0, 2, 3], [4, 5, 6]] - self._reconstructed_data_x_np = [[7.0, 8, 9], [10, 11, 12]] - self._data_y_np = [1.0, 9] - self._reconstructed_data_y_np = [-2.0, 3] - - self._data_x = constant_op.constant(self._data_x_np, dtype=dtypes.float32) - self._reconstructed_data_x = constant_op.constant( - self._reconstructed_data_x_np, dtype=dtypes.float32) - self._data_y = constant_op.constant(self._data_y_np, dtype=dtypes.float32) - self._reconstructed_data_y = constant_op.constant( - self._reconstructed_data_y_np, dtype=dtypes.float32) - - def test_correct_loss(self): - loss = tfgan_losses.cycle_consistency_loss( - self._data_x, self._reconstructed_data_x, self._data_y, - self._reconstructed_data_y) - with self.test_session(use_gpu=True): - variables.global_variables_initializer().run() - self.assertNear(5.25, loss.eval(), 1e-5) - - -if __name__ == '__main__': - test.main() diff --git a/tensorflow/contrib/gan/python/losses/python/losses_wargs.py b/tensorflow/contrib/gan/python/losses/python/losses_wargs.py deleted file mode 100644 index f212bdcf30b..00000000000 --- a/tensorflow/contrib/gan/python/losses/python/losses_wargs.py +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright 2017 Google Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""TFGAN grouped API. 
Please see README.md for details and usage.""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -# pylint: disable=wildcard-import -from tensorflow.contrib.gan.python.losses.python import losses_impl -from tensorflow.contrib.gan.python.losses.python.losses_impl import * -# pylint: enable=wildcard-import - -from tensorflow.python.util.all_util import remove_undocumented - -remove_undocumented(__name__, losses_impl.__all__) diff --git a/tensorflow/contrib/gan/python/losses/python/tuple_losses.py b/tensorflow/contrib/gan/python/losses/python/tuple_losses.py deleted file mode 100644 index 1a50b3f5880..00000000000 --- a/tensorflow/contrib/gan/python/losses/python/tuple_losses.py +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""TFGAN utilities for loss functions that accept GANModel namedtuples.""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -# pylint: disable=wildcard-import -from tensorflow.contrib.gan.python.losses.python import tuple_losses_impl -from tensorflow.contrib.gan.python.losses.python.tuple_losses_impl import * -# pylint: enable=wildcard-import -from tensorflow.python.util.all_util import remove_undocumented - -__all__ = tuple_losses_impl.__all__ -remove_undocumented(__name__, __all__) diff --git a/tensorflow/contrib/gan/python/losses/python/tuple_losses_impl.py b/tensorflow/contrib/gan/python/losses/python/tuple_losses_impl.py deleted file mode 100644 index 76e57df7f64..00000000000 --- a/tensorflow/contrib/gan/python/losses/python/tuple_losses_impl.py +++ /dev/null @@ -1,365 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""TF-GAN utilities for loss functions that accept GANModel namedtuples. - -The losses and penalties in this file all correspond to losses in -`losses_impl.py`. Losses in that file take individual arguments, whereas in this -file they take a `GANModel` tuple. 
For example: - -losses_impl.py: - ```python - def wasserstein_discriminator_loss( - discriminator_real_outputs, - discriminator_gen_outputs, - real_weights=1.0, - generated_weights=1.0, - scope=None, - loss_collection=ops.GraphKeys.LOSSES, - reduction=losses.Reduction.SUM_BY_NONZERO_WEIGHTS, - add_summaries=False) - ``` - -tuple_losses_impl.py: - ```python - def wasserstein_discriminator_loss( - gan_model, - real_weights=1.0, - generated_weights=1.0, - scope=None, - loss_collection=ops.GraphKeys.LOSSES, - reduction=losses.Reduction.SUM_BY_NONZERO_WEIGHTS, - add_summaries=False) - ``` - - - -Example usage: - ```python - # `tfgan.losses.wargs` losses take individual arguments. - w_loss = tfgan.losses.wargs.wasserstein_discriminator_loss( - discriminator_real_outputs, - discriminator_gen_outputs) - - # `tfgan.losses` losses take GANModel namedtuples. - w_loss2 = tfgan.losses.wasserstein_discriminator_loss(gan_model) - ``` -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from tensorflow.contrib.gan.python import namedtuples -from tensorflow.contrib.gan.python.losses.python import losses_impl -from tensorflow.python.util import tf_inspect - - -__all__ = [ - 'acgan_discriminator_loss', - 'acgan_generator_loss', - 'least_squares_discriminator_loss', - 'least_squares_generator_loss', - 'modified_discriminator_loss', - 'modified_generator_loss', - 'minimax_discriminator_loss', - 'minimax_generator_loss', - 'wasserstein_discriminator_loss', - 'wasserstein_generator_loss', - 'wasserstein_gradient_penalty', - 'mutual_information_penalty', - 'combine_adversarial_loss', - 'cycle_consistency_loss', - 'stargan_generator_loss_wrapper', - 'stargan_discriminator_loss_wrapper', - 'stargan_gradient_penalty_wrapper' -] - - -def _args_to_gan_model(loss_fn): - """Converts a loss taking individual args to one taking a GANModel namedtuple. - - The new function has the same name as the original one. - - Args: - loss_fn: A python function taking a `GANModel` object and returning a loss - Tensor calculated from that object. The shape of the loss depends on - `reduction`. - - Returns: - A new function that takes a GANModel namedtuples and returns the same loss. - """ - # Match arguments in `loss_fn` to elements of `namedtuple`. - # TODO(joelshor): Properly handle `varargs` and `keywords`. - argspec = tf_inspect.getargspec(loss_fn) - defaults = argspec.defaults or [] - - required_args = set(argspec.args[:-len(defaults)]) - args_with_defaults = argspec.args[-len(defaults):] - default_args_dict = dict(zip(args_with_defaults, defaults)) - - def new_loss_fn(gan_model, **kwargs): # pylint:disable=missing-docstring - def _asdict(namedtuple): - """Returns a namedtuple as a dictionary. - - This is required because `_asdict()` in Python 3.x.x is broken in classes - that inherit from `collections.namedtuple`. See - https://bugs.python.org/issue24931 for more details. - - Args: - namedtuple: An object that inherits from `collections.namedtuple`. - - Returns: - A dictionary version of the tuple. - """ - return {k: getattr(namedtuple, k) for k in namedtuple._fields} - gan_model_dict = _asdict(gan_model) - - # Make sure non-tuple required args are supplied. - args_from_tuple = set(argspec.args).intersection(set(gan_model._fields)) - required_args_not_from_tuple = required_args - args_from_tuple - for arg in required_args_not_from_tuple: - if arg not in kwargs: - raise ValueError('`%s` must be supplied to %s loss function.' 
% ( - arg, loss_fn.__name__)) - - # Make sure tuple args aren't also supplied as keyword args. - ambiguous_args = set(gan_model._fields).intersection(set(kwargs.keys())) - if ambiguous_args: - raise ValueError( - 'The following args are present in both the tuple and keyword args ' - 'for %s: %s' % (loss_fn.__name__, ambiguous_args)) - - # Add required args to arg dictionary. - required_args_from_tuple = required_args.intersection(args_from_tuple) - for arg in required_args_from_tuple: - assert arg not in kwargs - kwargs[arg] = gan_model_dict[arg] - - # Add arguments that have defaults. - for arg in default_args_dict: - val_from_tuple = gan_model_dict[arg] if arg in gan_model_dict else None - val_from_kwargs = kwargs[arg] if arg in kwargs else None - assert not (val_from_tuple is not None and val_from_kwargs is not None) - kwargs[arg] = (val_from_tuple if val_from_tuple is not None else - val_from_kwargs if val_from_kwargs is not None else - default_args_dict[arg]) - - return loss_fn(**kwargs) - - new_docstring = """The gan_model version of %s.""" % loss_fn.__name__ - new_loss_fn.__docstring__ = new_docstring - new_loss_fn.__name__ = loss_fn.__name__ - new_loss_fn.__module__ = loss_fn.__module__ - return new_loss_fn - - -# Wasserstein losses from `Wasserstein GAN` (https://arxiv.org/abs/1701.07875). -wasserstein_generator_loss = _args_to_gan_model( - losses_impl.wasserstein_generator_loss) -wasserstein_discriminator_loss = _args_to_gan_model( - losses_impl.wasserstein_discriminator_loss) -wasserstein_gradient_penalty = _args_to_gan_model( - losses_impl.wasserstein_gradient_penalty) - -# ACGAN losses from `Conditional Image Synthesis With Auxiliary Classifier GANs` -# (https://arxiv.org/abs/1610.09585). -acgan_discriminator_loss = _args_to_gan_model( - losses_impl.acgan_discriminator_loss) -acgan_generator_loss = _args_to_gan_model( - losses_impl.acgan_generator_loss) - - -# Original losses from `Generative Adversarial Nets` -# (https://arxiv.org/abs/1406.2661). -minimax_discriminator_loss = _args_to_gan_model( - losses_impl.minimax_discriminator_loss) -minimax_generator_loss = _args_to_gan_model( - losses_impl.minimax_generator_loss) -modified_discriminator_loss = _args_to_gan_model( - losses_impl.modified_discriminator_loss) -modified_generator_loss = _args_to_gan_model( - losses_impl.modified_generator_loss) - - -# Least Squares loss from `Least Squares Generative Adversarial Networks` -# (https://arxiv.org/abs/1611.04076). -least_squares_generator_loss = _args_to_gan_model( - losses_impl.least_squares_generator_loss) -least_squares_discriminator_loss = _args_to_gan_model( - losses_impl.least_squares_discriminator_loss) - - -# InfoGAN loss from `InfoGAN: Interpretable Representation Learning by -# `Information Maximizing Generative Adversarial Nets` -# https://arxiv.org/abs/1606.03657 -mutual_information_penalty = _args_to_gan_model( - losses_impl.mutual_information_penalty) - - -def combine_adversarial_loss(gan_loss, - gan_model, - non_adversarial_loss, - weight_factor=None, - gradient_ratio=None, - gradient_ratio_epsilon=1e-6, - scalar_summaries=True, - gradient_summaries=True): - """Combine adversarial loss and main loss. - - Uses `combine_adversarial_loss` to combine the losses, and returns - a modified GANLoss namedtuple. - - Args: - gan_loss: A GANLoss namedtuple. Assume the GANLoss.generator_loss is the - adversarial loss. - gan_model: A GANModel namedtuple. Used to access the generator's variables. - non_adversarial_loss: Same as `main_loss` from - `combine_adversarial_loss`. 
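# [Editorial sketch] The essence of `_args_to_gan_model` above, reduced to plain
# Python 3: arguments whose names match fields of the model namedtuple are pulled
# from the tuple, everything else is passed through as keyword arguments. All
# names below are made up for illustration; the real wrapper additionally handles
# defaults, missing required args, and tuple/keyword conflicts.
import collections
import inspect

FakeModel = collections.namedtuple('FakeModel', ['real_outputs', 'gen_outputs'])

def toy_loss(real_outputs, gen_outputs, weight=1.0):
  return weight * (sum(real_outputs) - sum(gen_outputs))

def args_to_model(loss_fn):
  arg_names = set(inspect.signature(loss_fn).parameters)
  def new_loss_fn(model, **kwargs):
    tuple_args = {k: getattr(model, k) for k in model._fields if k in arg_names}
    return loss_fn(**tuple_args, **kwargs)
  return new_loss_fn

model_loss = args_to_model(toy_loss)
print(model_loss(FakeModel([1.0, 2.0], [0.5]), weight=2.0))  # 5.0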
- weight_factor: Same as `weight_factor` from - `combine_adversarial_loss`. - gradient_ratio: Same as `gradient_ratio` from - `combine_adversarial_loss`. - gradient_ratio_epsilon: Same as `gradient_ratio_epsilon` from - `combine_adversarial_loss`. - scalar_summaries: Same as `scalar_summaries` from - `combine_adversarial_loss`. - gradient_summaries: Same as `gradient_summaries` from - `combine_adversarial_loss`. - - Returns: - A modified GANLoss namedtuple, with `non_adversarial_loss` included - appropriately. - """ - combined_loss = losses_impl.combine_adversarial_loss( - non_adversarial_loss, - gan_loss.generator_loss, - weight_factor, - gradient_ratio, - gradient_ratio_epsilon, - gan_model.generator_variables, - scalar_summaries, - gradient_summaries) - return gan_loss._replace(generator_loss=combined_loss) - - -def cycle_consistency_loss(cyclegan_model, scope=None, add_summaries=False): - """Defines the cycle consistency loss. - - Uses `cycle_consistency_loss` to compute the cycle consistency loss for a - `cyclegan_model`. - - Args: - cyclegan_model: A `CycleGANModel` namedtuple. - scope: The scope for the operations performed in computing the loss. - Defaults to None. - add_summaries: Whether or not to add detailed summaries for the loss. - Defaults to False. - - Returns: - A scalar `Tensor` of cycle consistency loss. - - Raises: - ValueError: If `cyclegan_model` is not a `CycleGANModel` namedtuple. - """ - if not isinstance(cyclegan_model, namedtuples.CycleGANModel): - raise ValueError( - '`cyclegan_model` must be a `CycleGANModel`. Instead, was %s.' % - type(cyclegan_model)) - return losses_impl.cycle_consistency_loss( - cyclegan_model.model_x2y.generator_inputs, cyclegan_model.reconstructed_x, - cyclegan_model.model_y2x.generator_inputs, cyclegan_model.reconstructed_y, - scope, add_summaries) - - -def stargan_generator_loss_wrapper(loss_fn): - """Convert a generator loss function to take a StarGANModel. - - The new function has the same name as the original one. - - Args: - loss_fn: A python function taking Discriminator's real/fake prediction for - generated data. - - Returns: - A new function that takes a StarGANModel namedtuple and returns the same - loss. - """ - - def new_loss_fn(stargan_model, **kwargs): - return loss_fn( - stargan_model.discriminator_generated_data_source_predication, **kwargs) - - new_docstring = """The stargan_model version of %s.""" % loss_fn.__name__ - new_loss_fn.__docstring__ = new_docstring - new_loss_fn.__name__ = loss_fn.__name__ - new_loss_fn.__module__ = loss_fn.__module__ - return new_loss_fn - - -def stargan_discriminator_loss_wrapper(loss_fn): - """Convert a discriminator loss function to take a StarGANModel. - - The new function has the same name as the original one. - - Args: - loss_fn: A python function taking Discriminator's real/fake prediction for - real data and generated data. - - Returns: - A new function that takes a StarGANModel namedtuple and returns the same - loss. - """ - - def new_loss_fn(stargan_model, **kwargs): - return loss_fn( - stargan_model.discriminator_input_data_source_predication, - stargan_model.discriminator_generated_data_source_predication, **kwargs) - - new_docstring = """The stargan_model version of %s.""" % loss_fn.__name__ - new_loss_fn.__docstring__ = new_docstring - new_loss_fn.__name__ = loss_fn.__name__ - new_loss_fn.__module__ = loss_fn.__module__ - return new_loss_fn - - -def stargan_gradient_penalty_wrapper(loss_fn): - """Convert a gradient penalty function to take a StarGANModel. 
- - The new function has the same name as the original one. - - Args: - loss_fn: A python function taking real_data, generated_data, - generator_inputs for Discriminator's condition (i.e. number of domains), - discriminator_fn, and discriminator_scope. - - Returns: - A new function that takes a StarGANModel namedtuple and returns the same - loss. - """ - - def new_loss_fn(stargan_model, **kwargs): - num_domains = stargan_model.input_data_domain_label.shape.as_list()[-1] - return loss_fn( - real_data=stargan_model.input_data, - generated_data=stargan_model.generated_data, - generator_inputs=num_domains, - discriminator_fn=stargan_model.discriminator_fn, - discriminator_scope=stargan_model.discriminator_scope, - **kwargs) - - new_docstring = """The stargan_model version of %s.""" % loss_fn.__name__ - new_loss_fn.__docstring__ = new_docstring - new_loss_fn.__name__ = loss_fn.__name__ - new_loss_fn.__module__ = loss_fn.__module__ - return new_loss_fn diff --git a/tensorflow/contrib/gan/python/losses/python/tuple_losses_test.py b/tensorflow/contrib/gan/python/losses/python/tuple_losses_test.py deleted file mode 100644 index 25d74a8c23d..00000000000 --- a/tensorflow/contrib/gan/python/losses/python/tuple_losses_test.py +++ /dev/null @@ -1,299 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Tests for contrib.gan.python.losses.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import collections - -import numpy as np - -from tensorflow.contrib import layers -from tensorflow.contrib.gan.python import namedtuples -from tensorflow.contrib.gan.python.losses.python import losses_impl as tfgan_losses_impl -from tensorflow.contrib.gan.python.losses.python import tuple_losses_impl as tfgan_losses -from tensorflow.python.framework import constant_op -from tensorflow.python.framework import dtypes -from tensorflow.python.ops import array_ops -from tensorflow.python.ops import math_ops -from tensorflow.python.ops import variable_scope -from tensorflow.python.ops import variables -from tensorflow.python.platform import test - - -class ArgsToGanModelTest(test.TestCase): - - def test_args_to_gan_model(self): - """Test `_args_to_gan_model`.""" - tuple_type = collections.namedtuple('fake_type', ['arg1', 'arg3']) - - def args_loss(arg1, arg2, arg3=3, arg4=4): - return arg1 + arg2 + arg3 + arg4 - - gan_model_loss = tfgan_losses._args_to_gan_model(args_loss) - - # Value is correct. - self.assertEqual(1 + 2 + 5 + 6, - gan_model_loss(tuple_type(1, 2), arg2=5, arg4=6)) - - # Uses tuple argument with defaults. - self.assertEqual(1 + 5 + 3 + 7, - gan_model_loss(tuple_type(1, None), arg2=5, arg4=7)) - - # Uses non-tuple argument with defaults. - self.assertEqual(1 + 5 + 2 + 4, - gan_model_loss(tuple_type(1, 2), arg2=5)) - - # Requires non-tuple, non-default arguments. 
- with self.assertRaisesRegexp(ValueError, '`arg2` must be supplied'): - gan_model_loss(tuple_type(1, 2)) - - # Can't pass tuple argument outside tuple. - with self.assertRaisesRegexp( - ValueError, 'present in both the tuple and keyword args'): - gan_model_loss(tuple_type(1, 2), arg2=1, arg3=5) - - def test_args_to_gan_model_name(self): - """Test that `_args_to_gan_model` produces correctly named functions.""" - def loss_fn(x): - return x - new_loss_fn = tfgan_losses._args_to_gan_model(loss_fn) - self.assertEqual('loss_fn', new_loss_fn.__name__) - self.assertTrue('The gan_model version of' in new_loss_fn.__docstring__) - - def test_tuple_respects_optional_args(self): - """Test that optional args can be changed with tuple losses.""" - tuple_type = collections.namedtuple('fake_type', ['arg1', 'arg2']) - def args_loss(arg1, arg2, arg3=3): - return arg1 + 2 * arg2 + 3 * arg3 - - loss_fn = tfgan_losses._args_to_gan_model(args_loss) - loss = loss_fn(tuple_type(arg1=-1, arg2=2), arg3=4) - - # If `arg3` were not set properly, this value would be different. - self.assertEqual(-1 + 2 * 2 + 3 * 4, loss) - - def test_works_with_child_classes(self): - """`args_to_gan_model` should work with classes derived from namedtuple.""" - tuple_type = collections.namedtuple('fake_type', ['arg1', 'arg2']) - - class InheritedType(tuple_type): - pass - def args_loss(arg1, arg2, arg3=3): - return arg1 + 2 * arg2 + 3 * arg3 - - loss_fn = tfgan_losses._args_to_gan_model(args_loss) - loss = loss_fn(InheritedType(arg1=-1, arg2=2), arg3=4) - - # If `arg3` were not set properly, this value would be different. - self.assertEqual(-1 + 2 * 2 + 3 * 4, loss) - - -class ConsistentLossesTest(test.TestCase): - - pass - - -def _tuple_from_dict(args_dict): - return collections.namedtuple('Tuple', args_dict.keys())(**args_dict) - - -def add_loss_consistency_test(test_class, loss_name_str, loss_args): - tuple_loss = getattr(tfgan_losses, loss_name_str) - arg_loss = getattr(tfgan_losses.losses_impl, loss_name_str) - - def consistency_test(self): - self.assertEqual(arg_loss.__name__, tuple_loss.__name__) - with self.cached_session(): - self.assertEqual(arg_loss(**loss_args).eval(), - tuple_loss(_tuple_from_dict(loss_args)).eval()) - - test_name = 'test_loss_consistency_%s' % loss_name_str - setattr(test_class, test_name, consistency_test) - - -# A list of consistency tests which need to be manually written. 
-manual_tests = [ - 'acgan_discriminator_loss', - 'acgan_generator_loss', - 'combine_adversarial_loss', - 'mutual_information_penalty', - 'wasserstein_gradient_penalty', - 'cycle_consistency_loss', - 'stargan_generator_loss_wrapper', - 'stargan_discriminator_loss_wrapper', - 'stargan_gradient_penalty_wrapper' -] - -discriminator_keyword_args = { - 'discriminator_real_outputs': np.array([[3.4, 2.3, -2.3], - [6.3, -2.1, 0.2]]), - 'discriminator_gen_outputs': np.array([[6.2, -1.5, 2.3], - [-2.9, -5.1, 0.1]]), -} -generator_keyword_args = { - 'discriminator_gen_outputs': np.array([[6.2, -1.5, 2.3], - [-2.9, -5.1, 0.1]]), -} - - -class CycleConsistencyLossTest(test.TestCase): - - def setUp(self): - super(CycleConsistencyLossTest, self).setUp() - - def _partial_model(generator_inputs_np): - model = namedtuples.GANModel(*[None] * 11) - return model._replace( - generator_inputs=constant_op.constant( - generator_inputs_np, dtype=dtypes.float32)) - - self._model_x2y = _partial_model([1, 2]) - self._model_y2x = _partial_model([5, 6]) - - def test_model_type(self): - """Test the input model type for `cycle_consistency_loss`.""" - with self.assertRaises(ValueError): - tfgan_losses.cycle_consistency_loss(self._model_x2y) - - def test_correct_loss(self): - """Test the output of `cycle_consistency_loss`.""" - loss = tfgan_losses.cycle_consistency_loss( - namedtuples.CycleGANModel( - model_x2y=self._model_x2y, - model_y2x=self._model_y2x, - reconstructed_x=constant_op.constant([9, 8], dtype=dtypes.float32), - reconstructed_y=constant_op.constant([7, 2], dtype=dtypes.float32))) - with self.test_session(use_gpu=True): - variables.global_variables_initializer().run() - self.assertNear(5.0, loss.eval(), 1e-5) - - -class StarGANLossWrapperTest(test.TestCase): - - def setUp(self): - - super(StarGANLossWrapperTest, self).setUp() - - self.input_data = array_ops.ones([1, 2, 2, 3]) - self.input_data_domain_label = constant_op.constant([[0, 1]]) - self.generated_data = array_ops.ones([1, 2, 2, 3]) - self.discriminator_input_data_source_predication = array_ops.ones([1]) - self.discriminator_generated_data_source_predication = array_ops.ones([1]) - - def _discriminator_fn(inputs, num_domains): - """Differentiable dummy discriminator for StarGAN.""" - hidden = layers.flatten(inputs) - output_src = math_ops.reduce_mean(hidden, axis=1) - output_cls = layers.fully_connected( - inputs=hidden, - num_outputs=num_domains, - activation_fn=None, - normalizer_fn=None, - biases_initializer=None) - return output_src, output_cls - - with variable_scope.variable_scope('discriminator') as dis_scope: - pass - - self.model = namedtuples.StarGANModel( - input_data=self.input_data, - input_data_domain_label=self.input_data_domain_label, - generated_data=self.generated_data, - generated_data_domain_target=None, - reconstructed_data=None, - discriminator_input_data_source_predication=self. - discriminator_input_data_source_predication, - discriminator_generated_data_source_predication=self. 
- discriminator_generated_data_source_predication, - discriminator_input_data_domain_predication=None, - discriminator_generated_data_domain_predication=None, - generator_variables=None, - generator_scope=None, - generator_fn=None, - discriminator_variables=None, - discriminator_scope=dis_scope, - discriminator_fn=_discriminator_fn) - - self.discriminator_fn = _discriminator_fn - self.discriminator_scope = dis_scope - - def test_stargan_generator_loss_wrapper(self): - """Test StarGAN generator loss wrapper.""" - loss_fn = tfgan_losses_impl.wasserstein_generator_loss - wrapped_loss_fn = tfgan_losses.stargan_generator_loss_wrapper(loss_fn) - - loss_result_tensor = loss_fn( - self.discriminator_generated_data_source_predication) - wrapped_loss_result_tensor = wrapped_loss_fn(self.model) - - with self.cached_session() as sess: - sess.run(variables.global_variables_initializer()) - loss_result, wrapped_loss_result = sess.run( - [loss_result_tensor, wrapped_loss_result_tensor]) - self.assertAlmostEqual(loss_result, wrapped_loss_result) - - def test_stargan_discriminator_loss_wrapper(self): - """Test StarGAN discriminator loss wrapper.""" - loss_fn = tfgan_losses_impl.wasserstein_discriminator_loss - wrapped_loss_fn = tfgan_losses.stargan_discriminator_loss_wrapper(loss_fn) - - loss_result_tensor = loss_fn( - self.discriminator_generated_data_source_predication, - self.discriminator_generated_data_source_predication) - wrapped_loss_result_tensor = wrapped_loss_fn(self.model) - - with self.cached_session() as sess: - sess.run(variables.global_variables_initializer()) - loss_result, wrapped_loss_result = sess.run( - [loss_result_tensor, wrapped_loss_result_tensor]) - self.assertAlmostEqual(loss_result, wrapped_loss_result) - - def test_stargan_gradient_penalty_wrapper(self): - """Test StaGAN gradient penalty wrapper. - - Notes: - The random interpolates are handled by given setting the reconstruction to - be the same as the input. - - """ - loss_fn = tfgan_losses_impl.wasserstein_gradient_penalty - wrapped_loss_fn = tfgan_losses.stargan_gradient_penalty_wrapper(loss_fn) - - loss_result_tensor = loss_fn( - real_data=self.input_data, - generated_data=self.generated_data, - generator_inputs=self.input_data_domain_label.shape.as_list()[-1], - discriminator_fn=self.discriminator_fn, - discriminator_scope=self.discriminator_scope) - wrapped_loss_result_tensor = wrapped_loss_fn(self.model) - - with self.cached_session() as sess: - sess.run(variables.global_variables_initializer()) - loss_result, wrapped_loss_result = sess.run( - [loss_result_tensor, wrapped_loss_result_tensor]) - self.assertAlmostEqual(loss_result, wrapped_loss_result) - - -if __name__ == '__main__': - for loss_name in tfgan_losses.__all__: - if loss_name in manual_tests: continue - keyword_args = (generator_keyword_args if 'generator' in loss_name else - discriminator_keyword_args) - add_loss_consistency_test(ConsistentLossesTest, loss_name, keyword_args) - - test.main() diff --git a/tensorflow/contrib/gan/python/namedtuples.py b/tensorflow/contrib/gan/python/namedtuples.py deleted file mode 100644 index 73dfee4fdee..00000000000 --- a/tensorflow/contrib/gan/python/namedtuples.py +++ /dev/null @@ -1,248 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Named tuples for TF-GAN. - -TF-GAN training occurs in four steps, and each step communicates with the next -step via one of these named tuples. At each step, you can either use a TF-GAN -helper function in `train.py`, or you can manually construct a tuple. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import collections - -__all__ = [ - 'GANModel', - 'InfoGANModel', - 'ACGANModel', - 'CycleGANModel', - 'StarGANModel', - 'GANLoss', - 'CycleGANLoss', - 'GANTrainOps', - 'GANTrainSteps', -] - - -class GANModel( - collections.namedtuple('GANModel', ( - 'generator_inputs', - 'generated_data', - 'generator_variables', - 'generator_scope', - 'generator_fn', - 'real_data', - 'discriminator_real_outputs', - 'discriminator_gen_outputs', - 'discriminator_variables', - 'discriminator_scope', - 'discriminator_fn', - ))): - """A GANModel contains all the pieces needed for GAN training. - - Generative Adversarial Networks (https://arxiv.org/abs/1406.2661) attempt - to create an implicit generative model of data by solving a two agent game. - The generator generates candidate examples that are supposed to match the - data distribution, and the discriminator aims to tell the real examples - apart from the generated samples. - - Args: - generator_inputs: The random noise source that acts as input to the - generator. - generated_data: The generated output data of the GAN. - generator_variables: A list of all generator variables. - generator_scope: Variable scope all generator variables live in. - generator_fn: The generator function. - real_data: A tensor or real data. - discriminator_real_outputs: The discriminator's output on real data. - discriminator_gen_outputs: The discriminator's output on generated data. - discriminator_variables: A list of all discriminator variables. - discriminator_scope: Variable scope all discriminator variables live in. - discriminator_fn: The discriminator function. - """ - - -# TODO(joelshor): Have this class inherit from `GANModel`. -class InfoGANModel( - collections.namedtuple('InfoGANModel', GANModel._fields + ( - 'structured_generator_inputs', - 'predicted_distributions', - 'discriminator_and_aux_fn', - ))): - """An InfoGANModel contains all the pieces needed for InfoGAN training. - - See https://arxiv.org/abs/1606.03657 for more details. - - Args: - structured_generator_inputs: A list of Tensors representing the random noise - that must have high mutual information with the generator output. List - length should match `predicted_distributions`. - predicted_distributions: A list of `tfp.distributions.Distribution`s. - Predicted by the recognizer, and used to evaluate the likelihood of the - structured noise. List length should match `structured_generator_inputs`. - discriminator_and_aux_fn: The original discriminator function that returns - a tuple of (logits, `predicted_distributions`). 
- """ - - -class ACGANModel( - collections.namedtuple('ACGANModel', GANModel._fields + - ('one_hot_labels', - 'discriminator_real_classification_logits', - 'discriminator_gen_classification_logits',))): - """An ACGANModel contains all the pieces needed for ACGAN training. - - See https://arxiv.org/abs/1610.09585 for more details. - - Args: - one_hot_labels: A Tensor holding one-hot-labels for the batch. - discriminator_real_classification_logits: Classification logits for real - data. - discriminator_gen_classification_logits: Classification logits for generated - data. - """ - - -class CycleGANModel( - collections.namedtuple( - 'CycleGANModel', - ('model_x2y', 'model_y2x', 'reconstructed_x', 'reconstructed_y'))): - """An CycleGANModel contains all the pieces needed for CycleGAN training. - - The model `model_x2y` generator F maps data set X to Y, while the model - `model_y2x` generator G maps data set Y to X. - - See https://arxiv.org/abs/1703.10593 for more details. - - Args: - model_x2y: A `GANModel` namedtuple whose generator maps data set X to Y. - model_y2x: A `GANModel` namedtuple whose generator maps data set Y to X. - reconstructed_x: A `Tensor` of reconstructed data X which is G(F(X)). - reconstructed_y: A `Tensor` of reconstructed data Y which is F(G(Y)). - """ - - -class StarGANModel( - collections.namedtuple('StarGANModel', ( - 'input_data', - 'input_data_domain_label', - 'generated_data', - 'generated_data_domain_target', - 'reconstructed_data', - 'discriminator_input_data_source_predication', - 'discriminator_generated_data_source_predication', - 'discriminator_input_data_domain_predication', - 'discriminator_generated_data_domain_predication', - 'generator_variables', - 'generator_scope', - 'generator_fn', - 'discriminator_variables', - 'discriminator_scope', - 'discriminator_fn', - ))): - """A StarGANModel contains all the pieces needed for StarGAN training. - - Args: - input_data: The real images that need to be transferred by the generator. - input_data_domain_label: The real domain labels associated with the real - images. - generated_data: The generated images produced by the generator. It has the - same shape as the input_data. - generated_data_domain_target: The target domain that the generated images - belong to. It has the same shape as the input_data_domain_label. - reconstructed_data: The reconstructed images produced by the G(enerator). - reconstructed_data = G(G(input_data, generated_data_domain_target), - input_data_domain_label). - discriminator_input_data_source: The discriminator's output for predicting - the source (real/generated) of input_data. - discriminator_generated_data_source: The discriminator's output for - predicting the source (real/generated) of generated_data. - discriminator_input_data_domain_predication: The discriminator's output for - predicting the domain_label for the input_data. - discriminator_generated_data_domain_predication: The discriminatorr's output - for predicting the domain_target for the generated_data. - generator_variables: A list of all generator variables. - generator_scope: Variable scope all generator variables live in. - generator_fn: The generator function. - discriminator_variables: A list of all discriminator variables. - discriminator_scope: Variable scope all discriminator variables live in. - discriminator_fn: The discriminator function. - """ - - -class GANLoss( - collections.namedtuple('GANLoss', ( - 'generator_loss', - 'discriminator_loss' - ))): - """GANLoss contains the generator and discriminator losses. 
- - Args: - generator_loss: A tensor for the generator loss. - discriminator_loss: A tensor for the discriminator loss. - """ - - -class CycleGANLoss( - collections.namedtuple('CycleGANLoss', ('loss_x2y', 'loss_y2x'))): - """CycleGANLoss contains the losses for `CycleGANModel`. - - See https://arxiv.org/abs/1703.10593 for more details. - - Args: - loss_x2y: A `GANLoss` namedtuple representing the loss of `model_x2y`. - loss_y2x: A `GANLoss` namedtuple representing the loss of `model_y2x`. - """ - - -class GANTrainOps( - collections.namedtuple('GANTrainOps', ( - 'generator_train_op', - 'discriminator_train_op', - 'global_step_inc_op', - 'train_hooks' - ))): - """GANTrainOps contains the training ops. - - Args: - generator_train_op: Op that performs a generator update step. - discriminator_train_op: Op that performs a discriminator update step. - global_step_inc_op: Op that increments the shared global step. - train_hooks: a list or tuple containing hooks related to training that need - to be populated when training ops are instantiated. Used primarily for - sync hooks. - """ - - def __new__(cls, generator_train_op, discriminator_train_op, - global_step_inc_op, train_hooks=()): - return super(GANTrainOps, cls).__new__(cls, generator_train_op, - discriminator_train_op, - global_step_inc_op, train_hooks) - - -class GANTrainSteps( - collections.namedtuple('GANTrainSteps', ( - 'generator_train_steps', - 'discriminator_train_steps' - ))): - """Contains configuration for the GAN Training. - - Args: - generator_train_steps: Number of generator steps to take in each GAN step. - discriminator_train_steps: Number of discriminator steps to take in each GAN - step. - """ diff --git a/tensorflow/contrib/gan/python/train.py b/tensorflow/contrib/gan/python/train.py deleted file mode 100644 index 422e16f0bfe..00000000000 --- a/tensorflow/contrib/gan/python/train.py +++ /dev/null @@ -1,1318 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""The TF-GAN project provides a lightweight GAN training/testing framework. - -This file contains the core helper functions to create and train a GAN model. -See the README or examples in `tensorflow_models` for details on how to use. - -TF-GAN training occurs in four steps: -1) Create a model -2) Add a loss -3) Create train ops -4) Run the train ops - -The functions in this file are organized around these four steps. Each function -corresponds to one of the steps. 
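For readers landing on this removal, here is a minimal reference sketch (not part of this patch) of the four-step workflow the module docstring above describes. It assumes TF 1.x with `tf.contrib.gan` still available; the one-layer networks, tensor shapes, and the `/tmp/tfgan_sketch` logdir are illustrative placeholders, not values taken from this code.

```python
# Illustrative sketch of the removed tf.contrib.gan (TF-GAN) workflow.
# Assumes TF 1.x; networks, shapes, and logdir below are placeholders.
import tensorflow as tf

tfgan = tf.contrib.gan


def generator_fn(noise):
  # Toy generator: one dense layer producing flattened 28x28 "images".
  return tf.layers.dense(noise, 28 * 28, activation=tf.tanh)


def discriminator_fn(data, unused_conditioning):
  # Toy discriminator: one unbounded logit per example.
  return tf.layers.dense(tf.layers.flatten(data), 1)


noise = tf.random_normal([32, 64])
real_images = tf.random_normal([32, 28 * 28])  # stand-in for a real input pipeline

# 1) Create a model.
model = tfgan.gan_model(generator_fn, discriminator_fn,
                        real_data=real_images, generator_inputs=noise)

# 2) Add a loss (Wasserstein losses by default, plus a gradient penalty).
loss = tfgan.gan_loss(model, gradient_penalty_weight=10.0)

# 3) Create train ops.
train_ops = tfgan.gan_train_ops(
    model, loss,
    generator_optimizer=tf.train.AdamOptimizer(1e-4, 0.5),
    discriminator_optimizer=tf.train.AdamOptimizer(1e-4, 0.5))

# 4) Run the train ops.
tfgan.gan_train(train_ops, logdir='/tmp/tfgan_sketch',
                hooks=[tf.train.StopAtStepHook(num_steps=1000)])
```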
-""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from tensorflow.contrib.framework.python.ops import variables as variables_lib -from tensorflow.contrib.gan.python import losses as tfgan_losses -from tensorflow.contrib.gan.python import namedtuples -from tensorflow.contrib.gan.python.losses.python import losses_impl as tfgan_losses_impl -from tensorflow.contrib.slim.python.slim import learning as slim_learning -from tensorflow.contrib.training.python.training import training -from tensorflow.python.framework import dtypes -from tensorflow.python.framework import ops -from tensorflow.python.ops import array_ops -from tensorflow.python.ops import check_ops -from tensorflow.python.ops import init_ops -from tensorflow.python.ops import math_ops -from tensorflow.python.ops import random_ops -from tensorflow.python.ops import variable_scope -from tensorflow.python.ops.losses import losses -from tensorflow.python.summary import summary -from tensorflow.python.training import session_run_hook -from tensorflow.python.training import sync_replicas_optimizer -from tensorflow.python.training import training_util - -__all__ = [ - 'gan_model', - 'infogan_model', - 'acgan_model', - 'cyclegan_model', - 'stargan_model', - 'gan_loss', - 'cyclegan_loss', - 'stargan_loss', - 'gan_train_ops', - 'gan_train', - 'get_sequential_train_hooks', - 'get_joint_train_hooks', - 'get_sequential_train_steps', - 'RunTrainOpsHook', -] - - -def gan_model( - # Lambdas defining models. - generator_fn, - discriminator_fn, - # Real data and conditioning. - real_data, - generator_inputs, - # Optional scopes. - generator_scope='Generator', - discriminator_scope='Discriminator', - # Options. - check_shapes=True): - """Returns GAN model outputs and variables. - - Args: - generator_fn: A python lambda that takes `generator_inputs` as inputs and - returns the outputs of the GAN generator. - discriminator_fn: A python lambda that takes `real_data`/`generated data` - and `generator_inputs`. Outputs a Tensor in the range [-inf, inf]. - real_data: A Tensor representing the real data. - generator_inputs: A Tensor or list of Tensors to the generator. In the - vanilla GAN case, this might be a single noise Tensor. In the conditional - GAN case, this might be the generator's conditioning. - generator_scope: Optional generator variable scope. Useful if you want to - reuse a subgraph that has already been created. - discriminator_scope: Optional discriminator variable scope. Useful if you - want to reuse a subgraph that has already been created. - check_shapes: If `True`, check that generator produces Tensors that are the - same shape as real data. Otherwise, skip this check. - - Returns: - A GANModel namedtuple. - - Raises: - ValueError: If the generator outputs a Tensor that isn't the same shape as - `real_data`. 
- """ - # Create models - with variable_scope.variable_scope(generator_scope) as gen_scope: - generator_inputs = _convert_tensor_or_l_or_d(generator_inputs) - generated_data = generator_fn(generator_inputs) - with variable_scope.variable_scope(discriminator_scope) as dis_scope: - discriminator_gen_outputs = discriminator_fn(generated_data, - generator_inputs) - with variable_scope.variable_scope(dis_scope, reuse=True): - real_data = _convert_tensor_or_l_or_d(real_data) - discriminator_real_outputs = discriminator_fn(real_data, generator_inputs) - - if check_shapes: - if not generated_data.shape.is_compatible_with(real_data.shape): - raise ValueError( - 'Generator output shape (%s) must be the same shape as real data ' - '(%s).' % (generated_data.shape, real_data.shape)) - - # Get model-specific variables. - generator_variables = variables_lib.get_trainable_variables(gen_scope) - discriminator_variables = variables_lib.get_trainable_variables(dis_scope) - - return namedtuples.GANModel(generator_inputs, generated_data, - generator_variables, gen_scope, generator_fn, - real_data, discriminator_real_outputs, - discriminator_gen_outputs, - discriminator_variables, dis_scope, - discriminator_fn) - - -def infogan_model( - # Lambdas defining models. - generator_fn, - discriminator_fn, - # Real data and conditioning. - real_data, - unstructured_generator_inputs, - structured_generator_inputs, - # Optional scopes. - generator_scope='Generator', - discriminator_scope='Discriminator'): - """Returns an InfoGAN model outputs and variables. - - See https://arxiv.org/abs/1606.03657 for more details. - - Args: - generator_fn: A python lambda that takes a list of Tensors as inputs and - returns the outputs of the GAN generator. - discriminator_fn: A python lambda that takes `real_data`/`generated data` - and `generator_inputs`. Outputs a 2-tuple of (logits, distribution_list). - `logits` are in the range [-inf, inf], and `distribution_list` is a list - of Tensorflow distributions representing the predicted noise distribution - of the ith structure noise. - real_data: A Tensor representing the real data. - unstructured_generator_inputs: A list of Tensors to the generator. These - tensors represent the unstructured noise or conditioning. - structured_generator_inputs: A list of Tensors to the generator. These - tensors must have high mutual information with the recognizer. - generator_scope: Optional generator variable scope. Useful if you want to - reuse a subgraph that has already been created. - discriminator_scope: Optional discriminator variable scope. Useful if you - want to reuse a subgraph that has already been created. - - Returns: - An InfoGANModel namedtuple. - - Raises: - ValueError: If the generator outputs a Tensor that isn't the same shape as - `real_data`. - ValueError: If the discriminator output is malformed. 
- """ - # Create models - with variable_scope.variable_scope(generator_scope) as gen_scope: - unstructured_generator_inputs = _convert_tensor_or_l_or_d( - unstructured_generator_inputs) - structured_generator_inputs = _convert_tensor_or_l_or_d( - structured_generator_inputs) - generator_inputs = ( - unstructured_generator_inputs + structured_generator_inputs) - generated_data = generator_fn(generator_inputs) - with variable_scope.variable_scope(discriminator_scope) as disc_scope: - dis_gen_outputs, predicted_distributions = discriminator_fn( - generated_data, generator_inputs) - _validate_distributions(predicted_distributions, structured_generator_inputs) - with variable_scope.variable_scope(disc_scope, reuse=True): - real_data = ops.convert_to_tensor(real_data) - dis_real_outputs, _ = discriminator_fn(real_data, generator_inputs) - - if not generated_data.get_shape().is_compatible_with(real_data.get_shape()): - raise ValueError( - 'Generator output shape (%s) must be the same shape as real data ' - '(%s).' % (generated_data.get_shape(), real_data.get_shape())) - - # Get model-specific variables. - generator_variables = variables_lib.get_trainable_variables(gen_scope) - discriminator_variables = variables_lib.get_trainable_variables(disc_scope) - - return namedtuples.InfoGANModel( - generator_inputs, - generated_data, - generator_variables, - gen_scope, - generator_fn, - real_data, - dis_real_outputs, - dis_gen_outputs, - discriminator_variables, - disc_scope, - lambda x, y: discriminator_fn(x, y)[0], # conform to non-InfoGAN API - structured_generator_inputs, - predicted_distributions, - discriminator_fn) - - -def acgan_model( - # Lambdas defining models. - generator_fn, - discriminator_fn, - # Real data and conditioning. - real_data, - generator_inputs, - one_hot_labels, - # Optional scopes. - generator_scope='Generator', - discriminator_scope='Discriminator', - # Options. - check_shapes=True): - """Returns an ACGANModel contains all the pieces needed for ACGAN training. - - The `acgan_model` is the same as the `gan_model` with the only difference - being that the discriminator additionally outputs logits to classify the input - (real or generated). - Therefore, an explicit field holding one_hot_labels is necessary, as well as a - discriminator_fn that outputs a 2-tuple holding the logits for real/fake and - classification. - - See https://arxiv.org/abs/1610.09585 for more details. - - Args: - generator_fn: A python lambda that takes `generator_inputs` as inputs and - returns the outputs of the GAN generator. - discriminator_fn: A python lambda that takes `real_data`/`generated data` - and `generator_inputs`. Outputs a tuple consisting of two Tensors: (1) - real/fake logits in the range [-inf, inf] (2) classification logits in - the range [-inf, inf] - real_data: A Tensor representing the real data. - generator_inputs: A Tensor or list of Tensors to the generator. In the - vanilla GAN case, this might be a single noise Tensor. In the conditional - GAN case, this might be the generator's conditioning. - one_hot_labels: A Tensor holding one-hot-labels for the batch. Needed by - acgan_loss. - generator_scope: Optional generator variable scope. Useful if you want to - reuse a subgraph that has already been created. - discriminator_scope: Optional discriminator variable scope. Useful if you - want to reuse a subgraph that has already been created. - check_shapes: If `True`, check that generator produces Tensors that are the - same shape as real data. Otherwise, skip this check. 
- - Returns: - A ACGANModel namedtuple. - - Raises: - ValueError: If the generator outputs a Tensor that isn't the same shape as - `real_data`. - TypeError: If the discriminator does not output a tuple consisting of - (discrimination logits, classification logits). - """ - # Create models - with variable_scope.variable_scope(generator_scope) as gen_scope: - generator_inputs = _convert_tensor_or_l_or_d(generator_inputs) - generated_data = generator_fn(generator_inputs) - with variable_scope.variable_scope(discriminator_scope) as dis_scope: - with ops.name_scope(dis_scope.name + '/generated/'): - (discriminator_gen_outputs, discriminator_gen_classification_logits - ) = _validate_acgan_discriminator_outputs( - discriminator_fn(generated_data, generator_inputs)) - with variable_scope.variable_scope(dis_scope, reuse=True): - with ops.name_scope(dis_scope.name + '/real/'): - real_data = ops.convert_to_tensor(real_data) - (discriminator_real_outputs, discriminator_real_classification_logits - ) = _validate_acgan_discriminator_outputs( - discriminator_fn(real_data, generator_inputs)) - if check_shapes: - if not generated_data.shape.is_compatible_with(real_data.shape): - raise ValueError( - 'Generator output shape (%s) must be the same shape as real data ' - '(%s).' % (generated_data.shape, real_data.shape)) - - # Get model-specific variables. - generator_variables = variables_lib.get_trainable_variables(gen_scope) - discriminator_variables = variables_lib.get_trainable_variables(dis_scope) - - return namedtuples.ACGANModel(generator_inputs, generated_data, - generator_variables, gen_scope, generator_fn, - real_data, discriminator_real_outputs, - discriminator_gen_outputs, - discriminator_variables, dis_scope, - discriminator_fn, one_hot_labels, - discriminator_real_classification_logits, - discriminator_gen_classification_logits) - - -def cyclegan_model( - # Lambdas defining models. - generator_fn, - discriminator_fn, - # data X and Y. - data_x, - data_y, - # Optional scopes. - generator_scope='Generator', - discriminator_scope='Discriminator', - model_x2y_scope='ModelX2Y', - model_y2x_scope='ModelY2X', - # Options. - check_shapes=True): - """Returns a CycleGAN model outputs and variables. - - See https://arxiv.org/abs/1703.10593 for more details. - - Args: - generator_fn: A python lambda that takes `data_x` or `data_y` as inputs and - returns the outputs of the GAN generator. - discriminator_fn: A python lambda that takes `real_data`/`generated data` - and `generator_inputs`. Outputs a Tensor in the range [-inf, inf]. - data_x: A `Tensor` of dataset X. Must be the same shape as `data_y`. - data_y: A `Tensor` of dataset Y. Must be the same shape as `data_x`. - generator_scope: Optional generator variable scope. Useful if you want to - reuse a subgraph that has already been created. Defaults to 'Generator'. - discriminator_scope: Optional discriminator variable scope. Useful if you - want to reuse a subgraph that has already been created. Defaults to - 'Discriminator'. - model_x2y_scope: Optional variable scope for model x2y variables. Defaults - to 'ModelX2Y'. - model_y2x_scope: Optional variable scope for model y2x variables. Defaults - to 'ModelY2X'. - check_shapes: If `True`, check that generator produces Tensors that are the - same shape as `data_x` (`data_y`). Otherwise, skip this check. - - Returns: - A `CycleGANModel` namedtuple. - - Raises: - ValueError: If `check_shapes` is True and `data_x` or the generator output - does not have the same shape as `data_y`. - """ - - # Create models. 
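As a hedged usage sketch of the `cyclegan_model` / `cyclegan_loss` pair documented above (not part of this patch): the single-layer networks and random tensors below are placeholder assumptions standing in for real image pipelines and architectures.

```python
# Illustrative CycleGAN construction with the removed tf.contrib.gan API.
import tensorflow as tf

tfgan = tf.contrib.gan


def cyclegan_generator_fn(images):
  # Toy image-to-image generator that preserves the input shape.
  return tf.layers.conv2d(images, 3, 3, padding='same', activation=tf.tanh)


def cyclegan_discriminator_fn(images, unused_conditioning):
  return tf.layers.dense(tf.layers.flatten(images), 1)


data_x = tf.random_normal([8, 32, 32, 3])  # stand-in for dataset X
data_y = tf.random_normal([8, 32, 32, 3])  # stand-in for dataset Y

cyc_model = tfgan.cyclegan_model(
    cyclegan_generator_fn, cyclegan_discriminator_fn, data_x, data_y)

# Least-squares adversarial losses by default, plus the weighted cycle
# consistency term added to each partial generator loss.
cyc_loss = tfgan.cyclegan_loss(cyc_model, cycle_consistency_loss_weight=10.0)
```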
- def _define_partial_model(input_data, output_data): - return gan_model( - generator_fn=generator_fn, - discriminator_fn=discriminator_fn, - real_data=output_data, - generator_inputs=input_data, - generator_scope=generator_scope, - discriminator_scope=discriminator_scope, - check_shapes=check_shapes) - - with variable_scope.variable_scope(model_x2y_scope): - model_x2y = _define_partial_model(data_x, data_y) - with variable_scope.variable_scope(model_y2x_scope): - model_y2x = _define_partial_model(data_y, data_x) - - with variable_scope.variable_scope(model_y2x.generator_scope, reuse=True): - reconstructed_x = model_y2x.generator_fn(model_x2y.generated_data) - with variable_scope.variable_scope(model_x2y.generator_scope, reuse=True): - reconstructed_y = model_x2y.generator_fn(model_y2x.generated_data) - - return namedtuples.CycleGANModel(model_x2y, model_y2x, reconstructed_x, - reconstructed_y) - - -def stargan_model(generator_fn, - discriminator_fn, - input_data, - input_data_domain_label, - generator_scope='Generator', - discriminator_scope='Discriminator'): - """Returns a StarGAN model outputs and variables. - - See https://arxiv.org/abs/1711.09020 for more details. - - Args: - generator_fn: A python lambda that takes `inputs` and `targets` as inputs - and returns 'generated_data' as the transformed version of `input` based - on the `target`. `input` has shape (n, h, w, c), `targets` has shape (n, - num_domains), and `generated_data` has the same shape as `input`. - discriminator_fn: A python lambda that takes `inputs` and `num_domains` as - inputs and returns a tuple (`source_prediction`, `domain_prediction`). - `source_prediction` represents the source(real/generated) prediction by - the discriminator, and `domain_prediction` represents the domain - prediction/classification by the discriminator. `source_prediction` has - shape (n) and `domain_prediction` has shape (n, num_domains). - input_data: Tensor or a list of tensor of shape (n, h, w, c) representing - the real input images. - input_data_domain_label: Tensor or a list of tensor of shape (batch_size, - num_domains) representing the domain label associated with the real - images. - generator_scope: Optional generator variable scope. Useful if you want to - reuse a subgraph that has already been created. - discriminator_scope: Optional discriminator variable scope. Useful if you - want to reuse a subgraph that has already been created. - - Returns: - StarGANModel nametuple return the tensor that are needed to compute the - loss. - - Raises: - ValueError: If the shape of `input_data_domain_label` is not rank 2 or fully - defined in every dimensions. - """ - - # Convert to tensor. - input_data = _convert_tensor_or_l_or_d(input_data) - input_data_domain_label = _convert_tensor_or_l_or_d(input_data_domain_label) - - # Convert list of tensor to a single tensor if applicable. - if isinstance(input_data, (list, tuple)): - input_data = array_ops.concat( - [ops.convert_to_tensor(x) for x in input_data], 0) - if isinstance(input_data_domain_label, (list, tuple)): - input_data_domain_label = array_ops.concat( - [ops.convert_to_tensor(x) for x in input_data_domain_label], 0) - - # Get batch_size, num_domains from the labels. - input_data_domain_label.shape.assert_has_rank(2) - input_data_domain_label.shape.assert_is_fully_defined() - batch_size, num_domains = input_data_domain_label.shape.as_list() - - # Transform input_data to random target domains. 
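A hedged sketch of the `stargan_model` / `stargan_loss` contract described in the docstring above (not part of this patch): the tiny networks, 64x64 images, and 5 domains are illustrative assumptions; the only requirements taken from the source are the `(images, target_domain)` generator signature and the `(source_prediction, domain_prediction)` discriminator return value.

```python
# Illustrative StarGAN construction with the removed tf.contrib.gan API.
import tensorflow as tf

tfgan = tf.contrib.gan


def stargan_generator_fn(images, target_domain):
  # Tile the one-hot target domain over the spatial dimensions, concatenate
  # it to the image channels, and map back to a 3-channel image.
  n, h, w, _ = images.shape.as_list()
  codes = tf.tile(tf.reshape(target_domain, [n, 1, 1, -1]), [1, h, w, 1])
  net = tf.concat([images, codes], axis=3)
  return tf.layers.conv2d(net, 3, 3, padding='same', activation=tf.tanh)


def stargan_discriminator_fn(images, num_domains):
  # Returns (source logit of shape [n], domain logits of shape [n, num_domains]).
  flat = tf.layers.flatten(images)
  source_logit = tf.squeeze(tf.layers.dense(flat, 1), axis=1)
  domain_logits = tf.layers.dense(flat, num_domains)
  return source_logit, domain_logits


images = tf.random_normal([8, 64, 64, 3])   # stand-in for real images
labels = tf.one_hot(tf.random_uniform([8], maxval=5, dtype=tf.int32), 5)

star_model = tfgan.stargan_model(
    stargan_generator_fn, stargan_discriminator_fn, images, labels)
star_loss = tfgan.stargan_loss(star_model)
```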
- with variable_scope.variable_scope(generator_scope) as generator_scope: - generated_data_domain_target = _generate_stargan_random_domain_target( - batch_size, num_domains) - generated_data = generator_fn(input_data, generated_data_domain_target) - - # Transform generated_data back to the original input_data domain. - with variable_scope.variable_scope(generator_scope, reuse=True): - reconstructed_data = generator_fn(generated_data, input_data_domain_label) - - # Predict source and domain for the generated_data using the discriminator. - with variable_scope.variable_scope( - discriminator_scope) as discriminator_scope: - disc_gen_data_source_pred, disc_gen_data_domain_pred = discriminator_fn( - generated_data, num_domains) - - # Predict source and domain for the input_data using the discriminator. - with variable_scope.variable_scope(discriminator_scope, reuse=True): - disc_input_data_source_pred, disc_input_data_domain_pred = discriminator_fn( - input_data, num_domains) - - # Collect trainable variables from the neural networks. - generator_variables = variables_lib.get_trainable_variables(generator_scope) - discriminator_variables = variables_lib.get_trainable_variables( - discriminator_scope) - - # Create the StarGANModel namedtuple. - return namedtuples.StarGANModel( - input_data=input_data, - input_data_domain_label=input_data_domain_label, - generated_data=generated_data, - generated_data_domain_target=generated_data_domain_target, - reconstructed_data=reconstructed_data, - discriminator_input_data_source_predication=disc_input_data_source_pred, - discriminator_generated_data_source_predication=disc_gen_data_source_pred, - discriminator_input_data_domain_predication=disc_input_data_domain_pred, - discriminator_generated_data_domain_predication=disc_gen_data_domain_pred, - generator_variables=generator_variables, - generator_scope=generator_scope, - generator_fn=generator_fn, - discriminator_variables=discriminator_variables, - discriminator_scope=discriminator_scope, - discriminator_fn=discriminator_fn) - - -def _validate_aux_loss_weight(aux_loss_weight, name='aux_loss_weight'): - if isinstance(aux_loss_weight, ops.Tensor): - aux_loss_weight.shape.assert_is_compatible_with([]) - with ops.control_dependencies( - [check_ops.assert_greater_equal(aux_loss_weight, 0.0)]): - aux_loss_weight = array_ops.identity(aux_loss_weight) - elif aux_loss_weight is not None and aux_loss_weight < 0: - raise ValueError('`%s` must be greater than 0. Instead, was %s' % - (name, aux_loss_weight)) - return aux_loss_weight - - -def _use_aux_loss(aux_loss_weight): - if aux_loss_weight is not None: - if not isinstance(aux_loss_weight, ops.Tensor): - return aux_loss_weight > 0 - else: - return True - else: - return False - - -def _tensor_pool_adjusted_model(model, tensor_pool_fn): - """Adjusts model using `tensor_pool_fn`. - - Args: - model: A GANModel tuple. - tensor_pool_fn: A function that takes (generated_data, generator_inputs), - stores them in an internal pool and returns a previously stored - (generated_data, generator_inputs) with some probability. For example - tfgan.features.tensor_pool. - - Returns: - A new GANModel tuple where discriminator outputs are adjusted by taking - pooled generator outputs as inputs. Returns the original model if - `tensor_pool_fn` is None. - - Raises: - ValueError: If tensor pool does not support the `model`. 
- """ - if isinstance(model, namedtuples.GANModel): - pooled_generator_inputs, pooled_generated_data = tensor_pool_fn( - (model.generator_inputs, model.generated_data)) - with variable_scope.variable_scope(model.discriminator_scope, reuse=True): - dis_gen_outputs = model.discriminator_fn(pooled_generated_data, - pooled_generator_inputs) - return model._replace( - generator_inputs=pooled_generator_inputs, - generated_data=pooled_generated_data, - discriminator_gen_outputs=dis_gen_outputs) - elif isinstance(model, namedtuples.ACGANModel): - pooled_generator_inputs, pooled_generated_data = tensor_pool_fn( - (model.generator_inputs, model.generated_data)) - with variable_scope.variable_scope(model.discriminator_scope, reuse=True): - (pooled_discriminator_gen_outputs, - pooled_discriminator_gen_classification_logits) = model.discriminator_fn( - pooled_generated_data, pooled_generator_inputs) - return model._replace( - generator_inputs=pooled_generator_inputs, - generated_data=pooled_generated_data, - discriminator_gen_outputs=pooled_discriminator_gen_outputs, - discriminator_gen_classification_logits=pooled_discriminator_gen_classification_logits # pylint: disable=line-too-long - ) - elif isinstance(model, namedtuples.InfoGANModel): - pooled_generator_inputs, pooled_generated_data, pooled_structured_input = ( - tensor_pool_fn((model.generator_inputs, model.generated_data, - model.structured_generator_inputs))) - with variable_scope.variable_scope(model.discriminator_scope, reuse=True): - (pooled_discriminator_gen_outputs, - pooled_predicted_distributions) = model.discriminator_and_aux_fn( - pooled_generated_data, pooled_generator_inputs) - return model._replace( - generator_inputs=pooled_generator_inputs, - generated_data=pooled_generated_data, - structured_generator_inputs=pooled_structured_input, - discriminator_gen_outputs=pooled_discriminator_gen_outputs, - predicted_distributions=pooled_predicted_distributions) - else: - raise ValueError('Tensor pool does not support `model`: %s.' % type(model)) - - -def gan_loss( - # GANModel. - model, - # Loss functions. - generator_loss_fn=tfgan_losses.wasserstein_generator_loss, - discriminator_loss_fn=tfgan_losses.wasserstein_discriminator_loss, - # Auxiliary losses. - gradient_penalty_weight=None, - gradient_penalty_epsilon=1e-10, - gradient_penalty_target=1.0, - gradient_penalty_one_sided=False, - mutual_information_penalty_weight=None, - aux_cond_generator_weight=None, - aux_cond_discriminator_weight=None, - tensor_pool_fn=None, - # Options. - add_summaries=True): - """Returns losses necessary to train generator and discriminator. - - Args: - model: A GANModel tuple. - generator_loss_fn: The loss function on the generator. Takes a GANModel - tuple. - discriminator_loss_fn: The loss function on the discriminator. Takes a - GANModel tuple. - gradient_penalty_weight: If not `None`, must be a non-negative Python number - or Tensor indicating how much to weight the gradient penalty. See - https://arxiv.org/pdf/1704.00028.pdf for more details. - gradient_penalty_epsilon: If `gradient_penalty_weight` is not None, the - small positive value used by the gradient penalty function for numerical - stability. Note some applications will need to increase this value to - avoid NaNs. - gradient_penalty_target: If `gradient_penalty_weight` is not None, a Python - number or `Tensor` indicating the target value of gradient norm. See the - CIFAR10 section of https://arxiv.org/abs/1710.10196. Defaults to 1.0. 
- gradient_penalty_one_sided: If `True`, penalty proposed in - https://arxiv.org/abs/1709.08894 is used. Defaults to `False`. - mutual_information_penalty_weight: If not `None`, must be a non-negative - Python number or Tensor indicating how much to weight the mutual - information penalty. See https://arxiv.org/abs/1606.03657 for more - details. - aux_cond_generator_weight: If not None: add a classification loss as in - https://arxiv.org/abs/1610.09585 - aux_cond_discriminator_weight: If not None: add a classification loss as in - https://arxiv.org/abs/1610.09585 - tensor_pool_fn: A function that takes (generated_data, generator_inputs), - stores them in an internal pool and returns previous stored - (generated_data, generator_inputs). For example - `tf.gan.features.tensor_pool`. Defaults to None (not using tensor pool). - add_summaries: Whether or not to add summaries for the losses. - - Returns: - A GANLoss 2-tuple of (generator_loss, discriminator_loss). Includes - regularization losses. - - Raises: - ValueError: If any of the auxiliary loss weights is provided and negative. - ValueError: If `mutual_information_penalty_weight` is provided, but the - `model` isn't an `InfoGANModel`. - """ - # Validate arguments. - gradient_penalty_weight = _validate_aux_loss_weight( - gradient_penalty_weight, 'gradient_penalty_weight') - mutual_information_penalty_weight = _validate_aux_loss_weight( - mutual_information_penalty_weight, 'infogan_weight') - aux_cond_generator_weight = _validate_aux_loss_weight( - aux_cond_generator_weight, 'aux_cond_generator_weight') - aux_cond_discriminator_weight = _validate_aux_loss_weight( - aux_cond_discriminator_weight, 'aux_cond_discriminator_weight') - - # Verify configuration for mutual information penalty - if (_use_aux_loss(mutual_information_penalty_weight) and - not isinstance(model, namedtuples.InfoGANModel)): - raise ValueError( - 'When `mutual_information_penalty_weight` is provided, `model` must be ' - 'an `InfoGANModel`. Instead, was %s.' % type(model)) - - # Verify configuration for mutual auxiliary condition loss (ACGAN). - if ((_use_aux_loss(aux_cond_generator_weight) or - _use_aux_loss(aux_cond_discriminator_weight)) and - not isinstance(model, namedtuples.ACGANModel)): - raise ValueError( - 'When `aux_cond_generator_weight` or `aux_cond_discriminator_weight` ' - 'is provided, `model` must be an `ACGANModel`. Instead, was %s.' % - type(model)) - - # Optionally create pooled model. - if tensor_pool_fn: - pooled_model = _tensor_pool_adjusted_model(model, tensor_pool_fn) - else: - pooled_model = model - - # Create standard losses. - gen_loss = generator_loss_fn(model, add_summaries=add_summaries) - dis_loss = discriminator_loss_fn(pooled_model, add_summaries=add_summaries) - - # Add optional extra losses. 
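To make the optional penalties and the tensor-pool hook above concrete, a hedged sketch (not part of this patch): it combines a one-sided WGAN-GP penalty with a history pool of generated samples via `tfgan.features.tensor_pool`, which the docstring above names as the intended `tensor_pool_fn`. The toy model, pool size, and weights are arbitrary illustrations.

```python
# Illustrative use of gan_loss auxiliary options with the removed API.
import functools

import tensorflow as tf

tfgan = tf.contrib.gan


def generator_fn(noise):
  return tf.layers.dense(noise, 64, activation=tf.tanh)


def discriminator_fn(data, unused_conditioning):
  return tf.layers.dense(tf.layers.flatten(data), 1)


model = tfgan.gan_model(generator_fn, discriminator_fn,
                        real_data=tf.random_normal([16, 64]),
                        generator_inputs=tf.random_normal([16, 8]))

loss = tfgan.gan_loss(
    model,
    gradient_penalty_weight=10.0,        # WGAN-GP penalty on the discriminator
    gradient_penalty_one_sided=True,     # one-sided variant (arXiv:1709.08894)
    tensor_pool_fn=functools.partial(tfgan.features.tensor_pool, pool_size=50))
```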
- if _use_aux_loss(gradient_penalty_weight): - gp_loss = tfgan_losses.wasserstein_gradient_penalty( - pooled_model, - epsilon=gradient_penalty_epsilon, - target=gradient_penalty_target, - one_sided=gradient_penalty_one_sided, - add_summaries=add_summaries) - dis_loss += gradient_penalty_weight * gp_loss - if _use_aux_loss(mutual_information_penalty_weight): - gen_info_loss = tfgan_losses.mutual_information_penalty( - model, add_summaries=add_summaries) - if tensor_pool_fn is None: - dis_info_loss = gen_info_loss - else: - dis_info_loss = tfgan_losses.mutual_information_penalty( - pooled_model, add_summaries=add_summaries) - gen_loss += mutual_information_penalty_weight * gen_info_loss - dis_loss += mutual_information_penalty_weight * dis_info_loss - if _use_aux_loss(aux_cond_generator_weight): - ac_gen_loss = tfgan_losses.acgan_generator_loss( - model, add_summaries=add_summaries) - gen_loss += aux_cond_generator_weight * ac_gen_loss - if _use_aux_loss(aux_cond_discriminator_weight): - ac_disc_loss = tfgan_losses.acgan_discriminator_loss( - pooled_model, add_summaries=add_summaries) - dis_loss += aux_cond_discriminator_weight * ac_disc_loss - # Gathers auxiliary losses. - if model.generator_scope: - gen_reg_loss = losses.get_regularization_loss(model.generator_scope.name) - else: - gen_reg_loss = 0 - if model.discriminator_scope: - dis_reg_loss = losses.get_regularization_loss( - model.discriminator_scope.name) - else: - dis_reg_loss = 0 - - return namedtuples.GANLoss(gen_loss + gen_reg_loss, dis_loss + dis_reg_loss) - - -def cyclegan_loss( - model, - # Loss functions. - generator_loss_fn=tfgan_losses.least_squares_generator_loss, - discriminator_loss_fn=tfgan_losses.least_squares_discriminator_loss, - # Auxiliary losses. - cycle_consistency_loss_fn=tfgan_losses.cycle_consistency_loss, - cycle_consistency_loss_weight=10.0, - # Options - **kwargs): - """Returns the losses for a `CycleGANModel`. - - See https://arxiv.org/abs/1703.10593 for more details. - - Args: - model: A `CycleGANModel` namedtuple. - generator_loss_fn: The loss function on the generator. Takes a `GANModel` - named tuple. - discriminator_loss_fn: The loss function on the discriminator. Takes a - `GANModel` namedtuple. - cycle_consistency_loss_fn: The cycle consistency loss function. Takes a - `CycleGANModel` namedtuple. - cycle_consistency_loss_weight: A non-negative Python number or a scalar - `Tensor` indicating how much to weigh the cycle consistency loss. - **kwargs: Keyword args to pass directly to `gan_loss` to construct the loss - for each partial model of `model`. - - Returns: - A `CycleGANLoss` namedtuple. - - Raises: - ValueError: If `model` is not a `CycleGANModel` namedtuple. - """ - # Sanity checks. - if not isinstance(model, namedtuples.CycleGANModel): - raise ValueError('`model` must be a `CycleGANModel`. Instead, was %s.' % - type(model)) - - # Defines cycle consistency loss. - cycle_consistency_loss = cycle_consistency_loss_fn( - model, add_summaries=kwargs.get('add_summaries', True)) - cycle_consistency_loss_weight = _validate_aux_loss_weight( - cycle_consistency_loss_weight, 'cycle_consistency_loss_weight') - aux_loss = cycle_consistency_loss_weight * cycle_consistency_loss - - # Defines losses for each partial model. 
- def _partial_loss(partial_model): - partial_loss = gan_loss( - partial_model, - generator_loss_fn=generator_loss_fn, - discriminator_loss_fn=discriminator_loss_fn, - **kwargs) - return partial_loss._replace(generator_loss=partial_loss.generator_loss + - aux_loss) - - with ops.name_scope('cyclegan_loss_x2y'): - loss_x2y = _partial_loss(model.model_x2y) - with ops.name_scope('cyclegan_loss_y2x'): - loss_y2x = _partial_loss(model.model_y2x) - - return namedtuples.CycleGANLoss(loss_x2y, loss_y2x) - - -# Begin google-internal -# The four major parts can be found here: http://screen/tMRMBAohDYG. -# End google-internal -def stargan_loss( - model, - generator_loss_fn=tfgan_losses.stargan_generator_loss_wrapper( - tfgan_losses_impl.wasserstein_generator_loss), - discriminator_loss_fn=tfgan_losses.stargan_discriminator_loss_wrapper( - tfgan_losses_impl.wasserstein_discriminator_loss), - gradient_penalty_weight=10.0, - gradient_penalty_epsilon=1e-10, - gradient_penalty_target=1.0, - gradient_penalty_one_sided=False, - reconstruction_loss_fn=losses.absolute_difference, - reconstruction_loss_weight=10.0, - classification_loss_fn=losses.softmax_cross_entropy, - classification_loss_weight=1.0, - classification_one_hot=True, - add_summaries=True): - """StarGAN Loss. - - Args: - model: (StarGAN) Model output of the stargan_model() function call. - generator_loss_fn: The loss function on the generator. Takes a - `StarGANModel` named tuple. - discriminator_loss_fn: The loss function on the discriminator. Takes a - `StarGANModel` namedtuple. - gradient_penalty_weight: (float) Gradient penalty weight. Default to 10 per - the original paper https://arxiv.org/abs/1711.09020. Set to 0 or None to - turn off gradient penalty. - gradient_penalty_epsilon: (float) A small positive number added for - numerical stability when computing the gradient norm. - gradient_penalty_target: (float, or tf.float `Tensor`) The target value of - gradient norm. Defaults to 1.0. - gradient_penalty_one_sided: (bool) If `True`, penalty proposed in - https://arxiv.org/abs/1709.08894 is used. Defaults to `False`. - reconstruction_loss_fn: The reconstruction loss function. Default to L1-norm - and the function must conform to the `tf.losses` API. - reconstruction_loss_weight: Reconstruction loss weight. Default to 10.0. - classification_loss_fn: The loss function on the discriminator's ability to - classify domain of the input. Default to one-hot softmax cross entropy - loss, and the function must conform to the `tf.losses` API. - classification_loss_weight: (float) Classification loss weight. Default to - 1.0. - classification_one_hot: (bool) If the label is one hot representation. - Default to True. If False, classification classification_loss_fn need to - be sigmoid cross entropy loss instead. - add_summaries: (bool) Add the loss to the summary - - Returns: - GANLoss namedtuple where we have generator loss and discriminator loss. - - Raises: - ValueError: If input StarGANModel.input_data_domain_label does not have rank - 2, or dimension 2 is not defined. - """ - - def _classification_loss_helper(true_labels, predict_logits, scope_name): - """Classification Loss Function Helper. - - Args: - true_labels: Tensor of shape [batch_size, num_domains] representing the - label where each row is an one-hot vector. - predict_logits: Tensor of shape [batch_size, num_domains] representing the - predicted label logit, which is UNSCALED output from the NN. - scope_name: (string) Name scope of the loss component. 
- - Returns: - Single scalar tensor representing the classification loss. - """ - - with ops.name_scope(scope_name, values=(true_labels, predict_logits)): - - loss = classification_loss_fn( - onehot_labels=true_labels, logits=predict_logits) - - if not classification_one_hot: - loss = math_ops.reduce_sum(loss, axis=1) - loss = math_ops.reduce_mean(loss) - - if add_summaries: - summary.scalar(scope_name, loss) - - return loss - - # Check input shape. - model.input_data_domain_label.shape.assert_has_rank(2) - model.input_data_domain_label.shape[1:].assert_is_fully_defined() - - # Adversarial Loss. - generator_loss = generator_loss_fn(model, add_summaries=add_summaries) - discriminator_loss = discriminator_loss_fn(model, add_summaries=add_summaries) - - # Gradient Penalty. - if _use_aux_loss(gradient_penalty_weight): - gradient_penalty_fn = tfgan_losses.stargan_gradient_penalty_wrapper( - tfgan_losses_impl.wasserstein_gradient_penalty) - discriminator_loss += gradient_penalty_fn( - model, - epsilon=gradient_penalty_epsilon, - target=gradient_penalty_target, - one_sided=gradient_penalty_one_sided, - add_summaries=add_summaries) * gradient_penalty_weight - - # Reconstruction Loss. - reconstruction_loss = reconstruction_loss_fn(model.input_data, - model.reconstructed_data) - generator_loss += reconstruction_loss * reconstruction_loss_weight - if add_summaries: - summary.scalar('reconstruction_loss', reconstruction_loss) - - # Classification Loss. - generator_loss += _classification_loss_helper( - true_labels=model.generated_data_domain_target, - predict_logits=model.discriminator_generated_data_domain_predication, - scope_name='generator_classification_loss') * classification_loss_weight - discriminator_loss += _classification_loss_helper( - true_labels=model.input_data_domain_label, - predict_logits=model.discriminator_input_data_domain_predication, - scope_name='discriminator_classification_loss' - ) * classification_loss_weight - - return namedtuples.GANLoss(generator_loss, discriminator_loss) - - -def _get_update_ops(kwargs, gen_scope, dis_scope, check_for_unused_ops=True): - """Gets generator and discriminator update ops. - - Args: - kwargs: A dictionary of kwargs to be passed to `create_train_op`. - `update_ops` is removed, if present. - gen_scope: A scope for the generator. - dis_scope: A scope for the discriminator. - check_for_unused_ops: A Python bool. If `True`, throw Exception if there are - unused update ops. - - Returns: - A 2-tuple of (generator update ops, discriminator train ops). - - Raises: - ValueError: If there are update ops outside of the generator or - discriminator scopes. - """ - if 'update_ops' in kwargs: - update_ops = set(kwargs['update_ops']) - del kwargs['update_ops'] - else: - update_ops = set(ops.get_collection(ops.GraphKeys.UPDATE_OPS)) - - all_gen_ops = set(ops.get_collection(ops.GraphKeys.UPDATE_OPS, gen_scope)) - all_dis_ops = set(ops.get_collection(ops.GraphKeys.UPDATE_OPS, dis_scope)) - - if check_for_unused_ops: - unused_ops = update_ops - all_gen_ops - all_dis_ops - if unused_ops: - raise ValueError('There are unused update ops: %s' % unused_ops) - - gen_update_ops = list(all_gen_ops & update_ops) - dis_update_ops = list(all_dis_ops & update_ops) - - return gen_update_ops, dis_update_ops - - -def gan_train_ops( - model, - loss, - generator_optimizer, - discriminator_optimizer, - check_for_unused_update_ops=True, - is_chief=True, - # Optional args to pass directly to the `create_train_op`. - **kwargs): - """Returns GAN train ops. 
- - The highest-level call in TF-GAN. It is composed of functions that can also - be called, should a user require more control over some part of the GAN - training process. - - Args: - model: A GANModel. - loss: A GANLoss. - generator_optimizer: The optimizer for generator updates. - discriminator_optimizer: The optimizer for the discriminator updates. - check_for_unused_update_ops: If `True`, throws an exception if there are - update ops outside of the generator or discriminator scopes. - is_chief: Specifies whether or not the training is being run by the primary - replica during replica training. - **kwargs: Keyword args to pass directly to `training.create_train_op` for - both the generator and discriminator train op. - - Returns: - A GANTrainOps tuple of (generator_train_op, discriminator_train_op) that can - be used to train a generator/discriminator pair. - """ - if isinstance(model, namedtuples.CycleGANModel): - # Get and store all arguments other than model and loss from locals. - # Contents of locals should not be modified, may not affect values. So make - # a copy. https://docs.python.org/2/library/functions.html#locals. - saved_params = dict(locals()) - saved_params.pop('model', None) - saved_params.pop('loss', None) - kwargs = saved_params.pop('kwargs', {}) - saved_params.update(kwargs) - with ops.name_scope('cyclegan_x2y_train'): - train_ops_x2y = gan_train_ops(model.model_x2y, loss.loss_x2y, - **saved_params) - with ops.name_scope('cyclegan_y2x_train'): - train_ops_y2x = gan_train_ops(model.model_y2x, loss.loss_y2x, - **saved_params) - return namedtuples.GANTrainOps( - (train_ops_x2y.generator_train_op, train_ops_y2x.generator_train_op), - (train_ops_x2y.discriminator_train_op, - train_ops_y2x.discriminator_train_op), - training_util.get_or_create_global_step().assign_add(1)) - - # Create global step increment op. - global_step = training_util.get_or_create_global_step() - global_step_inc = global_step.assign_add(1) - - # Get generator and discriminator update ops. We split them so that update - # ops aren't accidentally run multiple times. For now, throw an error if - # there are update ops that aren't associated with either the generator or - # the discriminator. Might modify the `kwargs` dictionary. - gen_update_ops, dis_update_ops = _get_update_ops( - kwargs, model.generator_scope.name, model.discriminator_scope.name, - check_for_unused_update_ops) - - # Get the sync hooks if these are needed. - sync_hooks = [] - - generator_global_step = None - if isinstance(generator_optimizer, - sync_replicas_optimizer.SyncReplicasOptimizer): - # TODO(joelshor): Figure out a way to get this work without including the - # dummy global step in the checkpoint. - # WARNING: Making this variable a local variable causes sync replicas to - # hang forever. 
- generator_global_step = variable_scope.get_variable( - 'dummy_global_step_generator', - shape=[], - dtype=global_step.dtype.base_dtype, - initializer=init_ops.zeros_initializer(), - trainable=False, - collections=[ops.GraphKeys.GLOBAL_VARIABLES]) - gen_update_ops += [generator_global_step.assign(global_step)] - sync_hooks.append(generator_optimizer.make_session_run_hook(is_chief)) - with ops.name_scope('generator_train'): - gen_train_op = training.create_train_op( - total_loss=loss.generator_loss, - optimizer=generator_optimizer, - variables_to_train=model.generator_variables, - global_step=generator_global_step, - update_ops=gen_update_ops, - **kwargs) - - discriminator_global_step = None - if isinstance(discriminator_optimizer, - sync_replicas_optimizer.SyncReplicasOptimizer): - # See comment above `generator_global_step`. - discriminator_global_step = variable_scope.get_variable( - 'dummy_global_step_discriminator', - shape=[], - dtype=global_step.dtype.base_dtype, - initializer=init_ops.zeros_initializer(), - trainable=False, - collections=[ops.GraphKeys.GLOBAL_VARIABLES]) - dis_update_ops += [discriminator_global_step.assign(global_step)] - sync_hooks.append(discriminator_optimizer.make_session_run_hook(is_chief)) - with ops.name_scope('discriminator_train'): - disc_train_op = training.create_train_op( - total_loss=loss.discriminator_loss, - optimizer=discriminator_optimizer, - variables_to_train=model.discriminator_variables, - global_step=discriminator_global_step, - update_ops=dis_update_ops, - **kwargs) - - return namedtuples.GANTrainOps(gen_train_op, disc_train_op, global_step_inc, - sync_hooks) - - -# TODO(joelshor): Implement a dynamic GAN train loop, as in `Real-Time Adaptive -# Image Compression` (https://arxiv.org/abs/1705.05823) -class RunTrainOpsHook(session_run_hook.SessionRunHook): - """A hook to run train ops a fixed number of times.""" - - def __init__(self, train_ops, train_steps): - """Run train ops a certain number of times. - - Args: - train_ops: A train op or iterable of train ops to run. - train_steps: The number of times to run the op(s). - """ - if not isinstance(train_ops, (list, tuple)): - train_ops = [train_ops] - self._train_ops = train_ops - self._train_steps = train_steps - - def before_run(self, run_context): - for _ in range(self._train_steps): - run_context.session.run(self._train_ops) - - -def get_sequential_train_hooks(train_steps=namedtuples.GANTrainSteps(1, 1)): - """Returns a hooks function for sequential GAN training. - - Args: - train_steps: A `GANTrainSteps` tuple that determines how many generator and - discriminator training steps to take. - - Returns: - A function that takes a GANTrainOps tuple and returns a list of hooks. - """ - - def get_hooks(train_ops): - generator_hook = RunTrainOpsHook(train_ops.generator_train_op, - train_steps.generator_train_steps) - discriminator_hook = RunTrainOpsHook(train_ops.discriminator_train_op, - train_steps.discriminator_train_steps) - return [generator_hook, discriminator_hook] + list(train_ops.train_hooks) - - return get_hooks - - -def _num_joint_steps(train_steps): - g_steps = train_steps.generator_train_steps - d_steps = train_steps.discriminator_train_steps - # Get the number of each type of step that should be run. 
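# (Editorial aside, not part of the deleted module.) Worked example of the
# split computed below, assuming GANTrainSteps(generator_train_steps=3,
# discriminator_train_steps=5):
#   num_d_and_g_steps = min(3, 5) = 3   # joint generator + discriminator steps
#   num_g_steps       = 3 - 3     = 0   # generator-only steps
#   num_d_steps       = 5 - 3     = 2   # discriminator-only steps
# which is why `get_joint_train_hooks` needs only 5 session calls where
# `get_sequential_train_hooks` needs 8.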
- num_d_and_g_steps = min(g_steps, d_steps) - num_g_steps = g_steps - num_d_and_g_steps - num_d_steps = d_steps - num_d_and_g_steps - - return num_d_and_g_steps, num_g_steps, num_d_steps - - -def get_joint_train_hooks(train_steps=namedtuples.GANTrainSteps(1, 1)): - """Returns a hooks function for joint GAN training. - - When using these train hooks, IT IS RECOMMENDED TO USE `use_locking=True` ON - ALL OPTIMIZERS TO AVOID RACE CONDITIONS. - - The order of steps taken is: - 1) Combined generator and discriminator steps - 2) Generator only steps, if any remain - 3) Discriminator only steps, if any remain - - **NOTE**: Unlike `get_sequential_train_hooks`, this method performs updates - for the generator and discriminator simultaneously whenever possible. This - reduces the number of `tf.compat.v1.Session` calls, and can also change the - training - semantics. - - To illustrate the difference look at the following example: - - `train_steps=namedtuples.GANTrainSteps(3, 5)` will cause - `get_sequential_train_hooks` to make 8 session calls: - 1) 3 generator steps - 2) 5 discriminator steps - - In contrast, `get_joint_train_steps` will make 5 session calls: - 1) 3 generator + discriminator steps - 2) 2 discriminator steps - - Args: - train_steps: A `GANTrainSteps` tuple that determines how many generator and - discriminator training steps to take. - - Returns: - A function that takes a GANTrainOps tuple and returns a list of hooks. - """ - num_d_and_g_steps, num_g_steps, num_d_steps = _num_joint_steps(train_steps) - - def get_hooks(train_ops): - g_op = train_ops.generator_train_op - d_op = train_ops.discriminator_train_op - - joint_hook = RunTrainOpsHook([g_op, d_op], num_d_and_g_steps) - g_hook = RunTrainOpsHook(g_op, num_g_steps) - d_hook = RunTrainOpsHook(d_op, num_d_steps) - - return [joint_hook, g_hook, d_hook] + list(train_ops.train_hooks) - - return get_hooks - - -# TODO(joelshor): This function currently returns the global step. Find a -# good way for it to return the generator, discriminator, and final losses. -def gan_train(train_ops, - logdir, - get_hooks_fn=get_sequential_train_hooks(), - master='', - is_chief=True, - scaffold=None, - hooks=None, - chief_only_hooks=None, - save_checkpoint_secs=600, - save_summaries_steps=100, - config=None): - """A wrapper around `contrib.training.train` that uses GAN hooks. - - Args: - train_ops: A GANTrainOps named tuple. - logdir: The directory where the graph and checkpoints are saved. - get_hooks_fn: A function that takes a GANTrainOps tuple and returns a list - of hooks. - master: The URL of the master. - is_chief: Specifies whether or not the training is being run by the primary - replica during replica training. - scaffold: An tf.compat.v1.train.Scaffold instance. - hooks: List of `tf.estimator.SessionRunHook` callbacks which are run inside - the training loop. - chief_only_hooks: List of `tf.estimator.SessionRunHook` instances which are - run inside the training loop for the chief trainer only. - save_checkpoint_secs: The frequency, in seconds, that a checkpoint is saved - using a default checkpoint saver. If `save_checkpoint_secs` is set to - `None`, then the default checkpoint saver isn't used. - save_summaries_steps: The frequency, in number of global steps, that the - summaries are written to disk using a default summary saver. If - `save_summaries_steps` is set to `None`, then the default summary saver - isn't used. - config: An instance of `tf.compat.v1.ConfigProto`. - - Returns: - Output of the call to `training.train`. 
- """ - new_hooks = get_hooks_fn(train_ops) - if hooks is not None: - hooks = list(hooks) + list(new_hooks) - else: - hooks = new_hooks - return training.train( - train_ops.global_step_inc_op, - logdir, - master=master, - is_chief=is_chief, - scaffold=scaffold, - hooks=hooks, - chief_only_hooks=chief_only_hooks, - save_checkpoint_secs=save_checkpoint_secs, - save_summaries_steps=save_summaries_steps, - config=config) - - -def get_sequential_train_steps(train_steps=namedtuples.GANTrainSteps(1, 1)): - """Returns a thin wrapper around slim.learning.train_step, for GANs. - - This function is to provide support for the Supervisor. For new code, please - use `MonitoredSession` and `get_sequential_train_hooks`. - - Args: - train_steps: A `GANTrainSteps` tuple that determines how many generator and - discriminator training steps to take. - - Returns: - A function that can be used for `train_step_fn` for GANs. - """ - - def sequential_train_steps(sess, train_ops, global_step, train_step_kwargs): - """A thin wrapper around slim.learning.train_step, for GANs. - - Args: - sess: A Tensorflow session. - train_ops: A GANTrainOps tuple of train ops to run. - global_step: The global step. - train_step_kwargs: Dictionary controlling `train_step` behavior. - - Returns: - A scalar final loss and a bool whether or not the train loop should stop. - """ - # Only run `should_stop` at the end, if required. Make a local copy of - # `train_step_kwargs`, if necessary, so as not to modify the caller's - # dictionary. - should_stop_op, train_kwargs = None, train_step_kwargs - if 'should_stop' in train_step_kwargs: - should_stop_op = train_step_kwargs['should_stop'] - train_kwargs = train_step_kwargs.copy() - del train_kwargs['should_stop'] - - # Run generator training steps. - gen_loss = 0 - for _ in range(train_steps.generator_train_steps): - cur_gen_loss, _ = slim_learning.train_step(sess, - train_ops.generator_train_op, - global_step, train_kwargs) - gen_loss += cur_gen_loss - - # Run discriminator training steps. - dis_loss = 0 - for _ in range(train_steps.discriminator_train_steps): - cur_dis_loss, _ = slim_learning.train_step( - sess, train_ops.discriminator_train_op, global_step, train_kwargs) - dis_loss += cur_dis_loss - - sess.run(train_ops.global_step_inc_op) - - # Run the `should_stop` op after the global step has been incremented, so - # that the `should_stop` aligns with the proper `global_step` count. - if should_stop_op is not None: - should_stop = sess.run(should_stop_op) - else: - should_stop = False - - return gen_loss + dis_loss, should_stop - - return sequential_train_steps - - -# Helpers - - -def _convert_tensor_or_l_or_d(tensor_or_l_or_d): - """Convert input, list of inputs, or dictionary of inputs to Tensors.""" - if isinstance(tensor_or_l_or_d, (list, tuple)): - return [ops.convert_to_tensor(x) for x in tensor_or_l_or_d] - elif isinstance(tensor_or_l_or_d, dict): - return {k: ops.convert_to_tensor(v) for k, v in tensor_or_l_or_d.items()} - else: - return ops.convert_to_tensor(tensor_or_l_or_d) - - -def _validate_distributions(distributions_l, noise_l): - if not isinstance(distributions_l, (tuple, list)): - raise ValueError('`predicted_distributions` must be a list. Instead, found ' - '%s.' % type(distributions_l)) - if len(distributions_l) != len(noise_l): - raise ValueError('Length of `predicted_distributions` %i must be the same ' - 'as the length of structured noise %i.' 
% - (len(distributions_l), len(noise_l))) - - -def _validate_acgan_discriminator_outputs(discriminator_output): - try: - a, b = discriminator_output - except (TypeError, ValueError): - raise TypeError( - 'A discriminator function for ACGAN must output a tuple ' - 'consisting of (discrimination logits, classification logits).') - return a, b - - -def _generate_stargan_random_domain_target(batch_size, num_domains): - """Generate random domain label. - - Args: - batch_size: (int) Number of random domain label. - num_domains: (int) Number of domains representing with the label. - - Returns: - Tensor of shape (batch_size, num_domains) representing random label. - """ - domain_idx = random_ops.random_uniform([batch_size], - minval=0, - maxval=num_domains, - dtype=dtypes.int32) - - return array_ops.one_hot(domain_idx, num_domains) diff --git a/tensorflow/contrib/gan/python/train_test.py b/tensorflow/contrib/gan/python/train_test.py deleted file mode 100644 index 841f25cd7f1..00000000000 --- a/tensorflow/contrib/gan/python/train_test.py +++ /dev/null @@ -1,1144 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Tests for gan.python.train.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from absl.testing import parameterized -import numpy as np - -from tensorflow.contrib import layers -from tensorflow.contrib.framework.python.ops import variables as variables_lib -from tensorflow.contrib.gan.python import namedtuples -from tensorflow.contrib.gan.python import train -from tensorflow.contrib.gan.python.features.python import random_tensor_pool -from tensorflow.contrib.slim.python.slim import learning as slim_learning -from tensorflow.python.framework import constant_op -from tensorflow.python.framework import dtypes -from tensorflow.python.framework import ops -from tensorflow.python.framework import random_seed -from tensorflow.python.ops import array_ops -from tensorflow.python.ops import math_ops -from tensorflow.python.ops import random_ops -from tensorflow.python.ops import variable_scope -from tensorflow.python.ops import variables -from tensorflow.python.ops.distributions import categorical -from tensorflow.python.platform import test -from tensorflow.python.training import basic_session_run_hooks -from tensorflow.python.training import coordinator -from tensorflow.python.training import gradient_descent -from tensorflow.python.training import sync_replicas_optimizer -from tensorflow.python.training import training_util - - -def generator_model(inputs): - return variable_scope.get_variable('dummy_g', initializer=2.0) * inputs - - -class Generator(object): - - def __call__(self, inputs): - return generator_model(inputs) - - -def infogan_generator_model(inputs): - return variable_scope.get_variable('dummy_g', initializer=2.0) * inputs[0] - - -class InfoGANGenerator(object): - - 
def __call__(self, inputs): - return infogan_generator_model(inputs) - - -def discriminator_model(inputs, _): - return variable_scope.get_variable('dummy_d', initializer=2.0) * inputs - - -class Discriminator(object): - - def __call__(self, inputs, _): - return discriminator_model(inputs, _) - - -def infogan_discriminator_model(inputs, _): - return (variable_scope.get_variable('dummy_d', initializer=2.0) * inputs, - [categorical.Categorical([1.0])]) - - -class InfoGANDiscriminator(object): - - def __call__(self, inputs, _): - return infogan_discriminator_model(inputs, _) - - -def acgan_discriminator_model(inputs, _, num_classes=10): - return ( - discriminator_model(inputs, _), - array_ops.one_hot( - # TODO(haeusser): infer batch size from input - random_ops.random_uniform( - [3], maxval=num_classes, dtype=dtypes.int32), - num_classes)) - - -class ACGANDiscriminator(object): - - def __call__(self, inputs, _, num_classes=10): - return ( - discriminator_model(inputs, _), - array_ops.one_hot( - # TODO(haeusser): infer batch size from input - random_ops.random_uniform( - [3], maxval=num_classes, dtype=dtypes.int32), - num_classes)) - - -def stargan_generator_model(inputs, _): - """Dummy generator for StarGAN.""" - - return variable_scope.get_variable('dummy_g', initializer=0.5) * inputs - - -class StarGANGenerator(object): - - def __call__(self, inputs, _): - return stargan_generator_model(inputs, _) - - -def stargan_discriminator_model(inputs, num_domains): - """Differentiable dummy discriminator for StarGAN.""" - - hidden = layers.flatten(inputs) - - output_src = math_ops.reduce_mean(hidden, axis=1) - - output_cls = layers.fully_connected( - inputs=hidden, - num_outputs=num_domains, - activation_fn=None, - normalizer_fn=None, - biases_initializer=None) - return output_src, output_cls - - -class StarGANDiscriminator(object): - - def __call__(self, inputs, num_domains): - return stargan_discriminator_model(inputs, num_domains) - - -def get_gan_model(): - # TODO(joelshor): Find a better way of creating a variable scope. 
- with variable_scope.variable_scope('generator') as gen_scope: - pass - with variable_scope.variable_scope('discriminator') as dis_scope: - pass - return namedtuples.GANModel( - generator_inputs=None, - generated_data=None, - generator_variables=None, - generator_scope=gen_scope, - generator_fn=generator_model, - real_data=array_ops.ones([1, 2, 3]), - discriminator_real_outputs=array_ops.ones([1, 2, 3]), - discriminator_gen_outputs=array_ops.ones([1, 2, 3]), - discriminator_variables=None, - discriminator_scope=dis_scope, - discriminator_fn=discriminator_model) - - -def get_callable_gan_model(): - ganmodel = get_gan_model() - return ganmodel._replace( - generator_fn=Generator(), discriminator_fn=Discriminator()) - - -def create_gan_model(): - return train.gan_model( - generator_model, - discriminator_model, - real_data=array_ops.zeros([1, 2]), - generator_inputs=random_ops.random_normal([1, 2])) - - -def create_callable_gan_model(): - return train.gan_model( - Generator(), - Discriminator(), - real_data=array_ops.zeros([1, 2]), - generator_inputs=random_ops.random_normal([1, 2])) - - -def get_infogan_model(): - return namedtuples.InfoGANModel( - *get_gan_model(), - structured_generator_inputs=[constant_op.constant(0)], - predicted_distributions=[categorical.Categorical([1.0])], - discriminator_and_aux_fn=infogan_discriminator_model) - - -def get_callable_infogan_model(): - return namedtuples.InfoGANModel( - *get_callable_gan_model(), - structured_generator_inputs=[constant_op.constant(0)], - predicted_distributions=[categorical.Categorical([1.0])], - discriminator_and_aux_fn=infogan_discriminator_model) - - -def create_infogan_model(): - return train.infogan_model( - infogan_generator_model, - infogan_discriminator_model, - real_data=array_ops.zeros([1, 2]), - unstructured_generator_inputs=[], - structured_generator_inputs=[random_ops.random_normal([1, 2])]) - - -def create_callable_infogan_model(): - return train.infogan_model( - InfoGANGenerator(), - InfoGANDiscriminator(), - real_data=array_ops.zeros([1, 2]), - unstructured_generator_inputs=[], - structured_generator_inputs=[random_ops.random_normal([1, 2])]) - - -def get_acgan_model(): - return namedtuples.ACGANModel( - *get_gan_model(), - one_hot_labels=array_ops.one_hot([0, 1, 2], 10), - discriminator_real_classification_logits=array_ops.one_hot([0, 1, 3], 10), - discriminator_gen_classification_logits=array_ops.one_hot([0, 1, 4], 10)) - - -def get_callable_acgan_model(): - return namedtuples.ACGANModel( - *get_callable_gan_model(), - one_hot_labels=array_ops.one_hot([0, 1, 2], 10), - discriminator_real_classification_logits=array_ops.one_hot([0, 1, 3], 10), - discriminator_gen_classification_logits=array_ops.one_hot([0, 1, 4], 10)) - - -def create_acgan_model(): - return train.acgan_model( - generator_model, - acgan_discriminator_model, - real_data=array_ops.zeros([1, 2]), - generator_inputs=random_ops.random_normal([1, 2]), - one_hot_labels=array_ops.one_hot([0, 1, 2], 10)) - - -def create_callable_acgan_model(): - return train.acgan_model( - Generator(), - ACGANDiscriminator(), - real_data=array_ops.zeros([1, 2]), - generator_inputs=random_ops.random_normal([1, 2]), - one_hot_labels=array_ops.one_hot([0, 1, 2], 10)) - - -def get_cyclegan_model(): - return namedtuples.CycleGANModel( - model_x2y=get_gan_model(), - model_y2x=get_gan_model(), - reconstructed_x=array_ops.ones([1, 2, 3]), - reconstructed_y=array_ops.zeros([1, 2, 3])) - - -def get_callable_cyclegan_model(): - return namedtuples.CycleGANModel( - 
model_x2y=get_callable_gan_model(), - model_y2x=get_callable_gan_model(), - reconstructed_x=array_ops.ones([1, 2, 3]), - reconstructed_y=array_ops.zeros([1, 2, 3])) - - -def create_cyclegan_model(): - return train.cyclegan_model( - generator_model, - discriminator_model, - data_x=array_ops.zeros([1, 2]), - data_y=array_ops.ones([1, 2])) - - -def create_callable_cyclegan_model(): - return train.cyclegan_model( - Generator(), - Discriminator(), - data_x=array_ops.zeros([1, 2]), - data_y=array_ops.ones([1, 2])) - - -def get_stargan_model(): - """Similar to get_gan_model().""" - # TODO(joelshor): Find a better way of creating a variable scope. - with variable_scope.variable_scope('generator') as gen_scope: - pass - with variable_scope.variable_scope('discriminator') as dis_scope: - pass - return namedtuples.StarGANModel( - input_data=array_ops.ones([1, 2, 2, 3]), - input_data_domain_label=array_ops.ones([1, 2]), - generated_data=array_ops.ones([1, 2, 2, 3]), - generated_data_domain_target=array_ops.ones([1, 2]), - reconstructed_data=array_ops.ones([1, 2, 2, 3]), - discriminator_input_data_source_predication=array_ops.ones([1]), - discriminator_generated_data_source_predication=array_ops.ones([1]), - discriminator_input_data_domain_predication=array_ops.ones([1, 2]), - discriminator_generated_data_domain_predication=array_ops.ones([1, 2]), - generator_variables=None, - generator_scope=gen_scope, - generator_fn=stargan_generator_model, - discriminator_variables=None, - discriminator_scope=dis_scope, - discriminator_fn=stargan_discriminator_model) - - -def get_callable_stargan_model(): - model = get_stargan_model() - return model._replace( - generator_fn=StarGANGenerator(), discriminator_fn=StarGANDiscriminator()) - - -def create_stargan_model(): - return train.stargan_model( - stargan_generator_model, stargan_discriminator_model, - array_ops.ones([1, 2, 2, 3]), array_ops.ones([1, 2])) - - -def create_callable_stargan_model(): - return train.stargan_model(StarGANGenerator(), StarGANDiscriminator(), - array_ops.ones([1, 2, 2, 3]), - array_ops.ones([1, 2])) - - -def get_sync_optimizer(): - return sync_replicas_optimizer.SyncReplicasOptimizer( - gradient_descent.GradientDescentOptimizer(learning_rate=1.0), - replicas_to_aggregate=1) - - -class GANModelTest(test.TestCase, parameterized.TestCase): - """Tests for `gan_model`.""" - - @parameterized.named_parameters( - ('gan', get_gan_model, namedtuples.GANModel), - ('callable_gan', get_callable_gan_model, namedtuples.GANModel), - ('infogan', get_infogan_model, namedtuples.InfoGANModel), - ('callable_infogan', get_callable_infogan_model, - namedtuples.InfoGANModel), - ('acgan', get_acgan_model, namedtuples.ACGANModel), - ('callable_acgan', get_callable_acgan_model, namedtuples.ACGANModel), - ('cyclegan', get_cyclegan_model, namedtuples.CycleGANModel), - ('callable_cyclegan', get_callable_cyclegan_model, - namedtuples.CycleGANModel), - ('stargan', get_stargan_model, namedtuples.StarGANModel), - ('callabel_stargan', get_callable_stargan_model, namedtuples.StarGANModel) - ) - def test_output_type(self, create_fn, expected_tuple_type): - """Test that output type is as expected.""" - self.assertIsInstance(create_fn(), expected_tuple_type) - - def test_no_shape_check(self): - - def dummy_generator_model(_): - return (None, None) - - def dummy_discriminator_model(data, conditioning): # pylint: disable=unused-argument - return 1 - - with self.assertRaisesRegexp(AttributeError, 'object has no attribute'): - train.gan_model( - dummy_generator_model, - 
dummy_discriminator_model, - real_data=array_ops.zeros([1, 2]), - generator_inputs=array_ops.zeros([1]), - check_shapes=True) - train.gan_model( - dummy_generator_model, - dummy_discriminator_model, - real_data=array_ops.zeros([1, 2]), - generator_inputs=array_ops.zeros([1]), - check_shapes=False) - - -class StarGANModelTest(test.TestCase): - """Tests for `stargan_model`.""" - - @staticmethod - def create_input_and_label_tensor(batch_size, img_size, c_size, num_domains): - input_tensor_list = [] - label_tensor_list = [] - for _ in range(num_domains): - input_tensor_list.append( - random_ops.random_uniform((batch_size, img_size, img_size, c_size))) - domain_idx = random_ops.random_uniform( - [batch_size], minval=0, maxval=num_domains, dtype=dtypes.int32) - label_tensor_list.append(array_ops.one_hot(domain_idx, num_domains)) - return input_tensor_list, label_tensor_list - - def test_generate_stargan_random_domain_target(self): - batch_size = 8 - domain_numbers = 3 - - target_tensor = train._generate_stargan_random_domain_target( - batch_size, domain_numbers) - - with self.cached_session() as sess: - targets = sess.run(target_tensor) - self.assertTupleEqual((batch_size, domain_numbers), targets.shape) - for target in targets: - self.assertEqual(1, np.sum(target)) - self.assertEqual(1, np.max(target)) - - def test_stargan_model_output_type(self): - batch_size = 2 - img_size = 16 - c_size = 3 - num_domains = 5 - - input_tensor, label_tensor = StarGANModelTest.create_input_and_label_tensor( - batch_size, img_size, c_size, num_domains) - model = train.stargan_model( - generator_fn=stargan_generator_model, - discriminator_fn=stargan_discriminator_model, - input_data=input_tensor, - input_data_domain_label=label_tensor) - - self.assertIsInstance(model, namedtuples.StarGANModel) - self.assertTrue(isinstance(model.discriminator_variables, list)) - self.assertTrue(isinstance(model.generator_variables, list)) - self.assertIsInstance(model.discriminator_scope, - variable_scope.VariableScope) - self.assertTrue(model.generator_scope, variable_scope.VariableScope) - self.assertTrue(callable(model.discriminator_fn)) - self.assertTrue(callable(model.generator_fn)) - - def test_stargan_model_generator_output(self): - batch_size = 2 - img_size = 16 - c_size = 3 - num_domains = 5 - - input_tensor, label_tensor = StarGANModelTest.create_input_and_label_tensor( - batch_size, img_size, c_size, num_domains) - model = train.stargan_model( - generator_fn=stargan_generator_model, - discriminator_fn=stargan_discriminator_model, - input_data=input_tensor, - input_data_domain_label=label_tensor) - - with self.test_session(use_gpu=True) as sess: - - sess.run(variables.global_variables_initializer()) - - input_data, generated_data, reconstructed_data = sess.run( - [model.input_data, model.generated_data, model.reconstructed_data]) - self.assertTupleEqual( - (batch_size * num_domains, img_size, img_size, c_size), - input_data.shape) - self.assertTupleEqual( - (batch_size * num_domains, img_size, img_size, c_size), - generated_data.shape) - self.assertTupleEqual( - (batch_size * num_domains, img_size, img_size, c_size), - reconstructed_data.shape) - - def test_stargan_model_discriminator_output(self): - batch_size = 2 - img_size = 16 - c_size = 3 - num_domains = 5 - - input_tensor, label_tensor = StarGANModelTest.create_input_and_label_tensor( - batch_size, img_size, c_size, num_domains) - model = train.stargan_model( - generator_fn=stargan_generator_model, - discriminator_fn=stargan_discriminator_model, - 
input_data=input_tensor, - input_data_domain_label=label_tensor) - - with self.test_session(use_gpu=True) as sess: - - sess.run(variables.global_variables_initializer()) - - disc_input_data_source_pred, disc_gen_data_source_pred = sess.run([ - model.discriminator_input_data_source_predication, - model.discriminator_generated_data_source_predication - ]) - self.assertEqual(1, len(disc_input_data_source_pred.shape)) - self.assertEqual(batch_size * num_domains, - disc_input_data_source_pred.shape[0]) - self.assertEqual(1, len(disc_gen_data_source_pred.shape)) - self.assertEqual(batch_size * num_domains, - disc_gen_data_source_pred.shape[0]) - - input_label, disc_input_label, gen_label, disc_gen_label = sess.run([ - model.input_data_domain_label, - model.discriminator_input_data_domain_predication, - model.generated_data_domain_target, - model.discriminator_generated_data_domain_predication - ]) - self.assertTupleEqual((batch_size * num_domains, num_domains), - input_label.shape) - self.assertTupleEqual((batch_size * num_domains, num_domains), - disc_input_label.shape) - self.assertTupleEqual((batch_size * num_domains, num_domains), - gen_label.shape) - self.assertTupleEqual((batch_size * num_domains, num_domains), - disc_gen_label.shape) - - -class GANLossTest(test.TestCase, parameterized.TestCase): - """Tests for `gan_loss`.""" - - @parameterized.named_parameters( - ('gan', get_gan_model), - ('callable_gan', get_callable_gan_model), - ('infogan', get_infogan_model), - ('callable_infogan', get_callable_infogan_model), - ('acgan', get_acgan_model), - ('callable_acgan', get_callable_acgan_model), - ) - def test_output_type(self, get_gan_model_fn): - """Test output type.""" - loss = train.gan_loss(get_gan_model_fn(), add_summaries=True) - self.assertIsInstance(loss, namedtuples.GANLoss) - self.assertNotEmpty(ops.get_collection(ops.GraphKeys.SUMMARIES)) - - @parameterized.named_parameters( - ('cyclegan', create_cyclegan_model), - ('callable_cyclegan', create_callable_cyclegan_model), - ) - def test_cyclegan_output_type(self, get_gan_model_fn): - loss = train.cyclegan_loss(get_gan_model_fn(), add_summaries=True) - self.assertIsInstance(loss, namedtuples.CycleGANLoss) - self.assertNotEmpty(ops.get_collection(ops.GraphKeys.SUMMARIES)) - - @parameterized.named_parameters( - ('gan', create_gan_model, False), - ('gan_one_sided', create_gan_model, True), - ('callable_gan', create_callable_gan_model, False), - ('callable_gan_one_sided', create_callable_gan_model, True), - ('infogan', create_infogan_model, False), - ('infogan_one_sided', create_infogan_model, True), - ('callable_infogan', create_callable_infogan_model, False), - ('callable_infogan_one_sided', create_callable_infogan_model, True), - ('acgan', create_acgan_model, False), - ('acgan_one_sided', create_acgan_model, True), - ('callable_acgan', create_callable_acgan_model, False), - ('callable_acgan_one_sided', create_callable_acgan_model, True), - ) - def test_grad_penalty(self, create_gan_model_fn, one_sided): - """Test gradient penalty option.""" - model = create_gan_model_fn() - loss = train.gan_loss(model) - loss_gp = train.gan_loss( - model, - gradient_penalty_weight=1.0, - gradient_penalty_one_sided=one_sided) - self.assertIsInstance(loss_gp, namedtuples.GANLoss) - - # Check values. 
- with self.test_session(use_gpu=True) as sess: - variables.global_variables_initializer().run() - loss_gen_np, loss_gen_gp_np = sess.run( - [loss.generator_loss, loss_gp.generator_loss]) - loss_dis_np, loss_dis_gp_np = sess.run( - [loss.discriminator_loss, loss_gp.discriminator_loss]) - - self.assertEqual(loss_gen_np, loss_gen_gp_np) - self.assertLess(loss_dis_np, loss_dis_gp_np) - - @parameterized.named_parameters( - ('infogan', get_infogan_model), - ('callable_infogan', get_callable_infogan_model), - ) - def test_mutual_info_penalty(self, create_gan_model_fn): - """Test mutual information penalty option.""" - train.gan_loss( - create_gan_model_fn(), - mutual_information_penalty_weight=constant_op.constant(1.0)) - - @parameterized.named_parameters( - ('gan', get_gan_model), - ('callable_gan', get_callable_gan_model), - ('infogan', get_infogan_model), - ('callable_infogan', get_callable_infogan_model), - ('acgan', get_acgan_model), - ('callable_acgan', get_callable_acgan_model), - ) - def test_regularization_helper(self, get_gan_model_fn): - """Test regularization loss.""" - # Evaluate losses without regularization. - no_reg_loss = train.gan_loss(get_gan_model_fn()) - with self.test_session(use_gpu=True): - no_reg_loss_gen_np = no_reg_loss.generator_loss.eval() - no_reg_loss_dis_np = no_reg_loss.discriminator_loss.eval() - - with ops.name_scope(get_gan_model_fn().generator_scope.name): - ops.add_to_collection(ops.GraphKeys.REGULARIZATION_LOSSES, - constant_op.constant(3.0)) - with ops.name_scope(get_gan_model_fn().discriminator_scope.name): - ops.add_to_collection(ops.GraphKeys.REGULARIZATION_LOSSES, - constant_op.constant(2.0)) - - # Check that losses now include the correct regularization values. - reg_loss = train.gan_loss(get_gan_model_fn()) - with self.test_session(use_gpu=True): - reg_loss_gen_np = reg_loss.generator_loss.eval() - reg_loss_dis_np = reg_loss.discriminator_loss.eval() - - self.assertEqual(3.0, reg_loss_gen_np - no_reg_loss_gen_np) - self.assertEqual(2.0, reg_loss_dis_np - no_reg_loss_dis_np) - - @parameterized.named_parameters( - ('notcallable', create_acgan_model), - ('callable', create_callable_acgan_model), - ) - def test_acgan(self, create_gan_model_fn): - """Test that ACGAN models work.""" - model = create_gan_model_fn() - loss = train.gan_loss(model) - loss_ac_gen = train.gan_loss(model, aux_cond_generator_weight=1.0) - loss_ac_dis = train.gan_loss(model, aux_cond_discriminator_weight=1.0) - self.assertIsInstance(loss, namedtuples.GANLoss) - self.assertIsInstance(loss_ac_gen, namedtuples.GANLoss) - self.assertIsInstance(loss_ac_dis, namedtuples.GANLoss) - - # Check values. 
- with self.test_session(use_gpu=True) as sess: - variables.global_variables_initializer().run() - loss_gen_np, loss_ac_gen_gen_np, loss_ac_dis_gen_np = sess.run([ - loss.generator_loss, loss_ac_gen.generator_loss, - loss_ac_dis.generator_loss - ]) - loss_dis_np, loss_ac_gen_dis_np, loss_ac_dis_dis_np = sess.run([ - loss.discriminator_loss, loss_ac_gen.discriminator_loss, - loss_ac_dis.discriminator_loss - ]) - - self.assertLess(loss_gen_np, loss_dis_np) - self.assertTrue(np.isscalar(loss_ac_gen_gen_np)) - self.assertTrue(np.isscalar(loss_ac_dis_gen_np)) - self.assertTrue(np.isscalar(loss_ac_gen_dis_np)) - self.assertTrue(np.isscalar(loss_ac_dis_dis_np)) - - @parameterized.named_parameters( - ('notcallable', create_cyclegan_model), - ('callable', create_callable_cyclegan_model), - ) - def test_cyclegan(self, create_gan_model_fn): - """Test that CycleGan models work.""" - model = create_gan_model_fn() - loss = train.cyclegan_loss(model) - self.assertIsInstance(loss, namedtuples.CycleGANLoss) - - # Check values. - with self.test_session(use_gpu=True) as sess: - variables.global_variables_initializer().run() - (loss_x2y_gen_np, loss_x2y_dis_np, loss_y2x_gen_np, - loss_y2x_dis_np) = sess.run([ - loss.loss_x2y.generator_loss, loss.loss_x2y.discriminator_loss, - loss.loss_y2x.generator_loss, loss.loss_y2x.discriminator_loss - ]) - - self.assertGreater(loss_x2y_gen_np, loss_x2y_dis_np) - self.assertGreater(loss_y2x_gen_np, loss_y2x_dis_np) - self.assertTrue(np.isscalar(loss_x2y_gen_np)) - self.assertTrue(np.isscalar(loss_x2y_dis_np)) - self.assertTrue(np.isscalar(loss_y2x_gen_np)) - self.assertTrue(np.isscalar(loss_y2x_dis_np)) - - @parameterized.named_parameters( - ('notcallable', create_stargan_model), - ('callable', create_callable_stargan_model), - ) - def test_stargan(self, create_gan_model_fn): - - model = create_gan_model_fn() - model_loss = train.stargan_loss(model) - - self.assertIsInstance(model_loss, namedtuples.GANLoss) - - with self.cached_session() as sess: - - sess.run(variables.global_variables_initializer()) - - gen_loss, disc_loss = sess.run( - [model_loss.generator_loss, model_loss.discriminator_loss]) - - self.assertTrue(np.isscalar(gen_loss)) - self.assertTrue(np.isscalar(disc_loss)) - - @parameterized.named_parameters( - ('gan', create_gan_model), - ('callable_gan', create_callable_gan_model), - ('infogan', create_infogan_model), - ('callable_infogan', create_callable_infogan_model), - ('acgan', create_acgan_model), - ('callable_acgan', create_callable_acgan_model), - ) - def test_tensor_pool(self, create_gan_model_fn): - """Test tensor pool option.""" - model = create_gan_model_fn() - tensor_pool_fn = lambda x: random_tensor_pool.tensor_pool(x, pool_size=5) - loss = train.gan_loss(model, tensor_pool_fn=tensor_pool_fn) - self.assertIsInstance(loss, namedtuples.GANLoss) - - # Check values. 
- with self.test_session(use_gpu=True) as sess: - variables.global_variables_initializer().run() - for _ in range(10): - sess.run([loss.generator_loss, loss.discriminator_loss]) - - def test_discriminator_only_sees_pool(self): - """Checks that discriminator only sees pooled values.""" - def checker_gen_fn(_): - return constant_op.constant(0.0) - model = train.gan_model( - checker_gen_fn, - discriminator_model, - real_data=array_ops.zeros([]), - generator_inputs=random_ops.random_normal([])) - def tensor_pool_fn(_): - return (random_ops.random_uniform([]), random_ops.random_uniform([])) - def checker_dis_fn(inputs, _): - """Discriminator that checks that it only sees pooled Tensors.""" - self.assertFalse(constant_op.is_constant(inputs)) - return inputs - model = model._replace( - discriminator_fn=checker_dis_fn) - train.gan_loss(model, tensor_pool_fn=tensor_pool_fn) - - def test_doesnt_crash_when_in_nested_scope(self): - with variable_scope.variable_scope('outer_scope'): - gan_model = train.gan_model( - generator_model, - discriminator_model, - real_data=array_ops.zeros([1, 2]), - generator_inputs=random_ops.random_normal([1, 2])) - - # This should work inside a scope. - train.gan_loss(gan_model, gradient_penalty_weight=1.0) - - # This should also work outside a scope. - train.gan_loss(gan_model, gradient_penalty_weight=1.0) - - -class TensorPoolAdjusteModelTest(test.TestCase): - - def _check_tensor_pool_adjusted_model_outputs( - self, tensor1, tensor2, pool_size): - history_values = [] - with self.test_session(use_gpu=True) as sess: - variables.global_variables_initializer().run() - for i in range(2 * pool_size): - t1, t2 = sess.run([tensor1, tensor2]) - history_values.append(t1) - if i < pool_size: - # For [0, pool_size), the pool is not full, tensor1 should be equal - # to tensor2 as the pool. - self.assertAllEqual(t1, t2) - else: - # For [pool_size, ?), the pool is full, tensor2 must be equal to some - # historical values of tensor1 (which is previously stored in the - # pool). - self.assertTrue(any((v == t2).all() for v in history_values)) - - def _make_new_model_and_check(self, model, pool_size): - pool_fn = lambda x: random_tensor_pool.tensor_pool(x, pool_size=pool_size) - new_model = train._tensor_pool_adjusted_model(model, pool_fn) - # 'Generator/dummy_g:0' and 'Discriminator/dummy_d:0' - self.assertEqual(2, len(ops.get_collection(ops.GraphKeys.VARIABLES))) - self.assertIsNot(new_model.discriminator_gen_outputs, - model.discriminator_gen_outputs) - - return new_model - - def test_tensor_pool_adjusted_model_gan(self): - """Test `_tensor_pool_adjusted_model` for gan model.""" - pool_size = 5 - model = create_gan_model() - new_model = self._make_new_model_and_check(model, pool_size) - - # Check values. - self._check_tensor_pool_adjusted_model_outputs( - model.discriminator_gen_outputs, new_model.discriminator_gen_outputs, - pool_size) - - def test_tensor_pool_adjusted_model_infogan(self): - """Test _tensor_pool_adjusted_model for infogan model.""" - pool_size = 5 - model = create_infogan_model() - new_model = self._make_new_model_and_check(model, pool_size) - - # Check values. 
- self.assertIsNot(new_model.predicted_distributions, - model.predicted_distributions) - self._check_tensor_pool_adjusted_model_outputs( - model.discriminator_gen_outputs, new_model.discriminator_gen_outputs, - pool_size) - - def test_tensor_pool_adjusted_model_acgan(self): - """Test _tensor_pool_adjusted_model for acgan model.""" - pool_size = 5 - model = create_acgan_model() - new_model = self._make_new_model_and_check(model, pool_size) - - # Check values. - self.assertIsNot(new_model.discriminator_gen_classification_logits, - model.discriminator_gen_classification_logits) - self._check_tensor_pool_adjusted_model_outputs( - model.discriminator_gen_outputs, new_model.discriminator_gen_outputs, - pool_size) - - -class GANTrainOpsTest(test.TestCase, parameterized.TestCase): - """Tests for `gan_train_ops`.""" - - @parameterized.named_parameters( - ('gan', create_gan_model), - ('callable_gan', create_callable_gan_model), - ('infogan', create_infogan_model), - ('callable_infogan', create_callable_infogan_model), - ('acgan', create_acgan_model), - ('callable_acgan', create_callable_acgan_model), - ) - def test_output_type(self, create_gan_model_fn): - model = create_gan_model_fn() - loss = train.gan_loss(model) - - g_opt = gradient_descent.GradientDescentOptimizer(1.0) - d_opt = gradient_descent.GradientDescentOptimizer(1.0) - train_ops = train.gan_train_ops( - model, - loss, - g_opt, - d_opt, - summarize_gradients=True, - colocate_gradients_with_ops=True) - - self.assertIsInstance(train_ops, namedtuples.GANTrainOps) - - # Make sure there are no training hooks populated accidentally. - self.assertEmpty(train_ops.train_hooks) - - # TODO(joelshor): Add a test to check that custom update op is run. - @parameterized.named_parameters( - ('gan', create_gan_model, False), - ('gan_provideupdates', create_gan_model, True), - ('callable_gan', create_callable_gan_model, False), - ('callable_gan_provideupdates', create_callable_gan_model, True), - ('infogan', create_infogan_model, False), - ('infogan_provideupdates', create_infogan_model, True), - ('callable_infogan', create_callable_infogan_model, False), - ('callable_infogan_provideupdates', create_callable_infogan_model, True), - ('acgan', create_acgan_model, False), - ('acgan_provideupdates', create_acgan_model, True), - ('callable_acgan', create_callable_acgan_model, False), - ('callable_acgan_provideupdates', create_callable_acgan_model, True), - ) - def test_unused_update_ops(self, create_gan_model_fn, provide_update_ops): - model = create_gan_model_fn() - loss = train.gan_loss(model) - - # Add generator and discriminator update ops. - with variable_scope.variable_scope(model.generator_scope): - gen_update_count = variable_scope.get_variable('gen_count', initializer=0) - gen_update_op = gen_update_count.assign_add(1) - ops.add_to_collection(ops.GraphKeys.UPDATE_OPS, gen_update_op) - with variable_scope.variable_scope(model.discriminator_scope): - dis_update_count = variable_scope.get_variable('dis_count', initializer=0) - dis_update_op = dis_update_count.assign_add(1) - ops.add_to_collection(ops.GraphKeys.UPDATE_OPS, dis_update_op) - - # Add an update op outside the generator and discriminator scopes. 
- if provide_update_ops: - kwargs = { - 'update_ops': [ - constant_op.constant(1.0), gen_update_op, dis_update_op - ] - } - else: - ops.add_to_collection(ops.GraphKeys.UPDATE_OPS, constant_op.constant(1.0)) - kwargs = {} - - g_opt = gradient_descent.GradientDescentOptimizer(1.0) - d_opt = gradient_descent.GradientDescentOptimizer(1.0) - - with self.assertRaisesRegexp(ValueError, 'There are unused update ops:'): - train.gan_train_ops( - model, loss, g_opt, d_opt, check_for_unused_update_ops=True, **kwargs) - train_ops = train.gan_train_ops( - model, loss, g_opt, d_opt, check_for_unused_update_ops=False, **kwargs) - - with self.test_session(use_gpu=True) as sess: - sess.run(variables.global_variables_initializer()) - self.assertEqual(0, gen_update_count.eval()) - self.assertEqual(0, dis_update_count.eval()) - - train_ops.generator_train_op.eval() - self.assertEqual(1, gen_update_count.eval()) - self.assertEqual(0, dis_update_count.eval()) - - train_ops.discriminator_train_op.eval() - self.assertEqual(1, gen_update_count.eval()) - self.assertEqual(1, dis_update_count.eval()) - - @parameterized.named_parameters( - ('gan', create_gan_model, False), - ('callable_gan', create_callable_gan_model, False), - ('infogan', create_infogan_model, False), - ('callable_infogan', create_callable_infogan_model, False), - ('acgan', create_acgan_model, False), - ('callable_acgan', create_callable_acgan_model, False), - ('gan_canbeint32', create_gan_model, True), - ) - def test_sync_replicas(self, create_gan_model_fn, create_global_step): - model = create_gan_model_fn() - loss = train.gan_loss(model) - num_trainable_vars = len(variables_lib.get_trainable_variables()) - - if create_global_step: - gstep = variable_scope.get_variable( - 'custom_gstep', dtype=dtypes.int32, initializer=0, trainable=False) - ops.add_to_collection(ops.GraphKeys.GLOBAL_STEP, gstep) - - g_opt = get_sync_optimizer() - d_opt = get_sync_optimizer() - train_ops = train.gan_train_ops( - model, loss, generator_optimizer=g_opt, discriminator_optimizer=d_opt) - self.assertIsInstance(train_ops, namedtuples.GANTrainOps) - # No new trainable variables should have been added. - self.assertLen(variables_lib.get_trainable_variables(), num_trainable_vars) - - # Sync hooks should be populated in the GANTrainOps. - self.assertLen(train_ops.train_hooks, 2) - for hook in train_ops.train_hooks: - self.assertIsInstance( - hook, sync_replicas_optimizer._SyncReplicasOptimizerHook) - sync_opts = [hook._sync_optimizer for hook in train_ops.train_hooks] - self.assertSetEqual(frozenset(sync_opts), frozenset((g_opt, d_opt))) - - g_sync_init_op = g_opt.get_init_tokens_op(num_tokens=1) - d_sync_init_op = d_opt.get_init_tokens_op(num_tokens=1) - - # Check that update op is run properly. - global_step = training_util.get_or_create_global_step() - with self.test_session(use_gpu=True) as sess: - variables.global_variables_initializer().run() - variables.local_variables_initializer().run() - - g_opt.chief_init_op.run() - d_opt.chief_init_op.run() - - gstep_before = global_step.eval() - - # Start required queue runner for SyncReplicasOptimizer. - coord = coordinator.Coordinator() - g_threads = g_opt.get_chief_queue_runner().create_threads(sess, coord) - d_threads = d_opt.get_chief_queue_runner().create_threads(sess, coord) - - g_sync_init_op.run() - d_sync_init_op.run() - - train_ops.generator_train_op.eval() - # Check that global step wasn't incremented. 
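# (Editorial aside, not part of the deleted test.) The assertions below hold
# because `gan_train_ops` gives each SyncReplicasOptimizer its own dummy
# 'dummy_global_step_*' variable; only the separate `global_step_inc_op`
# advances the real global step, so running the individual train ops leaves it
# unchanged.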
- self.assertEqual(gstep_before, global_step.eval()) - - train_ops.discriminator_train_op.eval() - # Check that global step wasn't incremented. - self.assertEqual(gstep_before, global_step.eval()) - - coord.request_stop() - coord.join(g_threads + d_threads) - - @parameterized.named_parameters( - ('is_chief', True), - ('is_not_chief', False), - ) - def test_is_chief_in_train_hooks(self, is_chief): - """Make sure is_chief is propagated correctly to sync hooks.""" - model = create_gan_model() - loss = train.gan_loss(model) - g_opt = get_sync_optimizer() - d_opt = get_sync_optimizer() - train_ops = train.gan_train_ops( - model, - loss, - g_opt, - d_opt, - is_chief=is_chief, - summarize_gradients=True, - colocate_gradients_with_ops=True) - - self.assertLen(train_ops.train_hooks, 2) - for hook in train_ops.train_hooks: - self.assertIsInstance( - hook, sync_replicas_optimizer._SyncReplicasOptimizerHook) - is_chief_list = [hook._is_chief for hook in train_ops.train_hooks] - self.assertListEqual(is_chief_list, [is_chief, is_chief]) - - -class GANTrainTest(test.TestCase, parameterized.TestCase): - """Tests for `gan_train`.""" - - def _gan_train_ops(self, generator_add, discriminator_add): - step = training_util.create_global_step() - # Increment the global count every time a train op is run so we can count - # the number of times they're run. - # NOTE: `use_locking=True` is required to avoid race conditions with - # joint training. - train_ops = namedtuples.GANTrainOps( - generator_train_op=step.assign_add(generator_add, use_locking=True), - discriminator_train_op=step.assign_add( - discriminator_add, use_locking=True), - global_step_inc_op=step.assign_add(1)) - return train_ops - - @parameterized.named_parameters( - ('gan', create_gan_model), - ('callable_gan', create_callable_gan_model), - ('infogan', create_infogan_model), - ('callable_infogan', create_callable_infogan_model), - ('acgan', create_acgan_model), - ('callable_acgan', create_callable_acgan_model), - ) - def test_run_helper(self, create_gan_model_fn): - random_seed.set_random_seed(1234) - model = create_gan_model_fn() - loss = train.gan_loss(model) - - g_opt = gradient_descent.GradientDescentOptimizer(1.0) - d_opt = gradient_descent.GradientDescentOptimizer(1.0) - train_ops = train.gan_train_ops(model, loss, g_opt, d_opt) - - final_step = train.gan_train( - train_ops, - logdir='', - hooks=[basic_session_run_hooks.StopAtStepHook(num_steps=2)]) - self.assertTrue(np.isscalar(final_step)) - self.assertEqual(2, final_step) - - @parameterized.named_parameters( - ('seq_train_steps', train.get_sequential_train_hooks), - ('efficient_seq_train_steps', train.get_joint_train_hooks), - ) - def test_multiple_steps(self, get_hooks_fn_fn): - """Test multiple train steps.""" - train_ops = self._gan_train_ops(generator_add=10, discriminator_add=100) - train_steps = namedtuples.GANTrainSteps( - generator_train_steps=3, discriminator_train_steps=4) - final_step = train.gan_train( - train_ops, - get_hooks_fn=get_hooks_fn_fn(train_steps), - logdir='', - hooks=[basic_session_run_hooks.StopAtStepHook(num_steps=1)]) - - self.assertTrue(np.isscalar(final_step)) - self.assertEqual(1 + 3 * 10 + 4 * 100, final_step) - - def test_supervisor_run_gan_model_train_ops_multiple_steps(self): - step = training_util.create_global_step() - train_ops = namedtuples.GANTrainOps( - generator_train_op=constant_op.constant(3.0), - discriminator_train_op=constant_op.constant(2.0), - global_step_inc_op=step.assign_add(1)) - train_steps = namedtuples.GANTrainSteps( - 
generator_train_steps=3, discriminator_train_steps=4) - - final_loss = slim_learning.train( - train_op=train_ops, - logdir='', - global_step=step, - number_of_steps=1, - train_step_fn=train.get_sequential_train_steps(train_steps)) - self.assertTrue(np.isscalar(final_loss)) - self.assertEqual(17.0, final_loss) - - @parameterized.named_parameters( - ('gan', create_gan_model), - ('callable_gan', create_callable_gan_model), - ('infogan', create_infogan_model), - ('callable_infogan', create_callable_infogan_model), - ('acgan', create_acgan_model), - ('callable_acgan', create_callable_acgan_model), - ) - def test_train_hooks_exist_in_get_hooks_fn(self, create_gan_model_fn): - model = create_gan_model_fn() - loss = train.gan_loss(model) - - g_opt = get_sync_optimizer() - d_opt = get_sync_optimizer() - train_ops = train.gan_train_ops( - model, - loss, - g_opt, - d_opt, - summarize_gradients=True, - colocate_gradients_with_ops=True) - - sequential_train_hooks = train.get_sequential_train_hooks()(train_ops) - self.assertLen(sequential_train_hooks, 4) - sync_opts = [ - hook._sync_optimizer for hook in sequential_train_hooks if - isinstance(hook, sync_replicas_optimizer._SyncReplicasOptimizerHook)] - self.assertLen(sync_opts, 2) - self.assertSetEqual(frozenset(sync_opts), frozenset((g_opt, d_opt))) - - joint_train_hooks = train.get_joint_train_hooks()(train_ops) - self.assertLen(joint_train_hooks, 5) - sync_opts = [ - hook._sync_optimizer for hook in joint_train_hooks if - isinstance(hook, sync_replicas_optimizer._SyncReplicasOptimizerHook)] - self.assertLen(sync_opts, 2) - self.assertSetEqual(frozenset(sync_opts), frozenset((g_opt, d_opt))) - - -class PatchGANTest(test.TestCase, parameterized.TestCase): - """Tests that functions work on PatchGAN style output.""" - - @parameterized.named_parameters( - ('gan', create_gan_model), - ('callable_gan', create_callable_gan_model), - ('infogan', create_infogan_model), - ('callable_infogan', create_callable_infogan_model), - ('acgan', create_acgan_model), - ('callable_acgan', create_callable_acgan_model), - ) - def test_patchgan(self, create_gan_model_fn): - """Ensure that patch-based discriminators work end-to-end.""" - random_seed.set_random_seed(1234) - model = create_gan_model_fn() - loss = train.gan_loss(model) - - g_opt = gradient_descent.GradientDescentOptimizer(1.0) - d_opt = gradient_descent.GradientDescentOptimizer(1.0) - train_ops = train.gan_train_ops(model, loss, g_opt, d_opt) - - final_step = train.gan_train( - train_ops, - logdir='', - hooks=[basic_session_run_hooks.StopAtStepHook(num_steps=2)]) - self.assertTrue(np.isscalar(final_step)) - self.assertEqual(2, final_step) - - -if __name__ == '__main__': - test.main() diff --git a/tensorflow/contrib/gdr/BUILD b/tensorflow/contrib/gdr/BUILD index 0e8a493e15e..1eead8bff44 100644 --- a/tensorflow/contrib/gdr/BUILD +++ b/tensorflow/contrib/gdr/BUILD @@ -3,7 +3,7 @@ # For platform specific build config load( - "//tensorflow/core:platform/default/build_config.bzl", + "//tensorflow/core/platform:default/build_config.bzl", "tf_proto_library_cc", ) diff --git a/tensorflow/contrib/gdr/gdr_collective_executor_mgr.cc b/tensorflow/contrib/gdr/gdr_collective_executor_mgr.cc index c0b40194faf..4988ce6d2fe 100644 --- a/tensorflow/contrib/gdr/gdr_collective_executor_mgr.cc +++ b/tensorflow/contrib/gdr/gdr_collective_executor_mgr.cc @@ -24,6 +24,7 @@ limitations under the License. 
#include "tensorflow/core/distributed_runtime/request_id.h" #include "tensorflow/core/distributed_runtime/worker_cache.h" #include "tensorflow/core/lib/random/random.h" +#include "tensorflow/core/platform/unbounded_work_queue.h" namespace tensorflow { @@ -65,12 +66,12 @@ class RecvBufCall : public CancellableCall { class CollectiveRemoteAccessDistributed : public CollectiveRemoteAccessLocal { public: - CollectiveRemoteAccessDistributed(const DeviceMgr* dev_mgr, - DeviceResolverInterface* dev_resolver, - WorkerCacheInterface* worker_cache, - int64 step_id, - RemoteMemoryManager* remote_memory_manager) - : CollectiveRemoteAccessLocal(dev_mgr, dev_resolver, step_id), + CollectiveRemoteAccessDistributed( + const DeviceMgr* dev_mgr, DeviceResolverInterface* dev_resolver, + std::shared_ptr work_queue, + WorkerCacheInterface* worker_cache, int64 step_id, + RemoteMemoryManager* remote_memory_manager) + : CollectiveRemoteAccessLocal(dev_mgr, dev_resolver, work_queue, step_id), worker_cache_(worker_cache), remote_memory_manager_(remote_memory_manager) {} @@ -152,7 +153,7 @@ class CollectiveRemoteAccessDistributed : public CollectiveRemoteAccessLocal { CollectiveExecutor* GdrCollectiveExecutorMgr::Create(int64 step_id) { CollectiveRemoteAccessDistributed* rma = new CollectiveRemoteAccessDistributed(dev_mgr_, dev_resolver_.get(), - worker_cache_, step_id, + work_queue_, worker_cache_, step_id, remote_memory_manager_); return new BaseCollectiveExecutor(this, rma, step_id, dev_mgr_, &gpu_ring_order_); diff --git a/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc b/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc index 4744a9ee9a8..51f6201005a 100644 --- a/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc +++ b/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc @@ -163,7 +163,7 @@ class GdrRemoteRendezvous : public BaseRemoteRendezvous { recv_args, step_id_, parsed.FullKey()); // Record "call" in active_ so that it can be aborted cleanly. 
- RegisterCall(call); + RegisterCall(call, recv_args); // RendezvousMgr already aborted, shouldn't send RPC call any more if (!call->status().ok()) { diff --git a/tensorflow/contrib/graph_editor/BUILD b/tensorflow/contrib/graph_editor/BUILD index f4bed99e2dc..0683a90610b 100644 --- a/tensorflow/contrib/graph_editor/BUILD +++ b/tensorflow/contrib/graph_editor/BUILD @@ -22,7 +22,6 @@ py_library( "util.py", ], srcs_version = "PY2AND3", - tags = ["no_oss"], # b/133250576, deps = [ "//tensorflow/python:array_ops", "//tensorflow/python:framework_for_generated_wrappers", @@ -46,7 +45,6 @@ py_library( name = "match", srcs = ["tests/match.py"], srcs_version = "PY2AND3", - tags = ["no_oss"], # b/133250576, deps = [ ":graph_editor_py", "//tensorflow/python:framework_ops", @@ -59,7 +57,6 @@ py_test( srcs = ["tests/util_test.py"], python_version = "PY2", srcs_version = "PY2AND3", - tags = ["no_oss"], # b/133250576, deps = [ ":graph_editor_py", "//tensorflow/python:client_testlib", @@ -73,7 +70,6 @@ py_test( srcs = ["tests/select_test.py"], python_version = "PY2", srcs_version = "PY2AND3", - tags = ["no_oss"], # b/133250576, deps = [ ":graph_editor_py", "//tensorflow/python:client_testlib", @@ -87,7 +83,6 @@ py_test( srcs = ["tests/match_test.py"], python_version = "PY2", srcs_version = "PY2AND3", - tags = ["no_oss"], # b/133250576, deps = [ ":match", "//tensorflow/python:client_testlib", @@ -101,7 +96,6 @@ py_test( srcs = ["tests/subgraph_test.py"], python_version = "PY2", srcs_version = "PY2AND3", - tags = ["no_oss"], # b/133250576, deps = [ ":graph_editor_py", "//tensorflow/python:client_testlib", @@ -115,7 +109,6 @@ py_test( srcs = ["tests/reroute_test.py"], python_version = "PY2", srcs_version = "PY2AND3", - tags = ["no_oss"], # b/133250576, deps = [ ":graph_editor_py", ":match", @@ -130,7 +123,6 @@ py_test( srcs = ["tests/edit_test.py"], python_version = "PY2", srcs_version = "PY2AND3", - tags = ["no_oss"], # b/133250576, deps = [ ":graph_editor_py", ":match", @@ -145,7 +137,6 @@ py_test( srcs = ["tests/transform_test.py"], python_version = "PY2", srcs_version = "PY2AND3", - tags = ["no_oss"], # b/133250576, deps = [ ":graph_editor_py", ":match", diff --git a/tensorflow/contrib/graph_editor/util.py b/tensorflow/contrib/graph_editor/util.py index 4b53d182f34..543c1da7e33 100644 --- a/tensorflow/contrib/graph_editor/util.py +++ b/tensorflow/contrib/graph_editor/util.py @@ -19,11 +19,11 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import collections import re from six import iteritems from tensorflow.python.framework import ops as tf_ops from tensorflow.python.ops import array_ops as tf_array_ops +from tensorflow.python.util.compat import collections_abc __all__ = [ "make_list_of_op", @@ -157,7 +157,7 @@ def transform_tree(tree, fn, iterable_type=tuple): res = tree.__new__(type(tree), (transform_tree(child, fn) for child in tree)) return res - elif isinstance(tree, collections.Sequence): + elif isinstance(tree, collections_abc.Sequence): res = tree.__new__(type(tree)) res.__init__(transform_tree(child, fn) for child in tree) return res diff --git a/tensorflow/contrib/hadoop/kernels/hadoop_dataset_ops.cc b/tensorflow/contrib/hadoop/kernels/hadoop_dataset_ops.cc index 2bf6097d013..243c2a40298 100644 --- a/tensorflow/contrib/hadoop/kernels/hadoop_dataset_ops.cc +++ b/tensorflow/contrib/hadoop/kernels/hadoop_dataset_ops.cc @@ -31,12 +31,13 @@ class SequenceFileReader { new io::BufferedInputStream(file, kSequenceFileBufferSize)) {} Status 
ReadHeader() { - string version; + tstring version; TF_RETURN_IF_ERROR(input_stream_->ReadNBytes(4, &version)); - if (version.substr(0, 3) != "SEQ" || version[3] != 6) { + StringPiece version_view(version); + if (version_view.substr(0, 3) != "SEQ" || version[3] != 6) { return errors::InvalidArgument( "sequence file header must starts with `SEQ6`, received \"", - version.substr(0, 3), static_cast(version[3]), "\""); + version_view.substr(0, 3), static_cast(version[3]), "\""); } TF_RETURN_IF_ERROR(ReadString(&key_class_name_)); TF_RETURN_IF_ERROR(ReadString(&value_class_name_)); @@ -50,7 +51,7 @@ class SequenceFileReader { "' is currently not supported"); } - string buffer; + tstring buffer; TF_RETURN_IF_ERROR(input_stream_->ReadNBytes(2, &buffer)); compression_ = buffer[0]; block_compression_ = buffer[1]; @@ -84,12 +85,12 @@ class SequenceFileReader { return Status::OK(); } - Status ReadRecord(string* key, string* value) { + Status ReadRecord(tstring* key, tstring* value) { uint32 length = 0; TF_RETURN_IF_ERROR(ReadUInt32(&length)); if (length == static_cast<uint32>(-1)) { // Sync marker. - string sync_marker; + tstring sync_marker; TF_RETURN_IF_ERROR( input_stream_->ReadNBytes(kSyncMarkerSize, &sync_marker)); if (sync_marker != sync_marker_) { @@ -114,7 +115,7 @@ class SequenceFileReader { return Status::OK(); } - Status ReadString(string* value) { + Status ReadString(tstring* value) { int64 length = 0; TF_RETURN_IF_ERROR(ReadVInt(&length)); if (value == nullptr) { @@ -124,7 +125,7 @@ class SequenceFileReader { } Status ReadUInt32(uint32* value) { - string buffer; + tstring buffer; TF_RETURN_IF_ERROR(input_stream_->ReadNBytes(4, &buffer)); *value = ((static_cast<uint32>(buffer[0]) << 24) | static_cast<uint32>(buffer[1]) << 16) | @@ -134,7 +135,7 @@ class SequenceFileReader { } Status ReadVInt(int64* value) { - string buffer; + tstring buffer; TF_RETURN_IF_ERROR(input_stream_->ReadNBytes(1, &buffer)); if (buffer[0] >= -112) { *value = static_cast<int64>(buffer[0]); @@ -167,12 +168,12 @@ class SequenceFileReader { private: std::unique_ptr<io::BufferedInputStream> input_stream_; - string key_class_name_; - string value_class_name_; - string sync_marker_; + tstring key_class_name_; + tstring value_class_name_; + tstring sync_marker_; bool compression_; bool block_compression_; - string compression_codec_class_name_; + tstring compression_codec_class_name_; TF_DISALLOW_COPY_AND_ASSIGN(SequenceFileReader); }; class SequenceFileDatasetOp : public DatasetOpKernel { @@ -198,7 +199,7 @@ class SequenceFileDatasetOp : public DatasetOpKernel { std::vector<string> filenames; filenames.reserve(filenames_tensor->NumElements()); for (int i = 0; i < filenames_tensor->NumElements(); ++i) { - filenames.push_back(filenames_tensor->flat<string>()(i)); + filenames.push_back(filenames_tensor->flat<tstring>()(i)); } *output = new Dataset(ctx, filenames, output_types_); @@ -233,6 +234,8 @@ class SequenceFileDatasetOp : public DatasetOpKernel { return "SequenceFileDatasetOp::Dataset"; } + Status CheckExternalState() const override { return Status::OK(); } + protected: Status AsGraphDefInternal(SerializationContext* ctx, DatasetGraphDefBuilder* b, @@ -256,17 +259,17 @@ class SequenceFileDatasetOp : public DatasetOpKernel { do { // We are currently processing a file, so try to read the next record.
if (reader_) { - string key, value; + tstring key, value; Status status = reader_->ReadRecord(&key, &value); if (!errors::IsOutOfRange(status)) { TF_RETURN_IF_ERROR(status); Tensor key_tensor(ctx->allocator({}), DT_STRING, {}); - key_tensor.scalar<string>()() = key; + key_tensor.scalar<tstring>()() = std::move(key); out_tensors->emplace_back(std::move(key_tensor)); Tensor value_tensor(ctx->allocator({}), DT_STRING, {}); - value_tensor.scalar<string>()() = value; + value_tensor.scalar<tstring>()() = std::move(value); out_tensors->emplace_back(std::move(value_tensor)); *end_of_sequence = false; diff --git a/tensorflow/contrib/ignite/kernels/dataset/ignite_binary_object_parser.cc b/tensorflow/contrib/ignite/kernels/dataset/ignite_binary_object_parser.cc index 4218ec05f2c..41c9a8b1f49 100644 --- a/tensorflow/contrib/ignite/kernels/dataset/ignite_binary_object_parser.cc +++ b/tensorflow/contrib/ignite/kernels/dataset/ignite_binary_object_parser.cc @@ -73,7 +73,7 @@ Status BinaryObjectParser::Parse(uint8_t** ptr, } case STRING: { out_tensors->emplace_back(cpu_allocator(), DT_STRING, TensorShape({})); - out_tensors->back().scalar<string>()() = ParseString(ptr); + out_tensors->back().scalar<tstring>()() = ParseString(ptr); break; } case DATE: { @@ -150,7 +150,7 @@ Status BinaryObjectParser::Parse(uint8_t** ptr, out_tensors->emplace_back(cpu_allocator(), DT_STRING, TensorShape({length})); for (int32_t i = 0; i < length; i++) - out_tensors->back().vec<string>()(i) = ParseString(ptr); + out_tensors->back().vec<tstring>()(i) = ParseString(ptr); break; } case DATE_ARR: { diff --git a/tensorflow/contrib/ignite/kernels/dataset/ignite_dataset_iterator.cc b/tensorflow/contrib/ignite/kernels/dataset/ignite_dataset_iterator.cc index ce8972f1e7f..67a84b99cff 100644 --- a/tensorflow/contrib/ignite/kernels/dataset/ignite_dataset_iterator.cc +++ b/tensorflow/contrib/ignite/kernels/dataset/ignite_dataset_iterator.cc @@ -379,7 +379,7 @@ Status IgniteDatasetIterator::LoadNextPage() { Status IgniteDatasetIterator::ReceivePage(int32_t page_size) { remainder_ = page_size; - page_ = std::unique_ptr<uint8_t>(new uint8_t[remainder_]); + page_ = std::unique_ptr<uint8_t[]>(new uint8_t[remainder_]); ptr_ = page_.get(); uint64 start = Env::Default()->NowMicros(); diff --git a/tensorflow/contrib/ignite/kernels/dataset/ignite_dataset_iterator.h b/tensorflow/contrib/ignite/kernels/dataset/ignite_dataset_iterator.h index 5868c2cb67f..2e5051105a9 100644 --- a/tensorflow/contrib/ignite/kernels/dataset/ignite_dataset_iterator.h +++ b/tensorflow/contrib/ignite/kernels/dataset/ignite_dataset_iterator.h @@ -74,7 +74,7 @@ class IgniteDatasetIterator : public DatasetIterator { mutex mutex_; - std::unique_ptr<uint8_t> page_; + std::unique_ptr<uint8_t[]> page_; uint8_t* ptr_; }; diff --git a/tensorflow/contrib/ignite/kernels/dataset/ignite_dataset_ops.cc b/tensorflow/contrib/ignite/kernels/dataset/ignite_dataset_ops.cc index e3593ac6c7a..c28dbeae079 100644 --- a/tensorflow/contrib/ignite/kernels/dataset/ignite_dataset_ops.cc +++ b/tensorflow/contrib/ignite/kernels/dataset/ignite_dataset_ops.cc @@ -71,8 +71,8 @@ class IgniteDatasetOp : public DatasetOpKernel { using DatasetOpKernel::DatasetOpKernel; void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override { - string cache_name = ""; - string host = ""; + tstring cache_name = ""; + tstring host = ""; int32 port = -1; bool local = false; int32 part = -1; @@ -96,17 +96,17 @@ class IgniteDatasetOp : public DatasetOpKernel { const char* env_cert_password = std::getenv("IGNITE_DATASET_CERT_PASSWORD"); if (env_cache_name) { - cache_name = string(env_cache_name); + cache_name =
env_cache_name; } else { - OP_REQUIRES_OK(ctx, data::ParseScalarArgument<std::string>(ctx, "cache_name", - &cache_name)); + OP_REQUIRES_OK(ctx, data::ParseScalarArgument<tstring>(ctx, "cache_name", + &cache_name)); } if (env_host) { - host = string(env_host); + host = env_host; } else { OP_REQUIRES_OK(ctx, - data::ParseScalarArgument<std::string>(ctx, "host", &host)); + data::ParseScalarArgument<tstring>(ctx, "host", &host)); } if (env_port) { @@ -145,13 +145,13 @@ class IgniteDatasetOp : public DatasetOpKernel { ctx, data::ParseScalarArgument<int32>(ctx, "page_size", &page_size)); } - if (env_username) username = string(env_username); + if (env_username) username = env_username; - if (env_password) password = string(env_password); + if (env_password) password = env_password; - if (env_certfile) certfile = string(env_certfile); + if (env_certfile) certfile = env_certfile; - if (env_keyfile) keyfile = string(env_keyfile); + if (env_keyfile) keyfile = env_keyfile; if (env_cert_password) cert_password = string(env_cert_password); diff --git a/tensorflow/contrib/image/BUILD b/tensorflow/contrib/image/BUILD index 200e3476a9e..4b14b9e08cf 100755 --- a/tensorflow/contrib/image/BUILD +++ b/tensorflow/contrib/image/BUILD @@ -111,9 +111,6 @@ cuda_py_test( "//tensorflow/python:math_ops", "//tensorflow/python:platform_test", ], - tags = [ - "notap", # b/136286905 - ], ) tf_custom_op_library( diff --git a/tensorflow/contrib/image/kernels/segmentation_ops.cc b/tensorflow/contrib/image/kernels/segmentation_ops.cc index 93722896233..b9d615613cc 100644 --- a/tensorflow/contrib/image/kernels/segmentation_ops.cc +++ b/tensorflow/contrib/image/kernels/segmentation_ops.cc @@ -128,7 +128,7 @@ struct ImageConnectedComponentsFunctor { // Connected components (arguably) make sense for number, bool, and string types TF_CALL_NUMBER_TYPES(REGISTER_IMAGE_CONNECTED_COMPONENTS); TF_CALL_bool(REGISTER_IMAGE_CONNECTED_COMPONENTS); -TF_CALL_string(REGISTER_IMAGE_CONNECTED_COMPONENTS); +TF_CALL_tstring(REGISTER_IMAGE_CONNECTED_COMPONENTS); #undef REGISTER_IMAGE_CONNECTED_COMPONENTS // TODO(ringwalt): Implement on GPU. We probably want to stick to the original diff --git a/tensorflow/contrib/image/python/ops/image_ops.py b/tensorflow/contrib/image/python/ops/image_ops.py index 05ba9155c40..96f6af2ac51 100644 --- a/tensorflow/contrib/image/python/ops/image_ops.py +++ b/tensorflow/contrib/image/python/ops/image_ops.py @@ -506,7 +506,7 @@ def connected_components(images): # constructing multiple additional large tensors. components_flat = array_ops.reshape(components, [-1]) unique_ids, id_index = array_ops.unique(components_flat) - id_is_zero = array_ops.where(math_ops.equal(unique_ids, 0))[:, 0] + id_is_zero = array_ops.where_v2(math_ops.equal(unique_ids, 0))[:, 0] # Map each nonzero id to consecutive values. nonzero_consecutive_ids = math_ops.range( array_ops.shape(unique_ids)[0] - array_ops.shape(id_is_zero)[0]) + 1 diff --git a/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py b/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py index 2b0bcf64019..dfc6af3e558 100755 --- a/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py +++ b/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py @@ -48,7 +48,7 @@ def single_image_random_dot_stereograms(depth_values, corrupt the encode 3-D data within the image. Based upon [this - paper](http://www.learningace.com/doc/4331582/b6ab058d1e206d68ab60e4e1ead2fe6e/sirds-paper).
+ paper](https://www.cs.waikato.ac.nz/~ihw/papers/94-HWT-SI-IHW-SIRDS-paper.pdf). This outputs a SIRDS image as picture_out.png: diff --git a/tensorflow/contrib/input_pipeline/BUILD b/tensorflow/contrib/input_pipeline/BUILD index 777399184e8..4fd9e2c5b95 100644 --- a/tensorflow/contrib/input_pipeline/BUILD +++ b/tensorflow/contrib/input_pipeline/BUILD @@ -12,7 +12,7 @@ load( "tf_kernel_library", ) load( - "//tensorflow/core:platform/default/build_config.bzl", + "//tensorflow/core/platform:default/build_config.bzl", "tf_kernel_tests_linkstatic", ) load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library") diff --git a/tensorflow/contrib/input_pipeline/kernels/input_pipeline_kernels.cc b/tensorflow/contrib/input_pipeline/kernels/input_pipeline_kernels.cc index 886f6798150..d5da76a753f 100644 --- a/tensorflow/contrib/input_pipeline/kernels/input_pipeline_kernels.cc +++ b/tensorflow/contrib/input_pipeline/kernels/input_pipeline_kernels.cc @@ -30,7 +30,7 @@ class ObtainNextOp : public OpKernel { const Tensor* list; OP_REQUIRES_OK(ctx, ctx->input("list", &list)); int64 num_elements = list->NumElements(); - auto list_flat = list->flat<string>(); + auto list_flat = list->flat<tstring>(); // Allocate output. Tensor* output_tensor = nullptr; @@ -48,7 +48,7 @@ class ObtainNextOp : public OpKernel { *pos = (*pos + 1) % num_elements; // Assign value to output. - output_tensor->scalar<string>()() = list_flat(*pos); + output_tensor->scalar<tstring>()() = list_flat(*pos); } }; diff --git a/tensorflow/contrib/kafka/kernels/kafka_dataset_ops.cc b/tensorflow/contrib/kafka/kernels/kafka_dataset_ops.cc index bb0d4c178dc..a3875bb4a19 100644 --- a/tensorflow/contrib/kafka/kernels/kafka_dataset_ops.cc +++ b/tensorflow/contrib/kafka/kernels/kafka_dataset_ops.cc @@ -33,15 +33,15 @@ class KafkaDatasetOp : public DatasetOpKernel { std::vector<string> topics; topics.reserve(topics_tensor->NumElements()); for (int i = 0; i < topics_tensor->NumElements(); ++i) { - topics.push_back(topics_tensor->flat<string>()(i)); + topics.push_back(topics_tensor->flat<tstring>()(i)); } std::string servers = ""; OP_REQUIRES_OK( - ctx, data::ParseScalarArgument<std::string>(ctx, "servers", &servers)); + ctx, data::ParseScalarArgument<tstring>(ctx, "servers", &servers)); std::string group = ""; - OP_REQUIRES_OK( - ctx, data::ParseScalarArgument<std::string>(ctx, "group", &group)); + OP_REQUIRES_OK(ctx, + data::ParseScalarArgument<tstring>(ctx, "group", &group)); bool eof = false; OP_REQUIRES_OK(ctx, data::ParseScalarArgument<bool>(ctx, "eof", &eof)); int64 timeout = -1; @@ -128,9 +128,9 @@ class KafkaDatasetOp : public DatasetOpKernel { if (message->err() == RdKafka::ERR_NO_ERROR) { // Produce the line as output.
Tensor line_tensor(cpu_allocator(), DT_STRING, {}); - line_tensor.scalar<string>()() = - std::string(static_cast<const char*>(message->payload()), - message->len()); + line_tensor.scalar<tstring>()().assign( + static_cast<const char*>(message->payload()), + message->len()); out_tensors->emplace_back(std::move(line_tensor)); *end_of_sequence = false; // Sync offset diff --git a/tensorflow/contrib/kinesis/kernels/kinesis_dataset_ops.cc b/tensorflow/contrib/kinesis/kernels/kinesis_dataset_ops.cc index 8919d5efedf..88d1aa1bd22 100644 --- a/tensorflow/contrib/kinesis/kernels/kinesis_dataset_ops.cc +++ b/tensorflow/contrib/kinesis/kernels/kinesis_dataset_ops.cc @@ -148,11 +148,11 @@ class KinesisDatasetOp : public DatasetOpKernel { void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override { std::string stream = ""; - OP_REQUIRES_OK( - ctx, data::ParseScalarArgument<std::string>(ctx, "stream", &stream)); + OP_REQUIRES_OK(ctx, + data::ParseScalarArgument<tstring>(ctx, "stream", &stream)); std::string shard = ""; - OP_REQUIRES_OK( - ctx, data::ParseScalarArgument<std::string>(ctx, "shard", &shard)); + OP_REQUIRES_OK(ctx, + data::ParseScalarArgument<tstring>(ctx, "shard", &shard)); bool read_indefinitely = true; OP_REQUIRES_OK(ctx, data::ParseScalarArgument<bool>( ctx, "read_indefinitely", &read_indefinitely)); diff --git a/tensorflow/contrib/labeled_tensor/python/ops/_typecheck.py b/tensorflow/contrib/labeled_tensor/python/ops/_typecheck.py index 1783a07fac9..3a257d81887 100644 --- a/tensorflow/contrib/labeled_tensor/python/ops/_typecheck.py +++ b/tensorflow/contrib/labeled_tensor/python/ops/_typecheck.py @@ -21,11 +21,11 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import collections import functools import re from tensorflow.python.util import tf_inspect +from tensorflow.python.util.compat import collections_abc # used for register_type_abbreviation and _type_repr below.
_TYPE_ABBREVIATIONS = {} @@ -114,7 +114,7 @@ class Sequence(_SingleArgumentType): """ def __instancecheck__(self, instance): - return (isinstance(instance, collections.Sequence) and + return (isinstance(instance, collections_abc.Sequence) and all(isinstance(x, self._type) for x in instance)) @@ -130,9 +130,9 @@ class Collection(_SingleArgumentType): """ def __instancecheck__(self, instance): - return (isinstance(instance, collections.Iterable) and - isinstance(instance, collections.Sized) and - isinstance(instance, collections.Container) and + return (isinstance(instance, collections_abc.Iterable) and + isinstance(instance, collections_abc.Sized) and + isinstance(instance, collections_abc.Container) and all(isinstance(x, self._type) for x in instance)) @@ -157,7 +157,7 @@ class Mapping(_TwoArgumentType): def __instancecheck__(self, instance): key_type, value_type = self._types # pylint: disable=unbalanced-tuple-unpacking - return (isinstance(instance, collections.Mapping) and + return (isinstance(instance, collections_abc.Mapping) and all(isinstance(k, key_type) for k in instance.keys()) and all(isinstance(k, value_type) for k in instance.values())) diff --git a/tensorflow/contrib/labeled_tensor/python/ops/core.py b/tensorflow/contrib/labeled_tensor/python/ops/core.py index b0961e5b3a2..394254cbd90 100644 --- a/tensorflow/contrib/labeled_tensor/python/ops/core.py +++ b/tensorflow/contrib/labeled_tensor/python/ops/core.py @@ -41,11 +41,12 @@ from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops +from tensorflow.python.util.compat import collections_abc # pylint: disable=invalid-name # Types coercible to Axis.labels -# We use this instead of collections.Sequence to exclude strings. +# We use this instead of collections_abc.Sequence to exclude strings. LabelsLike = tc.Union(np.ndarray, range, list, tuple) # Types coercible to a tf.compat.v1.Dimension @@ -195,7 +196,7 @@ def as_axis(axis_data): return axis -class Axes(collections.Mapping): +class Axes(collections_abc.Mapping): """Axis names and indices for a tensor. It is an ordered mapping, with keys given by axis name and values given @@ -719,7 +720,7 @@ def transpose(labeled_tensor, axis_order=None, name=None): @tc.accepts(LabeledTensorLike, tc.Collection( tc.Union(string_types, - tc.Tuple(string_types, collections.Hashable))), + tc.Tuple(string_types, collections_abc.Hashable))), tc.Optional(string_types)) def expand_dims(labeled_tensor, axes, name=None): """Insert dimensions of size 1. @@ -1055,7 +1056,7 @@ def align(labeled_tensor_0, labeled_tensor_1, name=None): @tc.returns(types.FunctionType) -@tc.accepts(string_types, collections.Callable) +@tc.accepts(string_types, collections_abc.Callable) def define_unary_op(op_name, elementwise_function): """Define a unary operation for labeled tensors. @@ -1124,7 +1125,7 @@ sigmoid = define_unary_op('sigmoid', math_ops.sigmoid) @tc.returns(types.FunctionType) -@tc.accepts(string_types, collections.Callable) +@tc.accepts(string_types, collections_abc.Callable) def define_binary_op(op_name, elementwise_function): """Define a binary operation that broadcasts labeled tensors. 
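The `_typecheck.py` and `core.py` hunks above (and the `ops.py` hunk that follows) all make the same substitution: the abstract base classes `Mapping`, `Sequence`, `Hashable`, and `Callable` are resolved through `collections_abc` instead of the top-level `collections` module, which newer Python 3 releases no longer re-export. A minimal sketch of the compatibility pattern behind `tensorflow.python.util.compat.collections_abc`; the helper function below is illustrative only and not part of this change:

```python
# Illustrative sketch: resolve the collections ABCs so the same import
# works on both Python 2 and Python 3, which is what the compat module
# referenced in these hunks provides.
try:
  import collections.abc as collections_abc  # Python 3.3+
except ImportError:
  import collections as collections_abc      # Python 2 fallback


def is_nonstring_sequence(value):
  """Returns True for list/tuple-like values but not for strings."""
  return (isinstance(value, collections_abc.Sequence) and
          not isinstance(value, (str, bytes)))


print(is_nonstring_sequence([1, 2, 3]))  # True
print(is_nonstring_sequence("abc"))      # False
```
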
diff --git a/tensorflow/contrib/labeled_tensor/python/ops/ops.py b/tensorflow/contrib/labeled_tensor/python/ops/ops.py index a04e3772799..35ab141a18f 100644 --- a/tensorflow/contrib/labeled_tensor/python/ops/ops.py +++ b/tensorflow/contrib/labeled_tensor/python/ops/ops.py @@ -17,7 +17,6 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import collections import types import numpy as np @@ -34,6 +33,7 @@ from tensorflow.python.ops import math_ops from tensorflow.python.ops import numerics from tensorflow.python.ops import random_ops from tensorflow.python.training import input # pylint: disable=redefined-builtin +from tensorflow.python.util.compat import collections_abc @tc.returns(core.LabeledTensor) @@ -52,7 +52,7 @@ def _gather_1d_on_axis(labeled_tensor, indexer, axis, name=None): @tc.returns(core.LabeledTensor) @tc.accepts(core.LabeledTensorLike, tc.Mapping(string_types, - tc.Union(slice, collections.Hashable, list)), + tc.Union(slice, collections_abc.Hashable, list)), tc.Optional(string_types)) def select(labeled_tensor, selection, name=None): """Slice out a subset of the tensor. @@ -111,8 +111,8 @@ def select(labeled_tensor, selection, name=None): slices[axis_name] = slice(start, stop) # Needs to be after checking for slices, since slice objects claim to be - # instances of collections.Hashable but hash() on them fails. - elif isinstance(value, collections.Hashable): + # instances of collections_abc.Hashable but hash() on them fails. + elif isinstance(value, collections_abc.Hashable): slices[axis_name] = axis.index(value) elif isinstance(value, list): @@ -400,7 +400,7 @@ def rename_axis(labeled_tensor, existing_name, new_name, name=None): @tc.returns(tc.List(core.LabeledTensor)) -@tc.accepts(string_types, collections.Callable, int, bool, +@tc.accepts(string_types, collections_abc.Callable, int, bool, tc.Collection(core.LabeledTensorLike), bool, tc.Optional(string_types)) def _batch_helper(default_name, @@ -606,7 +606,7 @@ def random_crop(labeled_tensor, shape_map, seed=None, name=None): # TODO(shoyer): Allow the user to select the axis over which to map. @tc.returns(core.LabeledTensor) -@tc.accepts(collections.Callable, core.LabeledTensorLike, +@tc.accepts(collections_abc.Callable, core.LabeledTensorLike, tc.Optional(string_types)) def map_fn(fn, labeled_tensor, name=None): """Map on the list of tensors unpacked from labeled_tensor. @@ -661,7 +661,7 @@ def map_fn(fn, labeled_tensor, name=None): @tc.returns(core.LabeledTensor) -@tc.accepts(collections.Callable, core.LabeledTensorLike, +@tc.accepts(collections_abc.Callable, core.LabeledTensorLike, core.LabeledTensorLike, tc.Optional(string_types)) def foldl(fn, labeled_tensor, initial_value, name=None): """Left fold on the list of tensors unpacked from labeled_tensor. @@ -754,7 +754,7 @@ def squeeze(labeled_tensor, axis_names=None, name=None): # pylint: disable=invalid-name ReduceAxis = tc.Union(string_types, - tc.Tuple(string_types, collections.Hashable)) + tc.Tuple(string_types, collections_abc.Hashable)) ReduceAxes = tc.Optional(tc.Union(ReduceAxis, tc.Collection(ReduceAxis))) # pylint: enable=invalid-name @@ -876,7 +876,7 @@ def matmul(a, b, name=None): @tc.returns(types.FunctionType) -@tc.accepts(string_types, collections.Callable) +@tc.accepts(string_types, collections_abc.Callable) def define_reduce_op(op_name, reduce_fn): """Define a reduction op for labeled tensors. 
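Both labeled_tensor files lean on the `tc.accepts` and `tc.returns` decorators, which run `isinstance` checks against these ABCs at call time; that is why references such as `collections.Callable` and `collections.Hashable` had to move to `collections_abc` as well. A simplified, hypothetical decorator in the same spirit (not the actual `_typecheck` implementation) is sketched below:

```python
# Simplified sketch of an @accepts-style decorator for positional
# arguments; the real labeled_tensor._typecheck module is more general.
import functools

try:
  import collections.abc as collections_abc
except ImportError:
  import collections as collections_abc


def accepts(*arg_types):
  """Raises TypeError when positional arguments have unexpected types."""
  def decorator(fn):
    @functools.wraps(fn)
    def wrapper(*args, **kwargs):
      for value, expected in zip(args, arg_types):
        if not isinstance(value, expected):
          raise TypeError('%r is not an instance of %r' % (value, expected))
      return fn(*args, **kwargs)
    return wrapper
  return decorator


@accepts(str, collections_abc.Callable)
def define_unary_op(op_name, elementwise_function):
  return op_name, elementwise_function
```
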
diff --git a/tensorflow/contrib/layers/BUILD b/tensorflow/contrib/layers/BUILD index 8e410006c16..6010b072418 100644 --- a/tensorflow/contrib/layers/BUILD +++ b/tensorflow/contrib/layers/BUILD @@ -77,6 +77,8 @@ tf_custom_op_py_library( srcs_version = "PY2AND3", visibility = [ "//learning/brain:__subpackages__", + "//learning/lib/ami/simple_ml/link_other_ml_tools/tensorflow:__subpackages__", + "//storage/d/analysis/prefetch:__pkg__", "//tensorflow:__subpackages__", "//tensorflow_model_optimization:__subpackages__", "//third_party/py/tf_slim:__subpackages__", @@ -154,6 +156,7 @@ cuda_py_test( "//tensorflow/python:variables", "//tensorflow/python/ops/losses:losses", ], + xla_enable_strict_auto_jit = False, ) py_test( diff --git a/tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc b/tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc index ee4b0373ef7..3fe4bd48748 100644 --- a/tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc +++ b/tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc @@ -78,16 +78,16 @@ template <> int64 SparseTensorColumn::Feature(int64 batch, int64 n) const { const int64 start = feature_start_indices_[batch]; if (DT_STRING == values_.dtype()) - return Fingerprint64(values_.vec().data()[start + n]); + return Fingerprint64(values_.vec().data()[start + n]); return values_.vec().data()[start + n]; } // InternalType is string or StringPiece when using StringCrosser. template <> -string SparseTensorColumn::Feature(int64 batch, int64 n) const { +tstring SparseTensorColumn::Feature(int64 batch, int64 n) const { const int64 start = feature_start_indices_[batch]; if (DT_STRING == values_.dtype()) - return values_.vec().data()[start + n]; + return values_.vec().data()[start + n]; return std::to_string(values_.vec().data()[start + n]); } @@ -95,7 +95,7 @@ template <> StringPiece SparseTensorColumn::Feature(int64 batch, int64 n) const { const int64 start = feature_start_indices_[batch]; - return values_.vec().data()[start + n]; + return values_.vec().data()[start + n]; } // A column that is backed by a dense tensor. @@ -118,21 +118,21 @@ class DenseTensorColumn : public ColumnInterface { template <> int64 DenseTensorColumn::Feature(int64 batch, int64 n) const { if (DT_STRING == tensor_.dtype()) - return Fingerprint64(tensor_.matrix()(batch, n)); + return Fingerprint64(tensor_.matrix()(batch, n)); return tensor_.matrix()(batch, n); } // Internal type is string or StringPiece when using StringCrosser. template <> -string DenseTensorColumn::Feature(int64 batch, int64 n) const { - if (DT_STRING == tensor_.dtype()) return tensor_.matrix()(batch, n); +tstring DenseTensorColumn::Feature(int64 batch, int64 n) const { + if (DT_STRING == tensor_.dtype()) return tensor_.matrix()(batch, n); return std::to_string(tensor_.matrix()(batch, n)); } template <> StringPiece DenseTensorColumn::Feature(int64 batch, int64 n) const { - return tensor_.matrix()(batch, n); + return tensor_.matrix()(batch, n); } // Updates Output tensors with sparse crosses. 
@@ -310,7 +310,7 @@ struct CrossTraits; template struct CrossTraits { typedef StringCrosser Crosser; - typedef OutputUpdater Updater; + typedef OutputUpdater Updater; }; template <> @@ -598,20 +598,20 @@ class SparseFeatureCrossOp : public OpKernel { REGISTER_KERNEL_BUILDER(Name("SparseFeatureCross") .Device(DEVICE_CPU) - .TypeConstraint("out_type") - .TypeConstraint("internal_type"), + .TypeConstraint("out_type") + .TypeConstraint("internal_type"), SparseFeatureCrossOp); REGISTER_KERNEL_BUILDER(Name("SparseFeatureCross") .Device(DEVICE_CPU) - .TypeConstraint("out_type") + .TypeConstraint("out_type") .TypeConstraint("internal_type"), - SparseFeatureCrossOp); + SparseFeatureCrossOp); REGISTER_KERNEL_BUILDER(Name("SparseFeatureCross") .Device(DEVICE_CPU) .TypeConstraint("out_type") - .TypeConstraint("internal_type"), + .TypeConstraint("internal_type"), SparseFeatureCrossOp); REGISTER_KERNEL_BUILDER(Name("SparseFeatureCross") @@ -624,20 +624,20 @@ REGISTER_KERNEL_BUILDER(Name("SparseFeatureCross") // crosses features. REGISTER_KERNEL_BUILDER(Name("SparseFeatureCrossV2") .Device(DEVICE_CPU) - .TypeConstraint("out_type") - .TypeConstraint("internal_type"), + .TypeConstraint("out_type") + .TypeConstraint("internal_type"), SparseFeatureCrossOp); REGISTER_KERNEL_BUILDER(Name("SparseFeatureCrossV2") .Device(DEVICE_CPU) - .TypeConstraint("out_type") + .TypeConstraint("out_type") .TypeConstraint("internal_type"), - SparseFeatureCrossOp); + SparseFeatureCrossOp); REGISTER_KERNEL_BUILDER(Name("SparseFeatureCrossV2") .Device(DEVICE_CPU) .TypeConstraint("out_type") - .TypeConstraint("internal_type"), + .TypeConstraint("internal_type"), SparseFeatureCrossOp); REGISTER_KERNEL_BUILDER(Name("SparseFeatureCrossV2") diff --git a/tensorflow/contrib/layers/python/layers/feature_column.py b/tensorflow/contrib/layers/python/layers/feature_column.py index e47a52a7072..385dcc0d80a 100644 --- a/tensorflow/contrib/layers/python/layers/feature_column.py +++ b/tensorflow/contrib/layers/python/layers/feature_column.py @@ -155,6 +155,7 @@ from tensorflow.python.ops import variables from tensorflow.python.platform import tf_logging as logging from tensorflow.python.util import deprecation from tensorflow.python.util import nest +from tensorflow.python.util.compat import collections_abc # Imports the core `InputLayer` symbol in contrib during development. InputLayer = fc_core.InputLayer # pylint: disable=invalid-name @@ -1403,7 +1404,7 @@ def shared_embedding_columns(sparse_id_columns, least one element of `sparse_id_columns` is not a `SparseColumn` or a `WeightedSparseColumn`. 
""" - if (not isinstance(sparse_id_columns, collections.Sequence) or + if (not isinstance(sparse_id_columns, collections_abc.Sequence) or isinstance(sparse_id_columns, six.string_types)): raise TypeError( "sparse_id_columns must be a non-string sequence (ex: list or tuple) " diff --git a/tensorflow/contrib/learn/BUILD b/tensorflow/contrib/learn/BUILD index 65e8d75e5c5..d48edc027a2 100644 --- a/tensorflow/contrib/learn/BUILD +++ b/tensorflow/contrib/learn/BUILD @@ -25,6 +25,7 @@ py_library( srcs_version = "PY2AND3", visibility = [ "//learning/brain:__subpackages__", + "//storage/d/analysis/prefetch:__pkg__", "//tensorflow:__subpackages__", "//video/youtube/personalization:__subpackages__", ], diff --git a/tensorflow/contrib/learn/python/learn/estimators/_sklearn.py b/tensorflow/contrib/learn/python/learn/estimators/_sklearn.py index 99f22d182cd..a15bbce515b 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/_sklearn.py +++ b/tensorflow/contrib/learn/python/learn/estimators/_sklearn.py @@ -19,12 +19,13 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import collections import os import numpy as np import six +from tensorflow.python.util.compat import collections_abc + def _pprint(d): return ', '.join(['%s=%s' % (key, str(value)) for key, value in d.items()]) @@ -55,7 +56,7 @@ class _BaseEstimator(object): for key in param_names: value = getattr(self, key, None) - if isinstance(value, collections.Callable): + if isinstance(value, collections_abc.Callable): continue # XXX: should we rather test if instance of estimator? diff --git a/tensorflow/contrib/learn/python/learn/estimators/model_fn.py b/tensorflow/contrib/learn/python/learn/estimators/model_fn.py index 5ce5c02cc63..fcabbf69425 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/model_fn.py +++ b/tensorflow/contrib/learn/python/learn/estimators/model_fn.py @@ -162,7 +162,7 @@ class ModelFnOps( loss_shape = loss.get_shape() if loss_shape.num_elements() not in (None, 1): raise ValueError('Loss must be scalar: %s.' % loss) - if not loss_shape.is_compatible_with(tensor_shape.scalar()): + if not loss_shape.is_compatible_with(tensor_shape.TensorShape([])): loss = array_ops.reshape(loss, []) # Validate predictions. diff --git a/tensorflow/contrib/libsvm/kernels/decode_libsvm_op.cc b/tensorflow/contrib/libsvm/kernels/decode_libsvm_op.cc index 720c74e3de5..f35453f267e 100644 --- a/tensorflow/contrib/libsvm/kernels/decode_libsvm_op.cc +++ b/tensorflow/contrib/libsvm/kernels/decode_libsvm_op.cc @@ -36,7 +36,7 @@ class DecodeLibsvmOp : public OpKernel { void Compute(OpKernelContext* ctx) override { const Tensor* input_tensor; OP_REQUIRES_OK(ctx, ctx->input("input", &input_tensor)); - const auto& input_flat = input_tensor->flat(); + const auto& input_flat = input_tensor->flat(); Tensor* label_tensor; OP_REQUIRES_OK( diff --git a/tensorflow/contrib/makefile/Makefile b/tensorflow/contrib/makefile/Makefile index fa8dad938d7..8e75fcb666a 100644 --- a/tensorflow/contrib/makefile/Makefile +++ b/tensorflow/contrib/makefile/Makefile @@ -74,7 +74,7 @@ HOST_GENDIR := $(MAKEFILE_DIR)/gen/host_obj/ # Settings for the host compiler. 
HOST_CXX := $(CC_PREFIX) gcc -HOST_CXXFLAGS := --std=c++11 +HOST_CXXFLAGS := --std=c++14 HOST_LDOPTS := ifeq ($(HAS_GEN_HOST_PROTOC),true) HOST_LDOPTS += -L$(MAKEFILE_DIR)/gen/protobuf-host/lib @@ -185,7 +185,7 @@ ifneq ($(TARGET),ANDROID) OPTFLAGS += -march=native endif -CXXFLAGS := --std=c++11 -DIS_SLIM_BUILD -fno-exceptions -DNDEBUG $(OPTFLAGS) +CXXFLAGS := --std=c++14 -DIS_SLIM_BUILD -fno-exceptions -DNDEBUG $(OPTFLAGS) LDFLAGS := \ -L/usr/local/lib DEPFLAGS = -MT $@ -MMD -MP -MF $(DEPDIR)/$*.Td @@ -416,7 +416,7 @@ $(MARCH_OPTION) \ ifeq ($(BUILD_FOR_TEGRA),1) NVCC := $(JETPACK)/cuda/bin/nvcc - NVCCFLAGS := -x=cu -D__CUDACC__ -DNVCC -DANDROID_TEGRA -ccbin $(NDK_ROOT)/toolchains/$(TOOLCHAIN)/prebuilt/$(ANDROID_HOST_OS_ARCH)/bin/$(BIN_PREFIX)-g++ --std c++11 --expt-relaxed-constexpr -m64 -gencode arch=compute_53,\"code=sm_53\" -gencode arch=compute_62,\"code=sm_62\" -DEIGEN_AVOID_STL_ARRAY -DTENSORFLOW_USE_EIGEN_THREADPOOL -DLANG_CXX11 -DEIGEN_HAS_C99_MATH -DGOOGLE_CUDA=1 -DTF_EXTRA_CUDA_CAPABILITIES=5.3 + NVCCFLAGS := -x=cu -D__CUDACC__ -DNVCC -DANDROID_TEGRA -ccbin $(NDK_ROOT)/toolchains/$(TOOLCHAIN)/prebuilt/$(ANDROID_HOST_OS_ARCH)/bin/$(BIN_PREFIX)-g++ --std c++14 --expt-relaxed-constexpr -m64 -gencode arch=compute_53,\"code=sm_53\" -gencode arch=compute_62,\"code=sm_62\" -DEIGEN_AVOID_STL_ARRAY -DTENSORFLOW_USE_EIGEN_THREADPOOL -DLANG_CXX14 -DEIGEN_HAS_C99_MATH -DGOOGLE_CUDA=1 -DTF_EXTRA_CUDA_CAPABILITIES=5.3 CXXFLAGS4NVCC =\ -DIS_SLIM_BUILD \ -DANDROID_TEGRA \ @@ -433,7 +433,7 @@ $(MARCH_OPTION) \ -DANDROID_TEGRA \ -DEIGEN_AVOID_STL_ARRAY \ -DEIGEN_HAS_C99_MATH \ --DLANG_CXX11 -DTENSORFLOW_USE_EIGEN_THREADPOOL -DTF_EXTRA_CUDA_CAPABILITIES=5.3 +-DLANG_CXX14 -DTENSORFLOW_USE_EIGEN_THREADPOOL -DTF_EXTRA_CUDA_CAPABILITIES=5.3 INCLUDES += \ -Itensorflow/core/kernels \ @@ -655,8 +655,7 @@ $(wildcard tensorflow/core/util/*/*.cc) \ $(wildcard tensorflow/contrib/makefile/downloads/double_conversion/double-conversion/*.cc) \ tensorflow/core/profiler/internal/profiler_interface.cc \ tensorflow/core/profiler/internal/traceme_recorder.cc \ -tensorflow/core/profiler/lib/profiler_session.cc \ -tensorflow/core/profiler/lib/traceme.cc \ +$(wildcard tensorflow/core/profiler/lib/*.cc) \ tensorflow/core/util/version_info.cc # Remove duplicates (for version_info.cc) CORE_CC_ALL_SRCS := $(sort $(CORE_CC_ALL_SRCS)) @@ -860,7 +859,7 @@ $(OBJDIR)%.o: %.cc | $(PBT_GEN_FILES) $(OBJDIR)%.o: %.c @mkdir -p $(dir $@) @mkdir -p $(dir $(DEPDIR)$*) - $(CXX) $(patsubst --std=c++11,--std=c99, $(CXXFLAGS)) -x c $(DEPFLAGS) \ + $(CXX) $(patsubst --std=c++14,--std=c99, $(CXXFLAGS)) -x c $(DEPFLAGS) \ $(INCLUDES) -c $< -o $@ @mv -f $(DEPDIR)/$*.Td $(DEPDIR)/$*.d diff --git a/tensorflow/contrib/makefile/README.md b/tensorflow/contrib/makefile/README.md index 1293e59cbcb..7ace5d970ac 100644 --- a/tensorflow/contrib/makefile/README.md +++ b/tensorflow/contrib/makefile/README.md @@ -87,9 +87,11 @@ need to install the standalone toolchain, however. Assign your NDK location to $NDK_ROOT: ```bash -export NDK_ROOT=/absolute/path/to/NDK/android-ndk-rxxx/ +export NDK_ROOT=/absolute/path/to/NDK/android-ndk-r14b ``` +Note : libtensorflow-core.a cannot be compiled with any ndk version above r14b. 
+ Download the graph if you haven't already: ```bash diff --git a/tensorflow/contrib/makefile/download_dependencies.sh b/tensorflow/contrib/makefile/download_dependencies.sh index 1feca44f6e5..6cf1145021c 100755 --- a/tensorflow/contrib/makefile/download_dependencies.sh +++ b/tensorflow/contrib/makefile/download_dependencies.sh @@ -27,9 +27,9 @@ if [ ! -f $BZL_FILE_PATH ]; then fi EIGEN_URL="$(grep -o 'https://bitbucket.org/eigen/eigen/get/.*tar\.gz' "${BZL_FILE_PATH}" | grep -v mirror.bazel | head -n1)" -GEMMLOWP_URL="$(grep -o 'http://mirror.tensorflow.org/github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)" +GEMMLOWP_URL="$(grep -o 'https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)" GOOGLETEST_URL="https://github.com/google/googletest/archive/release-1.8.0.tar.gz" -NSYNC_URL="$(grep -o 'http://mirror.tensorflow.org/github.com/google/nsync/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)" +NSYNC_URL="$(grep -o 'https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/nsync/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)" # Note: The protobuf repo needs to be cloned due to its submodules. # These variables contain the GitHub repo and the sha, from `tensorflow/workspace.bzl`, @@ -37,7 +37,7 @@ NSYNC_URL="$(grep -o 'http://mirror.tensorflow.org/github.com/google/nsync/.*tar readonly PROTOBUF_REPO="https://github.com/protocolbuffers/protobuf.git" readonly PROTOBUF_TAG="$(grep -o 'https://github.com/protocolbuffers/protobuf/archive/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1 | awk '{print substr($0, index($0, "archive") + 8, index($0, "tar") - index($0, "archive") - 9) }')" -# TODO (yongtang): Replace the following with 'http://mirror.tensorflow.org/github.com/google/re2/.*tar\.gz' once +# TODO (yongtang): Replace the following with 'https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/re2/.*tar\.gz' once # the archive has been propagated in mirror.tensorflow.org. RE2_URL="$(grep -o 'https://github.com/google/re2/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)" FFT2D_URL="$(grep -o 'http.*fft2d\.tgz' "${BZL_FILE_PATH}" | grep -v bazel-mirror | head -n1)" @@ -46,8 +46,8 @@ ABSL_URL="$(grep -o 'https://github.com/abseil/abseil-cpp/.*tar.gz' "${BZL_FILE_ CUB_URL="$(grep -o 'https.*cub/archive.*zip' "${BZL_FILE_PATH}" | grep -v mirror.bazel | head -n1)" # Required for TensorFlow Lite Flex runtime. -FARMHASH_URL="http://mirror.tensorflow.org/github.com/google/farmhash/archive/816a4ae622e964763ca0862d9dbd19324a1eaf45.tar.gz" -FLATBUFFERS_URL="http://mirror.tensorflow.org/github.com/google/flatbuffers/archive/v1.11.0.tar.gz" +FARMHASH_URL="https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/farmhash/archive/816a4ae622e964763ca0862d9dbd19324a1eaf45.tar.gz" +FLATBUFFERS_URL="https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/flatbuffers/archive/v1.11.0.tar.gz" # TODO(petewarden): Some new code in Eigen triggers a clang bug with iOS arm64, # so work around it by patching the source. @@ -140,7 +140,7 @@ replace_by_sed 's#static uint32x2_t p2ui_CONJ_XOR = vld1_u32( conj_XOR_DATA );#s replace_by_sed 's#static uint64x2_t p2ul_CONJ_XOR = vld1q_u64( p2ul_conj_XOR_DATA );#static uint64x2_t p2ul_CONJ_XOR;// = vld1q_u64( p2ul_conj_XOR_DATA ); - Removed by script#' \ "${DOWNLOADS_DIR}/eigen/Eigen/src/Core/arch/NEON/Complex.h" # TODO(satok): Remove this once protobuf/autogen.sh is fixed. 
-replace_by_sed 's#https://googlemock.googlecode.com/files/gmock-1.7.0.zip#http://download.tensorflow.org/deps/gmock-1.7.0.zip#' \ +replace_by_sed 's#https://googlemock.googlecode.com/files/gmock-1.7.0.zip#https://storage.googleapis.com/download.tensorflow.org/deps/gmock-1.7.0.zip#' \ "${DOWNLOADS_DIR}/protobuf/autogen.sh" cat "third_party/eigen3/gebp_neon.patch" | patch "${DOWNLOADS_DIR}/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h" diff --git a/tensorflow/contrib/makefile/proto_text_cc_files.txt b/tensorflow/contrib/makefile/proto_text_cc_files.txt index d7ad266f678..95f2d186dc5 100644 --- a/tensorflow/contrib/makefile/proto_text_cc_files.txt +++ b/tensorflow/contrib/makefile/proto_text_cc_files.txt @@ -30,13 +30,9 @@ tensorflow/core/lib/random/distribution_sampler.cc tensorflow/core/lib/random/random.cc tensorflow/core/lib/random/simple_philox.cc tensorflow/core/lib/random/weighted_picker.cc -tensorflow/core/lib/strings/numbers.cc tensorflow/core/lib/strings/ordered_code.cc tensorflow/core/lib/strings/proto_text_util.cc -tensorflow/core/lib/strings/scanner.cc -tensorflow/core/lib/strings/str_util.cc tensorflow/core/lib/strings/strcat.cc -tensorflow/core/lib/strings/stringprintf.cc tensorflow/core/lib/wav/wav_io.cc tensorflow/core/platform/cpu_info.cc tensorflow/core/platform/default/logging.cc @@ -44,9 +40,9 @@ tensorflow/core/platform/default/mutex.cc tensorflow/core/platform/default/tracing.cc tensorflow/core/platform/denormal.cc tensorflow/core/platform/env.cc -tensorflow/core/platform/env_time.cc tensorflow/core/platform/file_system.cc tensorflow/core/platform/file_system_helper.cc +tensorflow/core/platform/numbers.cc tensorflow/core/platform/posix/env.cc tensorflow/core/platform/posix/env_time.cc tensorflow/core/platform/posix/error.cc @@ -55,7 +51,10 @@ tensorflow/core/platform/posix/port.cc tensorflow/core/platform/posix/posix_file_system.cc tensorflow/core/platform/protobuf.cc tensorflow/core/platform/protobuf_util.cc +tensorflow/core/platform/scanner.cc tensorflow/core/platform/setround.cc +tensorflow/core/platform/stringprintf.cc +tensorflow/core/platform/str_util.cc tensorflow/core/platform/tensor_coding.cc tensorflow/core/platform/tracing.cc tensorflow/tools/proto_text/gen_proto_text_functions.cc diff --git a/tensorflow/contrib/makefile/tf_op_files.txt b/tensorflow/contrib/makefile/tf_op_files.txt index e284353f2b0..73e19c0814a 100644 --- a/tensorflow/contrib/makefile/tf_op_files.txt +++ b/tensorflow/contrib/makefile/tf_op_files.txt @@ -129,6 +129,7 @@ tensorflow/core/kernels/function_ops.cc tensorflow/core/kernels/fused_batch_norm_op.cc tensorflow/core/kernels/fused_eigen_output_kernels.cc tensorflow/core/kernels/gather_functor.cc +tensorflow/core/kernels/gather_functor_batched.cc tensorflow/core/kernels/gather_nd_op.cc tensorflow/core/kernels/gather_nd_op_cpu_impl_0.cc tensorflow/core/kernels/gather_nd_op_cpu_impl_1.cc @@ -245,6 +246,7 @@ tensorflow/core/kernels/slice_op_cpu_impl_4.cc tensorflow/core/kernels/slice_op_cpu_impl_5.cc tensorflow/core/kernels/slice_op_cpu_impl_6.cc tensorflow/core/kernels/slice_op_cpu_impl_7.cc +tensorflow/core/kernels/slice_op_cpu_impl_8.cc tensorflow/core/kernels/softmax_op.cc tensorflow/core/kernels/softplus_op.cc tensorflow/core/kernels/softsign_op.cc @@ -273,6 +275,7 @@ tensorflow/core/kernels/strided_slice_op_inst_4.cc tensorflow/core/kernels/strided_slice_op_inst_5.cc tensorflow/core/kernels/strided_slice_op_inst_6.cc tensorflow/core/kernels/strided_slice_op_inst_7.cc +tensorflow/core/kernels/strided_slice_op_inst_8.cc 
tensorflow/core/kernels/string_join_op.cc tensorflow/core/kernels/string_util.cc tensorflow/core/kernels/tensor_array.cc @@ -297,6 +300,46 @@ tensorflow/core/kernels/variable_ops.cc tensorflow/core/kernels/where_op.cc tensorflow/core/kernels/xent_op.cc tensorflow/core/kernels/xsmm_conv2d.cc +tensorflow/core/kernels/data/batch_dataset_op.cc +tensorflow/core/kernels/data/cache_dataset_ops.cc +tensorflow/core/kernels/data/cache_ops.cc +tensorflow/core/kernels/data/captured_function.cc +tensorflow/core/kernels/data/concatenate_dataset_op.cc +tensorflow/core/kernels/data/dataset_utils.cc +tensorflow/core/kernels/data/filter_dataset_op.cc +tensorflow/core/kernels/data/flat_map_dataset_op.cc +tensorflow/core/kernels/data/generator_dataset_op.cc +tensorflow/core/kernels/data/interleave_dataset_op.cc +tensorflow/core/kernels/data/iterator_ops.cc +tensorflow/core/kernels/data/map_dataset_op.cc +tensorflow/core/kernels/data/map_defun_op.cc +tensorflow/core/kernels/data/model_dataset_op.cc +tensorflow/core/kernels/data/multi_device_iterator_ops.cc +tensorflow/core/kernels/data/name_utils.cc +tensorflow/core/kernels/data/optional_ops.cc +tensorflow/core/kernels/data/optional_ops.cu.cc +tensorflow/core/kernels/data/padded_batch_dataset_op.cc +tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc +tensorflow/core/kernels/data/parallel_map_dataset_op.cc +tensorflow/core/kernels/data/parallel_map_iterator.cc +tensorflow/core/kernels/data/prefetch_autotuner.cc +tensorflow/core/kernels/data/prefetch_dataset_op.cc +tensorflow/core/kernels/data/random_seed_ops.cc +tensorflow/core/kernels/data/range_dataset_op.cc +tensorflow/core/kernels/data/repeat_dataset_op.cc +tensorflow/core/kernels/data/shard_dataset_op.cc +tensorflow/core/kernels/data/shuffle_dataset_op.cc +tensorflow/core/kernels/data/single_threaded_executor.cc +tensorflow/core/kernels/data/skip_dataset_op.cc +tensorflow/core/kernels/data/sparse_tensor_slice_dataset_op.cc +tensorflow/core/kernels/data/stats_utils.cc +tensorflow/core/kernels/data/take_dataset_op.cc +tensorflow/core/kernels/data/tensor_dataset_op.cc +tensorflow/core/kernels/data/tensor_slice_dataset_op.cc +tensorflow/core/kernels/data/unbounded_thread_pool.cc +tensorflow/core/kernels/data/window_dataset.cc +tensorflow/core/kernels/data/window_dataset_op.cc +tensorflow/core/kernels/data/zip_dataset_op.cc tensorflow/core/ops/array_grad.cc tensorflow/core/ops/array_ops.cc tensorflow/core/ops/audio_ops.cc diff --git a/tensorflow/contrib/memory_stats/BUILD b/tensorflow/contrib/memory_stats/BUILD index 352b2d61084..765c93b06e5 100644 --- a/tensorflow/contrib/memory_stats/BUILD +++ b/tensorflow/contrib/memory_stats/BUILD @@ -102,4 +102,5 @@ cuda_py_test( "//tensorflow/python:math_ops", "//tensorflow/python:random_ops", ], + xla_enable_strict_auto_jit = False, ) diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops.py b/tensorflow/contrib/metrics/python/ops/metric_ops.py index eae04c7ba3e..e46263b48a6 100644 --- a/tensorflow/contrib/metrics/python/ops/metric_ops.py +++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py @@ -1161,8 +1161,9 @@ def streaming_dynamic_auc(labels, and performing the final calculation using all of the concatenated values. Args: - labels: A `Tensor` of ground truth labels with the same shape as `labels` - and with values of 0 or 1 whose values are castable to `int64`. + labels: A `Tensor` of ground truth labels with the same shape as + `predictions` and with values of 0 or 1 whose values are castable to + `int64`. 
predictions: A `Tensor` of predictions whose values are castable to `float64`. Will be flattened into a 1-D `Tensor`. curve: The name of the curve for which to compute AUC, 'ROC' for the @@ -3640,7 +3641,8 @@ def streaming_concat(values, next_shape = array_ops.stack([next_size] + fixed_shape) new_value = array_ops.zeros(next_shape, dtype=values.dtype) old_value = array.value() - assign_op = state_ops.assign(array, new_value, validate_shape=False) + with ops.control_dependencies([old_value]): + assign_op = state_ops.assign(array, new_value, validate_shape=False) with ops.control_dependencies([assign_op]): copy_op = array[:size].assign(old_value[:size]) # return value needs to be the same dtype as no_op() for cond diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py index aec07241e7a..906bebe3b82 100644 --- a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py +++ b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py @@ -33,6 +33,7 @@ from tensorflow.python.ops import array_ops from tensorflow.python.ops import data_flow_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import random_ops +from tensorflow.python.ops import variable_scope from tensorflow.python.ops import variables from tensorflow.python.platform import test @@ -1734,9 +1735,10 @@ class StreamingAUCTest(test.TestCase): predictions = constant_op.constant( [1, -1, 1, -1], shape=(1, 4), dtype=dtypes_lib.float32) labels = constant_op.constant([0, 1, 1, 0], shape=(1, 4)) - _, update_op = metrics.streaming_auc(predictions, labels) - sess.run(variables.local_variables_initializer()) - self.assertRaises(errors_impl.InvalidArgumentError, update_op.eval) + with self.assertRaisesRegexp(errors_impl.InvalidArgumentError, + r'predictions must be in \[0, 1\]'): + _, _ = metrics.streaming_auc(predictions, labels) + # Error detected statically; no need to run the op. def testAllCorrect(self): self.allCorrectAsExpected('ROC') @@ -6718,6 +6720,7 @@ class StreamingConcatTest(test.TestCase): def setUp(self): ops.reset_default_graph() + variable_scope.enable_resource_variables() def testVars(self): metrics.streaming_concat(values=array_ops.ones((10,))) diff --git a/tensorflow/contrib/model_pruning/python/pruning.py b/tensorflow/contrib/model_pruning/python/pruning.py index 388384a492f..30375c7f56e 100644 --- a/tensorflow/contrib/model_pruning/python/pruning.py +++ b/tensorflow/contrib/model_pruning/python/pruning.py @@ -172,9 +172,11 @@ def get_pruning_hparams(): nbins: integer number of bins to use for histogram computation block_height: integer - number of rows in a block (defaults to 1) + number of rows in a block (defaults to 1), can be -1 in which + case it is set to the size of the corresponding weight tensor. block_width: integer - number of cols in a block (defaults to 1) + number of cols in a block (defaults to 1), can be -1 in which + case it is set to the size of the corresponding weight tensor. 
block_pooling_function: string Whether to perform average (AVG) or max (MAX) pooling in the block (default: AVG) @@ -489,6 +491,10 @@ class Pruning(object): if squeezed_weights.get_shape().ndims != 2 or block_dims == [1, 1]: return self._update_mask(weights, threshold) + for i in range(2): + if block_dims[i] == -1: + block_dims[i] = squeezed_weights.get_shape()[i] + if self._block_pooling_function not in ['AVG', 'MAX']: raise ValueError('Unknown pooling function for block sparsity: %s' % self._block_pooling_function) diff --git a/tensorflow/contrib/model_pruning/python/pruning_test.py b/tensorflow/contrib/model_pruning/python/pruning_test.py index 58080ad050d..1a925caab96 100644 --- a/tensorflow/contrib/model_pruning/python/pruning_test.py +++ b/tensorflow/contrib/model_pruning/python/pruning_test.py @@ -129,7 +129,7 @@ class PruningTest(test.TestCase): mask_val = new_mask.eval() self.assertAllEqual(mask_val, expected_mask) - def testBlockMasking(self): + def testBlockMaskingWithNonnegativeBlockDimensions(self): param_list = ["block_height=2", "block_width=2", "threshold_decay=0"] weights_avg = constant_op.constant( @@ -146,6 +146,25 @@ class PruningTest(test.TestCase): self._blockMasking(param_list + ["block_pooling_function=AVG"], weights_avg, expected_mask) + def testBlockMaskingWithNegativeBlockDimensions(self): + param_list = ["block_height=1", "block_width=-1", "threshold_decay=0"] + + weights_avg = constant_op.constant([[0.1, 0.1, 0.1, 0.1], + [0.2, 0.2, 0.2, 0.2], + [0.3, 0.3, 0.3, 0.3], + [0.3, 0.3, 0.4, 0.4]]) + weights_max = constant_op.constant([[0.1, 0.0, 0.1, 0.0], + [0.0, 0.1, 0.0, 0.2], + [0.3, 0.0, 0.3, 0.0], + [0.0, -0.3, 0.0, 0.4]]) + expected_mask = [[0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], + [1., 1., 1., 1.], [1., 1., 1., 1.]] + + self._blockMasking(param_list + ["block_pooling_function=MAX"], weights_max, + expected_mask) + self._blockMasking(param_list + ["block_pooling_function=AVG"], weights_avg, + expected_mask) + def testBlockMaskingWithHigherDimensions(self): param_list = ["block_height=2", "block_width=2", "threshold_decay=0"] diff --git a/tensorflow/contrib/mpi/BUILD b/tensorflow/contrib/mpi/BUILD deleted file mode 100644 index 23f90cf77ef..00000000000 --- a/tensorflow/contrib/mpi/BUILD +++ /dev/null @@ -1,93 +0,0 @@ -# Description: -# MPI based communication interfaces and implementations for TensorFlow. 
- -package(default_visibility = [ - "//tensorflow:__subpackages__", -]) - -licenses(["notice"]) # Apache 2.0 - -exports_files(["LICENSE"]) - -filegroup( - name = "all_files", - srcs = glob( - ["**/*"], - exclude = [ - "**/METADATA", - "**/OWNERS", - ], - ), - visibility = ["//tensorflow:__subpackages__"], -) - -filegroup( - name = "c_srcs", - data = glob([ - "**/*.cc", - "**/*.h", - ]), -) - -# For platform specific build config -load( - "//tensorflow/core:platform/default/build_config.bzl", - "tf_proto_library_cc", -) - -tf_proto_library_cc( - name = "mpi_msg_proto", - srcs = ["mpi_msg.proto"], - cc_api_version = 2, - protodeps = ["//tensorflow/core:worker_proto"], - visibility = [ - "//tensorflow:__subpackages__", - ], -) - -cc_library( - name = "mpi_utils", - srcs = ["mpi_utils.cc"], - hdrs = ["mpi_utils.h"], - deps = [ - "//tensorflow/core:core_cpu_internal", - "//tensorflow/core:framework", - "//tensorflow/core:lib", - "//third_party/mpi", - ], -) - -cc_library( - name = "mpi_rendezvous_mgr", - srcs = ["mpi_rendezvous_mgr.cc"], - hdrs = ["mpi_rendezvous_mgr.h"], - deps = [ - ":mpi_msg_proto_cc", - ":mpi_utils", - "//tensorflow/core:core_cpu_internal", - "//tensorflow/core:framework", - "//tensorflow/core:gpu_runtime", - "//tensorflow/core:lib", - "//tensorflow/core:protos_cc", - "//tensorflow/core:worker_proto_cc", - "//tensorflow/core/distributed_runtime:base_rendezvous_mgr", - "//tensorflow/core/distributed_runtime:recent_request_ids", - "//tensorflow/core/distributed_runtime:request_id", - "//tensorflow/core/distributed_runtime:session_mgr", - "//tensorflow/core/distributed_runtime:tensor_coding", - "//tensorflow/core/distributed_runtime:worker_env", - "//third_party/mpi", - ], -) - -cc_library( - name = "mpi_server_lib", - srcs = ["mpi_server_lib.cc"], - hdrs = ["mpi_server_lib.h"], - linkstatic = 1, # Seems to be needed since alwayslink is broken in bazel - deps = [ - ":mpi_rendezvous_mgr", - "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib", - ], - alwayslink = 1, -) diff --git a/tensorflow/contrib/mpi/README.md b/tensorflow/contrib/mpi/README.md deleted file mode 100644 index 75cb8230483..00000000000 --- a/tensorflow/contrib/mpi/README.md +++ /dev/null @@ -1,94 +0,0 @@ -## How to compile and use MPI-enabled TensorFlow - -1. Follow the regular TF compilation instructions. During configure step, if you want MPI support, answer yes to this question: - - ```Do you wish to build TensorFlow with MPI support [y/N]``` - -2. To turn on the MPI connection, add the protocol "grpc+mpi" in the server definition: - - ```server = tf.train.Server(cluster, job_name="local", task_index=0, protocol='grpc+mpi') # default protocol is 'grpc'``` - -## Overview - -By using this protocol TensorFlow can take advantage of the high performance networking primitives that are offered via the MPI API. This enables TensorFlow to take advantage of high performance low latency networks such as Infiniband. These changes are largely transparent to the user who only has to change the offered protocol and launch the script using the 'mpirun' launcher. For example: - ```mpirun -np 2 python my_neuralnet.py ``` - - - - - -## Runtime options - -The following environment variables can be set to modify the behavior at runtime: - -**MPI_DISABLED=[0,1]** - -This environment variable allows you to disable the MPI path before launch (e.g. for performance or correctness testing). 
- -**MPI_OPTIMAL_PATH=[0,1]** - -When set to 0 it will use the default path where tensors are encoded to ProtoText before being copied to a remote process. When set to 1 a more optimal path will be taken where only the tensor description is encoded while the actual tensor data is transferred directly from the source buffer to the destination buffer. -This path is disabled by default as it requires that the MPI library can directly access the pointer to the data. For CPU backed buffers this is no problem, however for GPU backed buffers this requires MPI libraries that are built with CUDA support (CUDA Aware). When using non-CUDA aware MPI libraries and GPU buffers you will get segmentation faults. - - - -## Known problems - -For certain complex neural nets the implementation sometimes crashes inside the MPI libraries. This seems to be related to memory allocations/routines that register the memory for the Infiniband transfers. (The crashes do not happen when all MPI processes are within the same physical machine). - -**MVAPICH** -- The problem manifests itself with a segmentation fault inside a memory copy routine and during startup you will get the following warning: "WARNING: Error in initializing MVAPICH2 ptmalloc library. Continuing without InfiniBand registration cache support." - -**OpenMPI** -- With OpenMPI corrupt data will be received resulting in an assertion or the MPI library will print an error and exit. The error is "Attempt to free memory that is still in use by an ongoing MPI communication. MPI job will now abort." - -## Implementation details - - -The implementation takes over the responsibility for sending and receiving tensors between separate processes. This is facilitated by TensorFlow's ability to support different protocols. In this particular implementation, the standard gRPC library is used for all administrative operations while the MPI functions take over the tensor exchanges. On the sending side the tensors are placed in the standard waiting tables and nothing is changed there. On the receiving side the RecvFromRemoteAsync function is newly implemented and instead of requesting the data via gRPC the data is now requested via MPI calls. - -To this end once the code is loaded a dedicated thread will be launched that handles all MPI operations. This thread will loop through a set of operations: - -* Send requests placed on the request queue to the sending process -Once a request for a tensor is received two callbacks are created. The first one is to request the tensor and the second one is executed once the requested data has arrived. To this end the request is placed in a queue and will be sent once the MPI thread services the queue. This sending is done using non-blocking MPI_Isend operations. - -* Send tensor data in response to a request call -Once a request has arrived from a remote process the request is forwarded to the original TensorFlow code which looks up the tensor in the waiting table. Once the tensor has been found a callback is executed which places the found tensor on the sendQueue for the MPI thread. Once the sendQueue is served the tensor data will be send using non-blocking send operations (MP_Isend) to the remote process. - -* Receive tensor request -The MPI thread will check if there are any incoming tensor request messages on the communication lines using MPI_Iprobe. Once a request has been received it will be passed on to the standard TensorFlow code and eventually will be placed on the sendQueue. 
- -* Receive tensor -At some point after a request has been sent the remote process will transmit the tensor. This tensor will be received and we look-up the callback that is associated with this tensor in our request table and execute the callback on the received data. - - -In the implementation all send operations are non-blocking, all probe operations are non-blocking and all receive-operations are blocking. The receive-operations are only executed after the probe has determined that there is something to receive. -The MPI processes identify each other using an MPI process ID. The TensorFlow gRPC processes identify each other using a name. During launch we create a mapping between the TensorFlow process name and the MPI process ID to allow the processes to communicate with the correct destinations when using MPI operations. - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/tensorflow/contrib/mpi/mpi_msg.proto b/tensorflow/contrib/mpi/mpi_msg.proto deleted file mode 100644 index 36f1504901c..00000000000 --- a/tensorflow/contrib/mpi/mpi_msg.proto +++ /dev/null @@ -1,19 +0,0 @@ - -syntax = "proto3"; - -package tensorflow; -option cc_enable_arenas = true; - -import "tensorflow/core/protobuf/worker.proto"; - - -message MPIRecvTensorResponse { - RecvTensorResponse response = 1; - bool singleSend = 2; - string key = 3; - int64 step_id = 4; - uint64 checksum = 5; -} - - - diff --git a/tensorflow/contrib/mpi/mpi_rendezvous_mgr.cc b/tensorflow/contrib/mpi/mpi_rendezvous_mgr.cc deleted file mode 100644 index c2e1edb1366..00000000000 --- a/tensorflow/contrib/mpi/mpi_rendezvous_mgr.cc +++ /dev/null @@ -1,321 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#ifdef TENSORFLOW_USE_MPI - -#include "tensorflow/contrib/mpi/mpi_rendezvous_mgr.h" - -#include -#include -#include -#include -#include -#include - -#include "tensorflow/core/common_runtime/device.h" -#include "tensorflow/core/common_runtime/device_mgr.h" -#include "tensorflow/core/common_runtime/gpu/gpu_util.h" -#include "tensorflow/core/distributed_runtime/session_mgr.h" -#include "tensorflow/core/distributed_runtime/tensor_coding.h" -#include "tensorflow/core/framework/allocator.h" - -namespace tensorflow { - -MPIRendezvousMgr::MPIRendezvousMgr(const WorkerEnv* env) - : BaseRendezvousMgr(env), - worker_env_2(env), - use_optimal_transfer_(false), - recv_tensor_recent_request_ids_(100000) { - const char* mpienv = getenv("MPI_OPTIMAL_PATH"); - if (mpienv && mpienv[0] == '1') { - LOG(INFO) << "MPI Optimal copy path enabled (Requires CUDA-Aware MPI when " - "using GPUs)\n"; - use_optimal_transfer_ = true; - } - - // extract worker-name - auto parsed = env->local_devices[0]->parsed_name(); - const std::string task_id = - strings::StrCat(parsed.job, ":", parsed.replica, ":", parsed.task); - - mpiutils_ = new MPIUtils(task_id); - background_thread_ = - std::thread(&MPIRendezvousMgr::MPIBackgroundThread, this); -} - -BaseRemoteRendezvous* MPIRendezvousMgr::Create(int64 step_id, - const WorkerEnv* worker_env) { - return new MPIRemoteRendezvous(worker_env, step_id, mpiutils_, this); -} - -void MPIRemoteRendezvous::RecvFromRemoteAsync( - const Rendezvous::ParsedKey& parsed, const Rendezvous::Args& recv_args, - DoneCallback done) { - Status s = Status::OK(); - MPIRequestTensorCall* rendezvous_call = new MPIRequestTensorCall(); - - VLOG(2) << "MPI User requested " << parsed.FullKey() - << " @ step: " << step_id_; - - std::string src_task = strings::StrCat( - parsed.src.job, ":", parsed.src.replica, ":", parsed.src.task); - const int dst = mpiutils_->GetSourceID(src_task); - - Device* dst_device; - if (s.ok()) { - s = env_->device_mgr->LookupDevice(parsed.dst_device, &dst_device); - CHECK(s.ok()) << "Device lookup failed"; - } else { - done(s, Args(), recv_args, Tensor{}, false); - return; - } - - // Set properties of the request object and create the request function - rendezvous_call->Init(parsed, step_id_); - - std::function request_call = [parsed, dst, rendezvous_call]() { - // Use MPI_Alloc_mem here to force allocation inside MPI thread - // this is not optimal, but prevents memory corruption and segmentation - // faults during inter-server transfers... 
- MPI_CHECK(MPI_Alloc_mem(rendezvous_call->request_buffer_size_, - MPI_INFO_NULL, &rendezvous_call->request_buffer_)); - rendezvous_call->req_.SerializeToArray( - rendezvous_call->request_buffer_, - rendezvous_call->request_buffer_size_); - MPI_CHECK(MPI_Isend(rendezvous_call->request_buffer_, - rendezvous_call->request_buffer_size_, MPI_CHAR, dst, - TAG_REQTENSOR, MPI_COMM_WORLD, - &rendezvous_call->mpi_request_)); - }; - - // Create the function which is called when the Tensor is send by remote - const int64 temp1 = step_id_; - rendezvous_call->recv_call_ = - [this, parsed, recv_args, done, dst, temp1, - rendezvous_call](MPIRecvTensorResponse mpi_response) { - Status s; - Device* dst_device; - if (s.ok()) { - s = env_->device_mgr->LookupDevice(parsed.dst_device, &dst_device); - CHECK(s.ok()) << "Device lookup failed"; - } - - VLOG(3) << "MPI Received tensor " << parsed.FullKey() - << " @ step: " << temp1 - << " single-send: " << mpi_response.singlesend(); - - Tensor val; - if (mpi_response.singlesend()) { - dst_device->MakeTensorFromProto(mpi_response.response().tensor(), - recv_args.alloc_attrs, &val); - } else { - TensorResponse tr; - tr.InitAlloc(dst_device, recv_args.alloc_attrs); - tr.InitPartial(mpi_response.response(), AllocationAttributes()); - const size_t nBytes = tr.tensor().TotalBytes(); - void* data = const_cast(DMAHelper::base(&tr.tensor())); - MPI_Status status; - MPI_CHECK(MPI_Recv(data, static_cast(nBytes), MPI_BYTE, dst, - TAG_SENDTENSOR2, MPI_COMM_WORLD, &status)); - val = std::move(tr.tensor()); - } - - done(s, Args(), recv_args, val, mpi_response.response().is_dead()); - }; - - MPIRendezvousMgr* mgr = - reinterpret_cast(this->rendezvous_mgr_); - mgr->QueueRequest(string(parsed.FullKey()), step_id_, std::move(request_call), - rendezvous_call); -} - -MPIRemoteRendezvous::~MPIRemoteRendezvous() {} - -/* - * Add the request for one of our Tensors by a remote process - * to the local send/table. The here created callback will - * be called once the Tensor data has arrived and is - * ready to be send to the remote requester. 
- */ -void MPIRendezvousMgr::AddRequest(RecvTensorRequest request, - const int mpi_dst) { - TF_CHECK_OK(recv_tensor_recent_request_ids_.TrackUnique( - request.request_id(), "RecvTensor (MPIRendezvousMgr)", request)); - const int64 step_id = request.step_id(); - const std::string& key = request.rendezvous_key(); - Rendezvous::ParsedKey parsed; - TF_CHECK_OK(Rendezvous::ParseKey(key, &parsed)); - - MPIRecvTensorCallBack send_cb = [this, mpi_dst, parsed]( - const Status& status, - const Rendezvous::Args& send_args, - const Rendezvous::Args& recv_args, - const Tensor& val, bool is_dead, - MPISendTensorCall* mpi_send_call) { - // TODO(jbedorf) this should be a loop over max size - CHECK(mpi_send_call->mRes_.ByteSize() < INT_MAX) - << "Buffer too large for single transfer"; - MPI_CHECK(MPI_Alloc_mem(mpi_send_call->mRes_.ByteSize(), MPI_INFO_NULL, - &mpi_send_call->send_buffer_)); - mpi_send_call->mRes_.SerializeToArray(mpi_send_call->send_buffer_, - mpi_send_call->mRes_.ByteSize()); - - MPI_CHECK(MPI_Isend(mpi_send_call->send_buffer_, - static_cast(mpi_send_call->mRes_.ByteSize()), - MPI_CHAR, mpi_dst, TAG_SENDTENSOR, MPI_COMM_WORLD, - &(mpi_send_call->msg1_))); - MPI_CHECK(MPI_Test(&mpi_send_call->msg1_, &mpi_send_call->done1_, - MPI_STATUS_IGNORE)); - - if (!mpi_send_call->mRes_.singlesend()) { - const int tensor_size = static_cast(val.TotalBytes()); - void* temp = const_cast(DMAHelper::base(&val)); - - // If the MPI library is not GPU aware there should be a data transfer - // here to get the data on the host. - // if(src_dev->tensorflow_gpu_device_info()) //memcpy to send_buffer2_ - - // TODO(jbedorf) this should be a loop over max size - MPI_CHECK(MPI_Isend(temp, tensor_size, MPI_CHAR, mpi_dst, TAG_SENDTENSOR2, - MPI_COMM_WORLD, &mpi_send_call->msg2_)); - mpi_send_call->done2_ = 0; - } - return mpi_send_call; - }; - - // Wrapper around the read callback to place the callback on our queue - Rendezvous::DoneCallback done_cb = - [this, parsed, step_id, send_cb]( - const Status& status, const Rendezvous::Args& send_args, - const Rendezvous::Args& recv_args, const Tensor& val, bool is_dead) { - if (!status.ok()) { - CHECK(status.ok()) - << "RecvLocalAsync was not ok, key: " << parsed.FullKey() - << " step: " << step_id - << " error message: " << status.error_message(); - return; - } - - VLOG(3) << "MPI Sending tensor " << parsed.FullKey() - << " @ step: " << step_id << std::endl; - - auto mpi_send_call = new MPISendTensorCall(); - mpi_send_call->Init(parsed, step_id, is_dead); - - Device* src_dev = nullptr; - Status s = this->worker_env_2->device_mgr->LookupDevice( - parsed.src_device, &src_dev); - CHECK(s.ok()) << "src device not found"; - - // Control if shape and data should be send together or if we can - // optimize it in two different transfers, thereby reducing memory - // copies - bool doOptimalTransfer = true; - if (!DataTypeCanUseMemcpy(val.dtype())) doOptimalTransfer = false; - if (val.TotalBytes() < 1024) doOptimalTransfer = false; - - doOptimalTransfer = doOptimalTransfer && use_optimal_transfer_; - - if (doOptimalTransfer) { - // First send the Tensor description and in a follow up transfer the - // data - mpi_send_call->mRes_.mutable_response()->mutable_tensor()->set_dtype( - val.dtype()); - val.shape().AsProto(mpi_send_call->mRes_.mutable_response() - ->mutable_tensor() - ->mutable_tensor_shape()); - mpi_send_call->mRes_.set_singlesend(false); - } else { - // Send the Tensor description and data in a single transfer - if (src_dev->tensorflow_gpu_device_info() && - 
(!send_args.alloc_attrs.on_host())) { - Notification n; - GPUUtil::SetProtoFromGPU( - val, src_dev, send_args.device_context, - mpi_send_call->mRes_.mutable_response()->mutable_tensor(), - is_dead, [&n, &s](const Status& s_) { - s = s_; - n.Notify(); - }); - n.WaitForNotification(); - } else { - val.AsProtoTensorContent( - mpi_send_call->mRes_.mutable_response()->mutable_tensor()); - } - } - - std::function res = std::bind( - send_cb, status, send_args, recv_args, val, is_dead, mpi_send_call); - - SendQueueEntry req(string(parsed.FullKey()), std::move(res)); - - this->QueueSendRequest(req); - - // Wait for the notification that indicates the tensor has been - // successfully transmitted to the remote process. Only needed if we - // have not parsed the tensor to proto - if (doOptimalTransfer) mpi_send_call->n_.WaitForNotification(); - }; // done_cb - - worker_env_2->compute_pool->Schedule([this, step_id, parsed, done_cb]() { - this->RecvLocalAsync(step_id, parsed, done_cb); - }); -} - -void MPIRendezvousMgr::MPIBackgroundThread() { - std::list> active_sends; - - while (1) { - MPI_Status status; - - // Check for incoming Tensor requests - RecvTensorRequest request; - if (ProbeForData(TAG_REQTENSOR, &status, &request)) { - this->AddRequest(request, status.MPI_SOURCE); - } - - // Check for incoming Tensor reply - MPIRecvTensorResponse mRes; - if (ProbeForData(TAG_SENDTENSOR, &status, &mRes)) { - const int64 step_id = mRes.step_id(); - std::string key = mRes.key(); - - std::shared_ptr call; - GetRecvCall(step_id, key, &call); - call->recv_call_(mRes); - RemoveRecvCall(step_id, key); - } - - // Remove sends that have been completed - active_sends.remove_if( - [](std::unique_ptr& i) { return i->IsFinished(); }); - - // send a Tensor request - RequestQueueEntry req; - if (GetRequest(&req)) req.second(); - - // Send a Tensor response - SendQueueEntry send; - if (GetResponse(&send)) { - std::unique_ptr p(send.second()); - active_sends.push_back(std::move(p)); - } - - // std::this_thread::sleep_for(std::chrono::microseconds(1)); - } -} - -} // namespace tensorflow -#endif // TENSORFLOW_USE_MPI diff --git a/tensorflow/contrib/mpi/mpi_rendezvous_mgr.h b/tensorflow/contrib/mpi/mpi_rendezvous_mgr.h deleted file mode 100644 index 90140fcab31..00000000000 --- a/tensorflow/contrib/mpi/mpi_rendezvous_mgr.h +++ /dev/null @@ -1,255 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#ifndef TENSORFLOW_CONTRIB_MPI_MPI_RENDEZVOUS_MGR_H_ -#define TENSORFLOW_CONTRIB_MPI_MPI_RENDEZVOUS_MGR_H_ - -#ifdef TENSORFLOW_USE_MPI - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "tensorflow/contrib/mpi/mpi_msg.pb.h" -#include "tensorflow/contrib/mpi/mpi_utils.h" -#include "tensorflow/core/distributed_runtime/base_rendezvous_mgr.h" -#include "tensorflow/core/distributed_runtime/recent_request_ids.h" -#include "tensorflow/core/distributed_runtime/request_id.h" -#include "tensorflow/core/distributed_runtime/worker_env.h" -#include "tensorflow/core/protobuf/worker.pb.h" - -#define TAG_REQTENSOR 1010 -#define TAG_SENDTENSOR 2020 -#define TAG_SENDTENSOR2 3030 - -namespace tensorflow { - -class MPISendTensorCall { - public: - char* send_buffer_; - char* send_buffer2_; - - MPI_Request msg1_; - MPI_Request msg2_; - int done1_; // Int instead of bool for simpler IsFinished logic - int done2_; - MPIRecvTensorResponse mRes_; - Notification n_; - - MPISendTensorCall() - : send_buffer_(nullptr), send_buffer2_(nullptr), done1_(1), done2_(1) {} - - ~MPISendTensorCall() { - MPI_CHECK(MPI_Wait(&msg1_, MPI_STATUS_IGNORE)); - n_.Notify(); - MPI_CHECK(MPI_Free_mem(send_buffer_)); - // delete[] send_buffer_; - delete[] send_buffer2_; - } - - MPISendTensorCall(MPISendTensorCall&&) = delete; - - void Init(const Rendezvous::ParsedKey& parsed, const int64 step_id, - const bool is_dead) { - mRes_.set_key(string(parsed.FullKey())); - mRes_.set_step_id(step_id); - mRes_.mutable_response()->set_is_dead(is_dead); - mRes_.mutable_response()->set_send_start_micros( - Env::Default()->NowMicros()); - mRes_.set_singlesend(true); - } - - bool IsFinished() { - MPI_Status status; - if (!done1_) MPI_CHECK(MPI_Test(&msg1_, &done1_, &status)); - if (!done2_) MPI_CHECK(MPI_Test(&msg2_, &done2_, &status)); - return done1_ && done2_; - } -}; - -class MPIRequestTensorCall { - public: - Rendezvous::DoneCallback done_; - RecvTensorRequest req_; - MPI_Request mpi_request_; - char* request_buffer_; - size_t request_buffer_size_; - std::function recv_call_; - - MPIRequestTensorCall() : request_buffer_(nullptr) {} - ~MPIRequestTensorCall() { - MPI_CHECK(MPI_Wait(&mpi_request_, MPI_STATUS_IGNORE)); - // delete[] request_buffer_; - MPI_CHECK(MPI_Free_mem(request_buffer_)); - } - - void Init(const Rendezvous::ParsedKey& parsed, const int64 step_id) { - req_.set_step_id(step_id); - req_.set_rendezvous_key(parsed.FullKey().data(), parsed.FullKey().size()); - req_.set_request_id(GetUniqueRequestId()); - request_buffer_size_ = req_.ByteSize(); - // request_buffer_ = new char[request_buffer_size_]; - // req_.SerializeToArray(request_buffer_, request_buffer_size_); - } -}; - -class MPIRemoteRendezvous : public BaseRemoteRendezvous { - public: - MPIRemoteRendezvous(const WorkerEnv* env, int64 step_id, const MPIUtils* util, - BaseRendezvousMgr* mgr_) - : BaseRemoteRendezvous(env, step_id), - mpiutils_(util), - rendezvous_mgr_(mgr_) {} - - protected: - void RecvFromRemoteAsync(const Rendezvous::ParsedKey& parsed, - const Rendezvous::Args& args, - DoneCallback done) override; - - private: - ~MPIRemoteRendezvous() override; - - const MPIUtils* mpiutils_; - BaseRendezvousMgr* rendezvous_mgr_; - - TF_DISALLOW_COPY_AND_ASSIGN(MPIRemoteRendezvous); -}; - -class MPIRendezvousMgr : public BaseRendezvousMgr { - public: - explicit MPIRendezvousMgr(const WorkerEnv* env); - ~MPIRendezvousMgr() { - delete 
mpiutils_; - fprintf(stderr, "Delete MPIRendezvousMgr \n"); - // TODO(jbedorf) stop background_thread_ - MPI_CHECK(MPI_Finalize()); - } - - void QueueRequest(std::string key, int64 step_id, - std::function request_call, - MPIRequestTensorCall* rCall) { - mutex_lock l(mrq_); - request_queue_.push(RequestQueueEntry(key, std::move(request_call))); - const std::string key_id = strings::StrCat(key, "_", step_id); - recv_tensor_map_[key_id] = std::shared_ptr(rCall); - } - - protected: - BaseRemoteRendezvous* Create(int64 step_id, - const WorkerEnv* worker_env) override; - - private: - typedef std::function - MPIRecvTensorCallBack; - - typedef std::pair> RequestQueueEntry; - typedef std::pair> - SendQueueEntry; - - const WorkerEnv* worker_env_2; - std::thread background_thread_; - MPIUtils* mpiutils_; - bool use_optimal_transfer_; - - mutex msq_; - mutex mrq_; - - std::queue send_queue_ GUARDED_BY(msq_); - std::queue request_queue_ GUARDED_BY(mrq_); - std::map> recv_tensor_map_ - GUARDED_BY(mrq_); - - RecentRequestIds recv_tensor_recent_request_ids_; - - void AddRequest(RecvTensorRequest, const int); - void MPIBackgroundThread(); - - void QueueSendRequest(SendQueueEntry req) { - mutex_lock l(msq_); - send_queue_.push(req); - } - - void GetRecvCall(const int64 step_id, const std::string& key, - std::shared_ptr* call) { - mutex_lock l(mrq_); - - const std::string key_id = strings::StrCat(key, "_", step_id); - if (recv_tensor_map_.find(key_id) == recv_tensor_map_.end()) { - LOG(FATAL) << "Key/step not found in recv_tensor_map_, step: " << step_id - << " key: " << key << std::endl; - } - *call = recv_tensor_map_[key_id]; - } - - void RemoveRecvCall(const int64 step_id, const std::string& key) { - mutex_lock l(mrq_); - const std::string key_id = strings::StrCat(key, "_", step_id); - recv_tensor_map_.erase(key_id); - } - - bool GetRequest(RequestQueueEntry* req) { - mutex_lock l(mrq_); - if (!request_queue_.empty()) { - *req = request_queue_.front(); - request_queue_.pop(); - return true; - } - return false; - } - - bool GetResponse(SendQueueEntry* send) { - mutex_lock l(msq_); - if (!send_queue_.empty()) { - *send = send_queue_.front(); - send_queue_.pop(); - return true; - } - return false; - } - - template - int ProbeForData(const int tag, MPI_Status* status, T* obj) { - int flag = 0, msg_size = 0; - MPI_Message msg; - // Receive the message, probe as size is variable - MPI_CHECK( - MPI_Improbe(MPI_ANY_SOURCE, tag, MPI_COMM_WORLD, &flag, &msg, status)); - if (flag) { - MPI_CHECK(MPI_Get_count(status, MPI_CHAR, &msg_size)); - MPI_Status stat2; - std::vector request_buffer_(msg_size); - MPI_Mrecv(&request_buffer_[0], msg_size, MPI_CHAR, &msg, &stat2); - bool res = obj->ParseFromArray(&request_buffer_[0], msg_size); - CHECK(res) << "Failed to parse incomming message"; - } - return flag; - } - - TF_DISALLOW_COPY_AND_ASSIGN(MPIRendezvousMgr); -}; // MPIRendezvousMgr -} // namespace tensorflow - -#endif // TENSORFLOW_USE_MPI -#endif // TENSORFLOW_CONTRIB_MPI_MPI_RENDEZVOUS_MGR_H_ diff --git a/tensorflow/contrib/mpi/mpi_server_lib.cc b/tensorflow/contrib/mpi/mpi_server_lib.cc deleted file mode 100644 index e44e10af081..00000000000 --- a/tensorflow/contrib/mpi/mpi_server_lib.cc +++ /dev/null @@ -1,115 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifdef TENSORFLOW_USE_MPI - -#include "tensorflow/contrib/mpi/mpi_server_lib.h" - -#include -#include - -#include "grpc/support/alloc.h" - -#include "tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.h" -#include "tensorflow/core/distributed_runtime/server_lib.h" -#include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/platform/env.h" - -namespace tensorflow { - -namespace { -// static utility function -RendezvousMgrInterface* NewMPIRendezvousMgr(const WorkerEnv* env) { - // Runtime check to disable the MPI path - const char* mpienv = getenv("MPI_DISABLED"); - if (mpienv && mpienv[0] == '1') { - LOG(INFO) << "MPI path disabled by environment variable\n"; - return new RpcRendezvousMgr(env); - } else { - return new MPIRendezvousMgr(env); - } -} - -} // namespace - -MPIServer::MPIServer(const ServerDef& server_def, Env* env) - : GrpcServer(server_def, env) {} - -MPIServer::~MPIServer() { - TF_CHECK_OK(Stop()); - TF_CHECK_OK(Join()); -} - -Status MPIServer::Init(ServiceInitFunction service_func, - RendezvousMgrCreationFunction rendezvous_mgr_func) { - GrpcServerOptions opts; - opts.service_func = service_func; - opts.rendezvous_mgr_func = rendezvous_mgr_func; - Status s = GrpcServer::Init(opts); - return s; -} - -Status MPIServer::Start() { - Status s = GrpcServer::Start(); - return s; -} - -Status MPIServer::Join() { - Status s = GrpcServer::Join(); - return s; -} - -/* static */ -Status MPIServer::Create(const ServerDef& server_def, Env* env, - std::unique_ptr* out_server) { - std::unique_ptr ret(new MPIServer(server_def, Env::Default())); - ServiceInitFunction service_func = nullptr; - TF_RETURN_IF_ERROR(ret->Init(service_func, NewMPIRendezvousMgr)); - *out_server = std::move(ret); - return Status::OK(); -} - -namespace { - -class MPIServerFactory : public ServerFactory { - public: - bool AcceptsOptions(const ServerDef& server_def) override { - return server_def.protocol() == "grpc+mpi"; - } - - Status NewServer(const ServerDef& server_def, - std::unique_ptr* out_server) override { - return MPIServer::Create(server_def, Env::Default(), out_server); - } -}; - -// Registers a `ServerFactory` for `MPIServer` instances. -class MPIServerRegistrar { - public: - MPIServerRegistrar() { - gpr_allocation_functions alloc_fns; - alloc_fns.malloc_fn = port::Malloc; - alloc_fns.realloc_fn = port::Realloc; - alloc_fns.free_fn = port::Free; - gpr_set_allocation_functions(alloc_fns); - ServerFactory::Register("MPI_SERVER", new MPIServerFactory()); - } -}; -static MPIServerRegistrar registrar; - -} // namespace -} // namespace tensorflow - -#endif // TENSORFLOW_USE_MPI diff --git a/tensorflow/contrib/mpi/mpi_server_lib.h b/tensorflow/contrib/mpi/mpi_server_lib.h deleted file mode 100644 index 736f6922a15..00000000000 --- a/tensorflow/contrib/mpi/mpi_server_lib.h +++ /dev/null @@ -1,54 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_CONTRIB_MPI_MPI_SERVER_LIB_H_ -#define TENSORFLOW_CONTRIB_MPI_MPI_SERVER_LIB_H_ - -#ifdef TENSORFLOW_USE_MPI - -#include - -#include "tensorflow/contrib/mpi/mpi_rendezvous_mgr.h" -#include "tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h" - -namespace tensorflow { - -class MPIServer : public GrpcServer { - protected: - MPIServer(const ServerDef& server_def, Env* env); - - public: - static Status Create(const ServerDef& server_def, Env* env, - std::unique_ptr* out_server); - - // Destruction is only supported in the factory method. Clean - // shutdown is not currently implemented for this server type. - ~MPIServer() override; - - // Implementations of ServerInterface methods. - Status Start() override; - Status Join() override; - - protected: - Status Init(ServiceInitFunction service_func, - RendezvousMgrCreationFunction rendezvous_mgr_func); - Status ChannelCacheFactory(const ServerDef& server_def, - GrpcChannelCache** channel_cache); -}; - -} // namespace tensorflow - -#endif // TENSORFLOW_USE_MPI -#endif // TENSORFLOW_CONTRIB_MPI_MPI_SERVER_LIB_H_ diff --git a/tensorflow/contrib/mpi/mpi_utils.cc b/tensorflow/contrib/mpi/mpi_utils.cc deleted file mode 100644 index 8184b856264..00000000000 --- a/tensorflow/contrib/mpi/mpi_utils.cc +++ /dev/null @@ -1,72 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifdef TENSORFLOW_USE_MPI - -#include "tensorflow/contrib/mpi/mpi_utils.h" -namespace tensorflow { - -#define max_worker_name_length 128 - -MPIUtils::MPIUtils(const std::string& worker_name) { - InitMPI(); - // Connect the MPI process IDs to the worker names that are used by TF. 
- // Gather the names of all the active processes (name can't be longer than - // 128 bytes) - int proc_id = 0, number_of_procs = 1; - char my_name[max_worker_name_length]; - MPI_CHECK(MPI_Comm_rank(MPI_COMM_WORLD, &proc_id)); - MPI_CHECK(MPI_Comm_size(MPI_COMM_WORLD, &number_of_procs)); - - CHECK(worker_name.size() < max_worker_name_length) - << "Specified worker name is too long."; - snprintf(my_name, max_worker_name_length, worker_name.c_str()); - std::vector worker_names(number_of_procs * max_worker_name_length); - MPI_CHECK(MPI_Allgather(my_name, max_worker_name_length, MPI_CHAR, - &worker_names[0], max_worker_name_length, MPI_CHAR, - MPI_COMM_WORLD)); - - if (proc_id == 0) LOG(INFO) << "MPI process-ID to gRPC server name map: \n"; - for (int i = 0; i < number_of_procs; i++) { - name_to_id_[std::string(&worker_names[i * 128])] = i; - if (proc_id == 0) - LOG(INFO) << "Process: " << i - << "\tgRPC-name: " << std::string(&worker_names[i * 128]) - << std::endl; - } -} - -void MPIUtils::InitMPI() { - // Initialize the MPI environment if that hasn't been done - int flag = 0; - MPI_CHECK(MPI_Initialized(&flag)); - if (!flag) { - int proc_id = 0, number_of_procs = 1, len = -1; - char my_host_name[max_worker_name_length]; - // MPI_CHECK(MPI_Init_thread(0, 0, MPI_THREAD_MULTIPLE, &flag)); - MPI_CHECK(MPI_Init(0, 0)); - MPI_CHECK(MPI_Comm_rank(MPI_COMM_WORLD, &proc_id)); - MPI_CHECK(MPI_Comm_size(MPI_COMM_WORLD, &number_of_procs)); - MPI_CHECK(MPI_Get_processor_name(my_host_name, &len)); - fprintf(stderr, - "MPI Environment initialized. Process id: %d Total processes: %d " - "|| Hostname: %s \n", - proc_id, number_of_procs, my_host_name); - } -} - -} // namespace tensorflow - -#endif // TENSORFLOW_USE_MPI diff --git a/tensorflow/contrib/mpi/mpi_utils.h b/tensorflow/contrib/mpi/mpi_utils.h deleted file mode 100644 index 4091925fc0d..00000000000 --- a/tensorflow/contrib/mpi/mpi_utils.h +++ /dev/null @@ -1,63 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#ifndef TENSORFLOW_CONTRIB_MPI_MPI_UTILS_H_ -#define TENSORFLOW_CONTRIB_MPI_MPI_UTILS_H_ - -#ifdef TENSORFLOW_USE_MPI - -#include -#include -#include - -#include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/lib/strings/str_util.h" - -// Skip MPI C++ bindings support, this matches the usage in other places -#define OMPI_SKIP_MPICXX -#include "third_party/mpi/mpi.h" -#define MPI_CHECK(cmd) \ - do { \ - int mpi_errno = cmd; \ - if (MPI_SUCCESS != mpi_errno) { \ - fprintf(stderr, "[%s:%d] MPI call failed with %d \n", __FILE__, \ - __LINE__, mpi_errno); \ - exit(EXIT_FAILURE); \ - } \ - assert(MPI_SUCCESS == mpi_errno); \ - } while (false) - -namespace tensorflow { -class MPIUtils { - public: - explicit MPIUtils(const std::string& worker_name); - - const int GetSourceID(const std::string& task_id) const { - auto it = name_to_id_.find(task_id); - if (it == name_to_id_.end()) { - LOG(FATAL) << "Failed to convert worker name to MPI index: " << task_id; - } - return it->second; - } - - private: - void InitMPI(); - - std::map name_to_id_; -}; -} // namespace tensorflow - -#endif // TENSORFLOW_USE_MPI -#endif // TENSORFLOW_CONTRIB_MPI_MPI_UTILS_H_ diff --git a/tensorflow/contrib/mpi_collectives/BUILD b/tensorflow/contrib/mpi_collectives/BUILD deleted file mode 100644 index 5e848c9e7cf..00000000000 --- a/tensorflow/contrib/mpi_collectives/BUILD +++ /dev/null @@ -1,128 +0,0 @@ -# Ops that communicate with other processes via MPI. - -package(default_visibility = [ - "//tensorflow:__subpackages__", -]) - -licenses(["notice"]) # Apache 2.0 - -load( - "//tensorflow/core:platform/default/build_config.bzl", - "tf_additional_mpi_lib_defines", - "tf_proto_library_cc", -) - -tf_proto_library_cc( - name = "mpi_message_proto", - srcs = ["mpi_message.proto"], - cc_api_version = 2, - protodeps = ["//tensorflow/core:protos_all"], - visibility = [ - "//tensorflow:__subpackages__", - ], -) - -cc_library( - name = "mpi_defines", - defines = tf_additional_mpi_lib_defines(), -) - -load( - "//tensorflow:tensorflow.bzl", - "tf_custom_op_library", - "tf_custom_op_py_library", - "tf_gen_op_libs", - "tf_gen_op_wrapper_py", - "tf_kernel_library", - "tf_py_test", -) - -tf_custom_op_library( - name = "python/ops/_mpi_ops.so", - srcs = [ - "kernels/mpi_ops.cc", - "kernels/ring.cc", - "kernels/ring.h", - "ops/mpi_ops.cc", - ], - gpu_srcs = [ - "kernels/ring.cu.cc", - "kernels/ring.h", - ], - deps = [ - ":mpi_defines", - ":mpi_message_proto_cc", - "//third_party/mpi", - ], -) - -tf_kernel_library( - name = "mpi_ops_kernels", - srcs = [ - "kernels/mpi_ops.cc", - "kernels/ring.cc", - ], - hdrs = [ - "kernels/ring.h", - ], - gpu_srcs = [ - "kernels/ring.cu.cc", - ], - deps = [ - ":mpi_defines", - "//tensorflow/core:core_cpu", - "//tensorflow/core:framework", - "//tensorflow/core:gpu_headers_lib", - "//tensorflow/core:lib", - "//tensorflow/core:proto_text", - "//tensorflow/core:stream_executor", - ], - # TODO: Include? 
alwayslink = 1, -) - -tf_gen_op_libs( - op_lib_names = ["mpi_ops"], -) - -tf_gen_op_wrapper_py( - name = "mpi_ops", - deps = [":mpi_ops_op_lib"], -) - -tf_custom_op_py_library( - name = "mpi_collectives_py", - srcs = [ - "__init__.py", - "python/ops/mpi_ops.py", - ], - dso = [ - ":python/ops/_mpi_ops.so", - ], - kernels = [ - ":mpi_ops_kernels", - ":mpi_ops_op_lib", - ], - srcs_version = "PY2AND3", - visibility = ["//visibility:public"], - deps = [ - ":mpi_ops", - "//tensorflow/contrib/util:util_py", - "//tensorflow/python:device", - "//tensorflow/python:framework_ops", - "//tensorflow/python:platform", - "//tensorflow/python:util", - ], -) - -tf_py_test( - name = "mpi_ops_test", - srcs = ["mpi_ops_test.py"], - additional_deps = [ - "//tensorflow:tensorflow_py", - "//tensorflow/python:platform", - ], - data = [ - ":python/ops/_mpi_ops.so", - ], - tags = ["manual"], -) diff --git a/tensorflow/contrib/mpi_collectives/README.md b/tensorflow/contrib/mpi_collectives/README.md deleted file mode 100644 index c5e1a8c37e3..00000000000 --- a/tensorflow/contrib/mpi_collectives/README.md +++ /dev/null @@ -1,5 +0,0 @@ -# MPI TensorFlow integration - -Tensorflow MPI integration allows communicating between different TensorFlow -processes using MPI. This enables training across multiple nodes and GPUs -using high-speed interconnects. diff --git a/tensorflow/contrib/mpi_collectives/__init__.py b/tensorflow/contrib/mpi_collectives/__init__.py deleted file mode 100644 index 52029cbc36a..00000000000 --- a/tensorflow/contrib/mpi_collectives/__init__.py +++ /dev/null @@ -1,275 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -# pylint: disable=g-short-docstring-punctuation -"""## Communicating Between Processes with MPI - -TensorFlow natively provides inter-device communication through send and -receive ops and inter-node communication through Distributed TensorFlow, based -on the same send and receive abstractions. On HPC clusters where Infiniband or -other high-speed node interconnects are available, these can end up being -insufficient for synchronous data-parallel training (without asynchronous -gradient descent). This module implements a variety of MPI ops which can take -advantage of hardware-specific MPI libraries for efficient communication. - -In order to use this module, TensorFlow must be built with an MPI library, -which can be provided to the `./configure` script at build time. As a user of -TensorFlow, you will need to build TensorFlow yourself to select the MPI -library to use; to do so, follow the [instructions for building TensorFlow from -source](https://www.tensorflow.org/get_started/os_setup#installing_from_sources). - -### Utility Ops - -In addition to reductions and gathers, this module provides utility operations -for detecting the running MPI configuration. 
- -Example: - -```python -import tensorflow.contrib.mpi_collectives as mpi - -# Use `mpi.Session` instead of `tf.Session` -with mpi.Session() as session: - rank = session.run(mpi.rank()) - print("My MPI Rank:", rank) - - if rank == 0: - print("MPI Size:", session.run(mpi.size())) -``` - -@@init -@@size -@@rank -@@local_rank - -### Ring Allreduce and Allgather - -When summing or averaging tensors across many processes, communication can -easily become a bottleneck. A naive implementation will send all the tensor -values to the same process, perform the reduction, and then broadcast the -values back to all other processes, effectively creating a synchronous -parameter server in one process. However, the process responsible for -performing the reduction will have to receive and send a massive amount of data -which scales with the number of processes *and* the number of parameters in the -model. - -Instead of centralizing the reduction and having one primary reducer, we can -implement a distributed allreduce or allgather. A bandwidth-optimal allreduce -will end up sending 2(N - 1) values for every value in the input tensor, -and can be implemented with a ring allreduce [1]. (Intuitively, a linear reduce -requires at least (N - 1) sends between the different nodes, and a broadcast of -the result also requires (N - 1) sends, for a total of 2 (N - 1); these two -steps cannot be combined in a clever way to reduce the number of required -sends.) This module implements bandwidth-optimal ring allreduce and ring -allgather operations using MPI; by choosing a hardware-appropriate MPI -implementation (such as OpenMPI with CUDA-IPC support), you can train large -models with synchronous gradient descent with minimal communication overhead. - -In addition to the `allreduce` and `allgather` functions, a convenience -`DistributedOptimizer` wrapper is provided to simplify using these functions -for reducing model gradients. - -Example: - -```python -import tensorflow as tf -from tensorflow.contrib import mpi_collectives as mpi - -# Construct a simple linear regression model to optimize -W = tf.get_variable("W", shape=[20, 1], dtype=tf.float32) -B = tf.get_variable("B", shape=[1, 1], dtype=tf.float32) -inputs = tf.placeholder("Inputs", shape=[None, 20]) -outputs = tf.placeholder("Outputs", shape=[None, 1]) -loss = tf.nn.l2_loss(tf.matmul(inputs, W) + B - outputs) - -# Training using MPI allreduce with DistributedOptimizer -optimizer = mpi.DistributedOptimizer(tf.train.AdamOptimizer()) -train = optimizer.minimize(loss) - -# Average loss over all ranks, for printing. -# Do not pass this to an optimizer! -avg_loss = mpi.allreduce(loss) - -# On different ranks, feed different input data. -with mpi.Session() as session: - rank = session.run(mpi.rank()) - batch_inputs, batch_outputs = construct_batch_for_rank(rank) - feed_dict = {inputs: batch_inputs, outputs: batch_outputs} - _, l = session.run([train, avg_loss], feed_dict=feed_dict) - print("Average Loss:", l) -``` - -[1] Patarasuk, Pitch and Yuan, Xin. "Bandwidth Optimal All-reduce Algorithms -for Clusters of Workstations". 
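As an aside, the scatter-reduce / allgather pattern described above can be simulated in a few lines of NumPy. The sketch below is purely illustrative and uses our own naming (`simulate_ring_allreduce` is not part of this module); the production path is the MPI/CUDA implementation in `kernels/ring.cc` and `kernels/ring.cu.cc`.

```python
import numpy as np

def simulate_ring_allreduce(rank_vectors):
    """Simulate the ring allreduce pattern for N = len(rank_vectors) ranks.

    Each entry of `rank_vectors` is that rank's equal-length 1-D array.
    Returns the summed vector as every rank would see it after the
    scatter-reduce and allgather phases.
    """
    n = len(rank_vectors)
    # Every rank splits its vector into n chunks; chunk i is fully reduced
    # at rank (i - 1) % n after the scatter-reduce phase.
    chunks = [list(np.array_split(np.array(v, dtype=np.float64), n))
              for v in rank_vectors]

    # Phase 1: scatter-reduce. In each of the n-1 steps, rank r sends chunk
    # (r - step) % n to rank (r + 1) % n, which adds it to its own copy.
    for step in range(n - 1):
        for r in range(n):
            idx = (r - step) % n
            chunks[(r + 1) % n][idx] = chunks[(r + 1) % n][idx] + chunks[r][idx]

    # Phase 2: allgather. In each of the n-1 steps, rank r forwards its fully
    # reduced chunk (r + 1 - step) % n to rank (r + 1) % n, which overwrites
    # its own copy.
    for step in range(n - 1):
        for r in range(n):
            idx = (r + 1 - step) % n
            chunks[(r + 1) % n][idx] = chunks[r][idx].copy()

    # All ranks now hold identical, fully reduced data; return rank 0's view.
    return np.concatenate(chunks[0])

# Sanity check: four simulated ranks, eight values each.
vectors = [np.arange(8.0) * (r + 1) for r in range(4)]
assert np.allclose(simulate_ring_allreduce(vectors), sum(vectors))
```

Each simulated rank sends one chunk per step, so over both phases it transmits roughly 2(N - 1)/N times its input size, which is what makes the ring schedule bandwidth-optimal.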
- -@@Session -@@DistributedOptimizer -@@allreduce -@@allgather -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow as tf - -from tensorflow.contrib.mpi_collectives.python.ops.mpi_ops import init -from tensorflow.contrib.mpi_collectives.python.ops.mpi_ops import size -from tensorflow.contrib.mpi_collectives.python.ops.mpi_ops import rank -from tensorflow.contrib.mpi_collectives.python.ops.mpi_ops import local_rank -from tensorflow.contrib.mpi_collectives.python.ops.mpi_ops import allgather -from tensorflow.contrib.mpi_collectives.python.ops.mpi_ops import _allreduce - - -def allreduce(tensor, average=True): - """Perform an MPI allreduce on a tf.Tensor or tf.IndexedSlices. - - Arguments: - tensor: tf.Tensor, tf.Variable, or tf.IndexedSlices to reduce. - The shape of the input must be identical across all ranks. - average: If True, computes the average over all ranks. - Otherwise, computes the sum over all ranks. - - This function performs a bandwidth-optimal ring allreduce on the input - tensor. If the input is an tf.IndexedSlices, the function instead does an - allgather on the values and the indices, effectively doing an allreduce on - the represented tensor. - """ - if isinstance(tensor, tf.IndexedSlices): - # For IndexedSlices, do two allgathers intead of an allreduce. - mpi_size = tf.cast(size(), tensor.values.dtype) - values = allgather(tensor.values) - indices = allgather(tensor.indices) - - # To make this operation into an average, divide all gathered values by - # the MPI size. - new_values = tf.div(values, mpi_size) if average else values - return tf.IndexedSlices(new_values, indices, - dense_shape=tensor.dense_shape) - else: - mpi_size = tf.cast(size(), tensor.dtype) - summed_tensor = _allreduce(tensor) - new_tensor = (tf.div(summed_tensor, mpi_size) - if average else summed_tensor) - return new_tensor - - -class DistributedOptimizer(tf.train.Optimizer): - """An optimizer that wraps another tf.Optimizer, using an MPI allreduce to - average gradient values before applying gradients to model weights.""" - - def __init__(self, optimizer, name=None, use_locking=False): - """Construct a new DistributedOptimizer, which uses another optimizer - under the hood for computing single-process gradient values and - applying gradient updates after the gradient values have been averaged - across all the MPI ranks. - - Args: - optimizer: Optimizer to use for computing gradients and applying updates. - name: Optional name prefix for the operations created when applying - gradients. Defaults to "Distributed" followed by the provided - optimizer type. - use_locking: Whether to use locking when updating variables. See - Optimizer.__init__ for more info. - """ - if name is None: - name = "Distributed{}".format(type(optimizer).__name__) - - self._optimizer = optimizer - super(DistributedOptimizer, self).__init__( - name=name, use_locking=use_locking) - - def compute_gradients(self, *args, **kwargs): - """Compute gradients of all trainable variables. - - See Optimizer.compute_gradients() for more info. - - In DistributedOptimizer, compute_gradients() is overridden to also - allreduce the gradients before returning them. 
- """ - gradients = (super(DistributedOptimizer, self) - .compute_gradients(*args, **kwargs)) - return [(allreduce(gradient), var) for (gradient, var) in gradients] - - def _apply_dense(self, *args, **kwargs): - """Calls this same method on the underlying optimizer.""" - return self._optimizer._apply_dense(*args, **kwargs) - - def _apply_sparse(self, *args, **kwargs): - """Calls this same method on the underlying optimizer.""" - return self._optimizer._apply_sparse(*args, **kwargs) - - def _apply_sparse_duplicate_indices(self, *args, **kwargs): - """Calls this same method on the underlying optimizer.""" - return self._optimizer._apply_sparse_duplicate_indices(*args, - **kwargs) - - def _prepare(self, *args, **kwargs): - """Calls this same method on the underlying optimizer.""" - return self._optimizer._prepare(*args, **kwargs) - - def _create_slots(self, *args, **kwargs): - """Calls this same method on the underlying optimizer.""" - return self._optimizer._create_slots(*args, **kwargs) - - def _valid_dtypes(self, *args, **kwargs): - """Calls this same method on the underlying optimizer.""" - return self._optimizer._valid_dtypes(*args, **kwargs) - - def _finish(self, *args, **kwargs): - """Calls this same method on the underlying optimizer.""" - return self._optimizer._finish(*args, **kwargs) - - -class Session(tf.Session): - """A class for running TensorFlow operations, with copies of the same graph - running distributed across different MPI nodes. - - The primary difference between `tf.Session` and - `tf.contrib.mpi_collectives.Session` is that the MPI `Session` ensures that - the `Session` options are correct for use with `tf.contrib.mpi`, and - initializes MPI immediately upon the start of the session. - """ - - def __init__(self, target='', graph=None, config=None): - """Creates a new TensorFlow MPI session. - - Unlike a normal `tf.Session`, an MPI Session may only use a single GPU, - which must be specified in advance before the session is initialized. - In addition, it only uses a single graph evaluation thread, and - initializes MPI immediately upon starting. - - If no `graph` argument is specified when constructing the session, - the default graph will be launched in the session. If you are - using more than one graph (created with `tf.Graph()` in the same - process, you will have to use different sessions for each graph, - but each graph can be used in multiple sessions. In this case, it - is often clearer to pass the graph to be launched explicitly to - the session constructor. - - Args: - target: (Optional.) The execution engine to connect to. - graph: (Optional.) The `Graph` to be launched (described above). - config: (Optional.) A `ConfigProto` protocol buffer with configuration - options for the session. - """ - super(Session, self).__init__(target, graph, config=config) - - # Initialize MPI on the relevant device. - # TODO: Move this to library load and eliminate mpi.Session() - if graph is None: - graph = tf.get_default_graph() - with graph.as_default(): - self.run(init()) diff --git a/tensorflow/contrib/mpi_collectives/kernels/mpi_ops.cc b/tensorflow/contrib/mpi_collectives/kernels/mpi_ops.cc deleted file mode 100644 index e4b0c2c6541..00000000000 --- a/tensorflow/contrib/mpi_collectives/kernels/mpi_ops.cc +++ /dev/null @@ -1,1132 +0,0 @@ -/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifdef TENSORFLOW_USE_MPI - -#include -#include -#include - -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/types.pb.h" -#include "tensorflow/core/platform/mutex.h" - -#define EIGEN_USE_THREADS - -#if GOOGLE_CUDA -#include -#include "tensorflow/stream_executor/stream.h" -#endif - -#include "tensorflow/stream_executor/lib/statusor.h" - -#define OMPI_SKIP_MPICXX -#include "third_party/mpi/mpi.h" -#include "tensorflow/contrib/mpi_collectives/kernels/ring.h" -#include "tensorflow/contrib/mpi_collectives/mpi_message.pb.h" - -/* - * MPI Allreduce and Allgather Ops for TensorFlow. - * - * TensorFlow natively provides inter-device communication through send and - * receive ops and inter-node communication through Distributed TensorFlow, - * based on the same send and receive abstractions. These end up being - * insufficient for synchronous data-parallel training on HPC clusters where - * Infiniband or other high-speed interconnects are available. This module - * implements MPI ops for allgather and allreduce, which do bandwidth-optimal - * gathers and reductions and can take advantage of hardware-optimized - * communication libraries through the MPI implementation. - * - * The primary logic of the allreduce and allgather are in RingAllgather() and - * RingAllreduce(). The background thread which facilitates MPI operations is - * run in BackgroundThreadLoop(). The provided MPI ops are: - * – MPIInit: - * Initialize MPI on a given device (CPU or GPU). - * Should only be run on a single device in every process. - * – MPISize: - * Get the number of MPI processes in the global communicator. - * – MPIRank: - * Get the rank of the current MPI process in the global communicator. - * – MPILocalRank: - * Get the local rank of the current MPI process within its node. - * – MPIAllreduce: - * Perform an allreduce on a Tensor, returning the sum - * across all MPI processes in the global communicator. - * – MPIAllgather: - * Perform an allgather on a Tensor, returning the concatenation of - * the tensor on the first dimension across all MPI processes in the - * global communicator. - * - */ - -template -using StatusOr = stream_executor::port::StatusOr; - -using CPUDevice = Eigen::ThreadPoolDevice; -using GPUDevice = Eigen::GpuDevice; - -namespace tensorflow { -namespace contrib { -namespace mpi_collectives { - -// Make sure template specializations are generated in the ring.cu.cc and the -// ring.cc file, not in this file. 
-extern template Status RingAllreduce(OpKernelContext*, - const Tensor*, Tensor*, - Tensor*); -extern template Status RingAllreduce(OpKernelContext*, - const Tensor*, - Tensor*, Tensor*); -extern template Status RingAllreduce(OpKernelContext*, - const Tensor*, Tensor*, - Tensor*); -extern template Status RingAllgather(OpKernelContext*, - const Tensor*, - const std::vector&, - Tensor*); -extern template Status RingAllgather( - OpKernelContext*, const Tensor*, const std::vector&, Tensor*); -extern template Status RingAllgather( - OpKernelContext*, const Tensor*, const std::vector&, Tensor*); -extern template Status RingAllreduce(OpKernelContext*, - const Tensor*, Tensor*, - Tensor*); -extern template Status RingAllreduce(OpKernelContext*, - const Tensor*, - Tensor*, Tensor*); -extern template Status RingAllreduce(OpKernelContext*, - const Tensor*, Tensor*, - Tensor*); -extern template Status RingAllgather(OpKernelContext*, - const Tensor*, - const std::vector&, - Tensor*); -extern template Status RingAllgather( - OpKernelContext*, const Tensor*, const std::vector&, Tensor*); -extern template Status RingAllgather( - OpKernelContext*, const Tensor*, const std::vector&, Tensor*); - -namespace { - -// Return true if the templated type is GPUDevice, otherwise false. -template -bool IsGPUDevice(); -template <> -bool IsGPUDevice() { - return true; -}; -template <> -bool IsGPUDevice() { - return false; -}; - -// A callback to call after the MPI communication completes. Since the -// allreduce and allgather ops are asynchronous, this callback is what resumes -// computation after the reduction is completed. -typedef std::function)> CommunicationDoneCallback; - -struct CollectiveOpRecord { - // The rank performing this piece of the op - int rank; - - // The name of the op/tensor to be reduced - std::string name; - - // The op's kernel context - OpKernelContext* context; - - // Data type of the op - DataType dtype; - - // The input tensor - const Tensor* in_t; - - // Allgather: Vector of per-rank first-dimension sizes - std::vector sizes_vec; - - // The temp tensor for intermediate results - Tensor temp_t; - - // The output tensor - Tensor* out_t; - - // Whether to run this op on the gpu - bool on_gpu; - - // The callback to call after the op has completed - CommunicationDoneCallback callback; -}; - -// Table storing Tensors to be reduced, keyed by unique name. -// This table contains everything necessary to do the reduction -typedef std::unordered_map TensorTable; - -// Table for storing Tensor metadata on rank zero. This is used for error -// checking and size calculations, as well as determining when a reduction is -// ready to be done (when all nodes are ready to do it). -typedef std::unordered_map > MessageTable; - -// The global state required for the MPI ops. -// -// MPI is a library that stores a lot of global per-program state and often -// requires running on a single thread. As a result, we have to have a single -// background thread responsible for all MPI operations, and communicate with -// that background thread through global state. -struct MPIGlobalState { - // An atomic boolean which is set to true when MPI is initialized. - // This ensures that MPI_Init is never called twice. - std::atomic_flag initialized_flag = ATOMIC_FLAG_INIT; - - // Condition variable to wait for initialization - condition_variable cv; - - // Whether MPI_Init has been completed on the background thread. - bool initialization_done = false; - - // Whether MPI_Init succeeded on the background thread. 
- Status init_status; - - // A mutex that needs to be used whenever MPI operations touch - // shared structures. - mutex mu; - - // Tensors waiting to be allreduced or allgathered. - TensorTable tensor_table; - - // Queue of MPI requests waiting to be sent to the coordinator node. - std::queue message_queue; - - // Background thread running MPI communication. - std::thread background_thread; - - // Whether the background thread should shutdown. - bool shut_down = false; - - // Only exists on the coordinator node (rank zero). Maintains a count of - // how many nodes are ready to allreduce every tensor (keyed by tensor - // name). - std::unique_ptr message_table; - - // The MPI rank, local rank, and size. - int rank = 0; - int local_rank = 0; - int size = 1; - - // The device that MPI was initialized on. (-1 for no GPU) - int device = -1; - - // The CUDA stream used for data transfers and within-allreduce operations. - // A naive implementation would use the TensorFlow StreamExecutor CUDA - // stream. However, the allreduce and allgather require doing memory copies - // and kernel executions (for accumulation of values on the GPU). However, - // the subsequent operations must wait for those operations to complete, - // otherwise MPI (which uses its own stream internally) will begin the data - // transfers before the CUDA calls are complete. In order to wait for those - // CUDA operations, if we were using the TensorFlow stream, we would have - // to synchronize that stream; however, other TensorFlow threads may be - // submitting more work to that stream, so synchronizing on it can cause - // the allreduce to be delayed, waiting for compute totally unrelated to it - // in other parts of the graph. Overlaying memory transfers and compute - // during backpropagation is crucial for good performance, so we cannot use - // the TensorFlow stream, and must use our own stream. -#if GOOGLE_CUDA - cudaStream_t stream; - std::atomic_flag stream_created_flag = ATOMIC_FLAG_INIT; -#endif - - ~MPIGlobalState() { - // Make sure that the destructor of the background thread is safe to - // call. If a thread is still joinable (not detached or complete) its - // destructor cannot be called. - if (background_thread.joinable()) { - shut_down = true; - background_thread.join(); - } - } -}; - -// All the MPI state that must be stored globally per-process. -static MPIGlobalState mpi_global; - -// For clarify in argument lists. -#define RANK_ZERO 0 - -// A tag used for all coordinator messaging. -#define TAG_NOTIFY 1 - -// Store the MPIRequest for a name, and return whether the total count of -// MPIRequests for that tensor is now equal to the MPI size (and thus we are -// ready to reduce the tensor). -bool IncrementTensorCount(std::unique_ptr& message_table, - MPIRequest msg, int mpi_size) { - auto name = msg.tensor_name(); - auto table_iter = message_table->find(name); - if (table_iter == message_table->end()) { - message_table->emplace(name, std::vector({msg})); - table_iter = message_table->find(name); - } else { - table_iter->second.push_back(msg); - } - - int count = table_iter->second.size(); - return count == mpi_size; -} - -// Once a tensor is ready to be reduced, the coordinator sends an MPIResponse -// instructing all ranks to start the reduction to all ranks. The MPIResponse -// also contains error messages in case the submitted MPIRequests were not -// valid (for example, contained mismatched shapes or types). -// -// Constructing the MPIResponse, thus, requires a whole lot of error checking. 
-MPIResponse ConstructMPIResponse(std::unique_ptr& message_table, - std::string name) { - bool error = false; - auto it = message_table->find(name); - assert(it != message_table->end()); - - std::vector requests = it->second; - assert(requests.size() > 0); - - std::ostringstream error_message_stream; - - // Check that all data types being reduced or gathered are identical - auto data_type = requests[0].tensor_type(); - for (unsigned int i = 1; i < requests.size(); i++) { - auto request_type = requests[i].tensor_type(); - if (data_type != request_type) { - error = true; - error_message_stream << "Mismatched data types: One rank had type " - << DataType_Name(data_type) - << ", but another rank had type " - << DataType_Name(request_type) << "."; - break; - } - } - - // Check that all requested operations are the same - auto message_type = requests[0].request_type(); - for (unsigned int i = 1; i < requests.size(); i++) { - if (error) { - break; - } - - auto request_type = requests[i].request_type(); - if (message_type != request_type) { - error = true; - error_message_stream << "Mismatched MPI operations: One rank did an " - << message_type << ", but another rank did an " - << request_type << "."; - break; - } - } - - // If we are doing an allreduce, check that all tensor shapes - // are identical - if (message_type == MPIRequest::ALLREDUCE) { - TensorShape tensor_shape = requests[0].tensor_shape(); - for (unsigned int i = 1; i < requests.size(); i++) { - if (error) { - break; - } - - TensorShape request_shape = requests[i].tensor_shape(); - if (tensor_shape != request_shape) { - error = true; - error_message_stream << "Mismatched allreduce tensor shapes: " - << "One rank reduced a tensor of shape " - << tensor_shape.DebugString() - << ", but another rank sent a tensor of shape " - << request_shape.DebugString() << "."; - break; - } - } - } - - // If we are doing an allgather, make sure all but the first dimension are - // the same. The first dimension may be different and the output tensor is - // the sum of the first dimension. Collect the sizes by rank. 
- if (message_type == MPIRequest::ALLGATHER) { - TensorShape tensor_shape = requests[0].tensor_shape(); - - if (tensor_shape.dims() == 0) { - error = true; - error_message_stream << "Rank zero tried to gather a rank-zero tensor."; - } - - for (unsigned int i = 1; i < requests.size(); i++) { - if (error) { - break; - } - - TensorShape request_shape = requests[i].tensor_shape(); - if (tensor_shape.dims() != request_shape.dims()) { - error = true; - error_message_stream << "Mismatched allgather tensor shapes: " - << "One rank gathered a tensor of rank " - << tensor_shape.dims() - << ", but another rank sent a tensor of rank " - << request_shape.dims() << "."; - break; - } - - for (unsigned int dim = 1; dim < tensor_shape.dims(); dim++) { - if (tensor_shape.dim_size(dim) != request_shape.dim_size(dim)) { - error = true; - error_message_stream - << "Mismatched allgather tensor shapes: " - << "One rank gathered a tensor with dimension " << dim - << " equal to " << tensor_shape.dim_size(dim) - << ", but another rank sent a tensor with dimension " << dim - << " equal to " << request_shape.dim_size(dim) << "."; - break; - } - } - } - } - - MPIResponse response; - response.set_tensor_name(name); - if (error) { - std::string error_message = error_message_stream.str(); - response.set_response_type(MPIResponse::ERROR); - response.set_error_message(error_message); - } else { - auto response_type = MPIResponse::ERROR; - if (message_type == MPIRequest::ALLREDUCE) { - response_type = MPIResponse::ALLREDUCE; - } else { - response_type = MPIResponse::ALLGATHER; - } - response.set_response_type(response_type); - } - - // Clear all queued up requests for this name. They are now taken care of - // by the constructed MPI response. - message_table->erase(it); - - return response; -} - -// Process an MPIResponse by doing a reduction, a gather, or raising an error. -void PerformCollectiveOp(TensorTable& tensor_table, MPIResponse response) { - OpKernelContext* context; - const Tensor* input_tensor; - std::vector sizes_vec; - Tensor temp_tensor; - Tensor* output_tensor; - CommunicationDoneCallback callback; - bool on_gpu; - { - // Lock on the tensor table. - mutex_lock guard(mpi_global.mu); - - // We should never fail at finding this key in the tensor table. - auto name = response.tensor_name(); - auto iter = tensor_table.find(name); - assert(iter != tensor_table.end()); - - assert(response.response_type() == MPIResponse::ALLREDUCE || - response.response_type() == MPIResponse::ALLGATHER || - response.response_type() == MPIResponse::ERROR); - - CollectiveOpRecord record = iter->second; - context = record.context; - input_tensor = record.in_t; - sizes_vec = record.sizes_vec; - temp_tensor = record.temp_t; - output_tensor = record.out_t; - on_gpu = record.on_gpu; - callback = record.callback; - - // Clear the tensor table of this tensor and its callbacks; the rest of - // this function takes care of it. - tensor_table.erase(iter); - } - - // Use CPUDevice instead of GPUDevice if no CUDA, to ensure we don't - // link to non-existent symbols. -#if GOOGLE_CUDA -#define GPU_DEVICE_IF_CUDA GPUDevice -#else -#define GPU_DEVICE_IF_CUDA CPUDevice -#endif - - Status status; - auto dtype = input_tensor->dtype(); - if (response.response_type() == MPIResponse::ALLGATHER) { - if (dtype == DT_FLOAT) { - status = on_gpu ? RingAllgather( - context, input_tensor, sizes_vec, output_tensor) - : RingAllgather( - context, input_tensor, sizes_vec, output_tensor); - } else if (dtype == DT_INT32) { - status = on_gpu ? 
RingAllgather( - context, input_tensor, sizes_vec, output_tensor) - : RingAllgather(context, input_tensor, - sizes_vec, output_tensor); - } else if (dtype == DT_INT64) { - status = on_gpu ? RingAllgather( - context, input_tensor, sizes_vec, output_tensor) - : RingAllgather( - context, input_tensor, sizes_vec, output_tensor); - } else { - status = errors::Unknown("Invalid tensor type for MPI allgather."); - } - } else if (response.response_type() == MPIResponse::ALLREDUCE) { - if (dtype == DT_FLOAT) { - status = on_gpu ? RingAllreduce( - context, input_tensor, &temp_tensor, output_tensor) - : RingAllreduce( - context, input_tensor, &temp_tensor, output_tensor); - } else if (dtype == DT_INT32) { - status = on_gpu ? RingAllreduce( - context, input_tensor, &temp_tensor, output_tensor) - : RingAllreduce( - context, input_tensor, &temp_tensor, output_tensor); - } else if (dtype == DT_INT64) { - status = on_gpu ? RingAllreduce( - context, input_tensor, &temp_tensor, output_tensor) - : RingAllreduce( - context, input_tensor, &temp_tensor, output_tensor); - } else { - status = errors::Unknown("Invalid tensor type for MPI allreduce."); - } - } else if (response.response_type() == MPIResponse::ERROR) { - status = errors::FailedPrecondition(response.error_message()); - } - - if (status.ok()) { - callback(StatusOr(*output_tensor)); - } else { - callback(StatusOr(status)); - } -} - -// The MPI background thread loop coordinates all the MPI processes and the -// tensor reductions. The design of the communicator mechanism is limited by a -// few considerations: -// -// 1. Some MPI implementations require all MPI calls to happen from a -// single thread. Since TensorFlow may use several threads for graph -// processing, this means we must have our own dedicated thread for -// dealing with MPI. -// 2. We want to gracefully handle errors, when MPI processes do not -// properly agree upon what should happen (such as mismatched types or -// shapes). To do so requires the MPI processes to know about the shapes -// and types of the relevant tensors on the other processes. -// 3. The MPI reductions and gathers should be able to happen in parallel -// with other ongoing operations. Since MPI uses an internal -// (inaccessible) GPU stream separate from the TF GPUDevice streams, we -// cannot explicitly synchronize memcpys or kernels with it. As a result, -// MPIAllreduce and MPIAllgather must be AsyncOpKernels to ensure proper -// ordering of memcpys and kernels with respect to TF streams. -// 4. NOTE: We cannot guarantee that all the MPI processes reduce their -// tensors in the same order. Thus, there must be a way to ensure the -// reduction memcpys and kernels occur for correct tensors across all -// ranks at the same time. We choose to use a coordinator (rank ID 0) to -// gather and trigger the reduction operations that are ready to execute. -// -// The coordinator currently follows a master-worker paradigm. Rank zero acts -// as the master (the "coordinator"), whereas all other ranks are simply -// workers. Each rank runs its own background thread which progresses in ticks. -// In each tick, the following actions happen: -// -// a) The workers send any available MPIRequests to the coordinator. These -// MPIRequests indicate what the worker would like to do (i.e. which -// tensor they would like to gather or reduce, as well as their shape and -// type). They repeat this for every tensor that they would like to -// operate on after that tensor's collective op has executed ComputeAsync. 
-// -// b) The workers send an empty "DONE" message to the coordinator to -// indicate that there are no more tensors they wish to operate on. -// -// c) The coordinator receives the MPIRequests from the workers, as well -// as from its own TensorFlow ops, and stores them in a request table. The -// coordinator continues to receive MPIRequest messages until it has -// received MPI_SIZE number of empty "DONE" messages. -// -// d) The coordinator finds all tensors that are ready to be reduced, -// gathered, or all operations that result in an error. For each of those, -// it sends an MPIResponse to all the workers. When no more MPIResponses -// are available, it sends a "DONE" response to the workers. If the -// process is being shutdown, it instead sends a "SHUTDOWN" response. -// -// e) The workers listen for MPIResponse messages, processing each one by -// doing the required reduce or gather, until they receive a "DONE" -// response from the coordinator. At that point, the tick ends. -// If instead of "DONE" they receive "SHUTDOWN", they exit their -// background loop. -// TODO: Use the global mpi_global state variable instead of a local one -void BackgroundThreadLoop() { -#if GOOGLE_CUDA - // Set the device, so that this thread uses the same GPU context as the - // calling thread. - // TODO: Ensure that this is operating correctly. The background thread - // needs to be able to control all GPUs that the rank has access to, and - // might be more than 1 GPU. Tensors could be resident in any of the - // GPUs, so the background thread's accumulate and copy kernels might need - // to correctly set the device and it might be necessary for the background - // thread to manage multiple streams. - cudaSetDevice(mpi_global.device); - cudaStreamCreate(&mpi_global.stream); -#endif - - // Initialize MPI. This must happen on the background thread, since not all - // MPI implementations support being called from multiple threads. - auto init_result = MPI_Init(NULL, NULL); - if (init_result != MPI_SUCCESS) { - mpi_global.init_status = - errors::Unknown("Could not initialize MPI; MPI_Init() failed."); - mpi_global.initialization_done = true; - mpi_global.cv.notify_all(); - return; - } else { - mpi_global.init_status = Status::OK(); - } - - // Get MPI rank to determine if we are rank zero. - int rank; - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - bool is_coordinator = rank == 0; - - // Get MPI size to determine how many tensors to wait for before reducing. - int size; - MPI_Comm_size(MPI_COMM_WORLD, &size); - - // Determine local rank by querying the local communicator. - MPI_Comm local_comm; - MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, - &local_comm); - int local_rank; - MPI_Comm_rank(local_comm, &local_rank); - - mpi_global.rank = rank; - mpi_global.local_rank = local_rank; - mpi_global.size = size; - mpi_global.initialization_done = true; - - // Notify calling thread that initialization is complete - mpi_global.cv.notify_all(); - - // TODO: MOVE MESSAGE TABLE INITIALIZATION TO LIBRARY LOAD! - // Initialize the tensor count table. No tensors are available yet. - if (is_coordinator) { - mpi_global.message_table = - std::unique_ptr(new MessageTable()); - } - - // The coordinator sends a SHUTDOWN message to trigger shutdown. - bool should_shut_down = false; - do { - // TODO: Eliminate the need for thread sleep by making all activity - // depend on other activity (e.g. condition or MPI waits). 
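// A standalone sketch of the zero-length "DONE" handshake from steps (a)-(e)
// above, stripped of the TensorFlow op machinery. The tag value mirrors
// TAG_NOTIFY; everything else (including the suggested binary name) is
// illustrative. Run with, e.g., `mpirun -np 4 ./done_handshake`.
#include <mpi.h>
#include <cstdio>

static const int kTagNotify = 1;

int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);
  int rank, size;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  if (rank == 0) {
    // Coordinator: a zero-length message counts as one rank's "DONE".
    int completed_ranks = 1;  // the coordinator itself is already done
    while (completed_ranks != size) {
      MPI_Status status;
      MPI_Probe(MPI_ANY_SOURCE, kTagNotify, MPI_COMM_WORLD, &status);
      int msg_length;
      MPI_Get_count(&status, MPI_BYTE, &msg_length);
      MPI_Recv(nullptr, 0, MPI_BYTE, status.MPI_SOURCE, kTagNotify,
               MPI_COMM_WORLD, MPI_STATUS_IGNORE);
      if (msg_length == 0) ++completed_ranks;
    }
    std::printf("coordinator: all %d ranks reported DONE\n", size);
  } else {
    // Worker: signal that it has nothing more to enqueue this tick.
    MPI_Send(nullptr, 0, MPI_BYTE, 0, kTagNotify, MPI_COMM_WORLD);
  }

  MPI_Finalize();
  return 0;
}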
- std::this_thread::sleep_for(std::chrono::milliseconds(1)); - - // Copy the data structures from global state under this lock. - // However, don't keep the lock for the rest of the loop, so that - // enqueued stream callbacks can continue. - std::queue message_queue; - { - mutex_lock guard(mpi_global.mu); - while (!mpi_global.message_queue.empty()) { - MPIRequest message = mpi_global.message_queue.front(); - mpi_global.message_queue.pop(); - message_queue.push(message); - } - } - - // Collect all tensors that are ready to be reduced. Record them in the - // tensor count table (rank zero) or send them to rank zero to be - // recorded (everyone else). - std::vector ready_to_reduce; - while (!message_queue.empty()) { - // Pop the first available message message - MPIRequest message = message_queue.front(); - message_queue.pop(); - - if (is_coordinator) { - bool reduce = - IncrementTensorCount(mpi_global.message_table, message, size); - if (reduce) { - ready_to_reduce.push_back(message.tensor_name()); - } - } else { - std::string encoded_message; - message.SerializeToString(&encoded_message); - MPI_Send(encoded_message.c_str(), encoded_message.length() + 1, - MPI_BYTE, RANK_ZERO, TAG_NOTIFY, MPI_COMM_WORLD); - } - } - - // Rank zero has put all its own tensors in the tensor count table. - // Now, it should count all the tensors that are coming from other - // ranks at this tick. It should keep getting tensors until it gets a - // DONE message from all the other ranks. - if (is_coordinator) { - // Count of DONE messages. Keep receiving messages until the number - // of messages is equal to the number of processes. Initialize to - // one since the coordinator is effectively done. - int completed_ranks = 1; - while (completed_ranks != size) { - MPI_Status status; - MPI_Probe(MPI_ANY_SOURCE, TAG_NOTIFY, MPI_COMM_WORLD, &status); - - // Find number of characters in message (including zero byte). - int source_rank = status.MPI_SOURCE; - int msg_length; - MPI_Get_count(&status, MPI_BYTE, &msg_length); - - // If the length is zero, this is a DONE message. - if (msg_length == 0) { - completed_ranks++; - MPI_Recv(NULL, 0, MPI_BYTE, source_rank, TAG_NOTIFY, MPI_COMM_WORLD, - &status); - continue; - } - - // Get tensor name from MPI into an std::string. - char* buffer = new char[msg_length]; - MPI_Recv(buffer, msg_length, MPI_BYTE, source_rank, TAG_NOTIFY, - MPI_COMM_WORLD, &status); - std::string received_data(buffer); - delete[] buffer; - - MPIRequest received_message; - received_message.ParseFromString(received_data); - auto received_name = received_message.tensor_name(); - - bool reduce = IncrementTensorCount(mpi_global.message_table, - received_message, size); - if (reduce) { - ready_to_reduce.push_back(received_name); - } - } - - // At this point, rank zero should have a fully updated tensor - // count table and should know all the tensors that need to be - // reduced or gathered, and everyone else should have sent all - // their information to rank zero. We can now do reductions and - // gathers; rank zero will choose which ones and in what order, - // and will notify the other ranks before doing each reduction. 
- for (int i = 0; i < ready_to_reduce.size(); i++) { - // Notify all nodes which tensor we'd like to reduce now - auto name = ready_to_reduce[i]; - MPIResponse response = - ConstructMPIResponse(mpi_global.message_table, name); - - std::string encoded_response; - response.SerializeToString(&encoded_response); - for (int r = 1; r < size; r++) { - MPI_Send(encoded_response.c_str(), encoded_response.length() + 1, - MPI_BYTE, r, TAG_NOTIFY, MPI_COMM_WORLD); - } - - // Perform the reduction. All nodes should end up performing - // the same reduction. - PerformCollectiveOp(mpi_global.tensor_table, response); - } - - // Notify all nodes that we are done with the reductions for this - // tick. - MPIResponse done_response; - should_shut_down = mpi_global.shut_down; - done_response.set_response_type( - mpi_global.shut_down ? MPIResponse::SHUTDOWN : MPIResponse::DONE); - std::string encoded_response; - done_response.SerializeToString(&encoded_response); - for (int r = 1; r < size; r++) { - MPI_Send(encoded_response.c_str(), encoded_response.length() + 1, - MPI_BYTE, r, TAG_NOTIFY, MPI_COMM_WORLD); - } - } else { - // Notify the coordinator that this node is done sending messages. - // A DONE message is encoded as a zero-length message. - MPI_Send(NULL, 0, MPI_BYTE, RANK_ZERO, TAG_NOTIFY, MPI_COMM_WORLD); - - // Receive names for tensors to reduce from rank zero. Once we - // receive a empty DONE message, stop waiting for more names. - while (true) { - MPI_Status status; - MPI_Probe(0, TAG_NOTIFY, MPI_COMM_WORLD, &status); - - // Find number of characters in message (including zero byte). - int msg_length; - MPI_Get_count(&status, MPI_BYTE, &msg_length); - - // Get tensor name from MPI into an std::string. - char* buffer = new char[msg_length]; - MPI_Recv(buffer, msg_length, MPI_BYTE, 0, TAG_NOTIFY, MPI_COMM_WORLD, - &status); - std::string received_message(buffer); - delete[] buffer; - - MPIResponse response; - response.ParseFromString(received_message); - if (response.response_type() == MPIResponse::DONE) { - // No more messages this tick - break; - } else if (response.response_type() == MPIResponse::SHUTDOWN) { - // No more messages this tick, and the background thread - // should shut down - should_shut_down = true; - break; - } else { - // Process the current message - PerformCollectiveOp(mpi_global.tensor_table, response); - } - } - } - } while (!should_shut_down); - - MPI_Finalize(); -} - -// Initialize MPI and start the MPI background thread. Ensure that this is -// only done once no matter how many times this function is called. -Status InitializeMPIOnce(bool gpu) { - // Ensure MPI is only initialized once. - if (mpi_global.initialized_flag.test_and_set()) return mpi_global.init_status; - - mpi_global.device = -1; -#if GOOGLE_CUDA - if (gpu) { - cudaGetDevice(&mpi_global.device); - } -#endif - - // Start the MPI background thread, which assumes MPI is initialized - // TODO: Change this to a Tensorflow thread - mpi_global.background_thread = std::thread(BackgroundThreadLoop); - - // Wait to ensure that the background thread has finished initializing MPI - mutex_lock guard(mpi_global.mu); - mpi_global.cv.wait(guard); - if (!mpi_global.initialization_done) { - mpi_global.init_status = - errors::Unknown("Failed to wait for MPI initialization."); - } - - return mpi_global.init_status; -} - -// Check that MPI is initialized. 
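// A minimal, MPI-free sketch of the initialize-once pattern used above: an
// atomic flag guarantees a single initialization, a background thread does
// the one-time work, and a condition variable lets the caller wait for it.
// All names here are illustrative; this is not the contrib code itself, and
// it waits on a predicate as a standard idiom for this pattern.
#include <atomic>
#include <condition_variable>
#include <cstdio>
#include <mutex>
#include <thread>

struct GlobalStateSketch {
  std::atomic_flag initialized_flag = ATOMIC_FLAG_INIT;
  std::mutex mu;
  std::condition_variable cv;
  bool initialization_done = false;
  std::thread background_thread;
} g_state;

void BackgroundInit() {
  // ... expensive one-time setup would go here (e.g. MPI_Init) ...
  std::lock_guard<std::mutex> guard(g_state.mu);
  g_state.initialization_done = true;
  g_state.cv.notify_all();
}

void InitializeOnce() {
  // test_and_set returns the previous value, so only the first caller
  // proceeds past this line.
  if (g_state.initialized_flag.test_and_set()) return;
  g_state.background_thread = std::thread(BackgroundInit);

  // Wait with a predicate so a missed notification cannot hang the caller.
  std::unique_lock<std::mutex> guard(g_state.mu);
  g_state.cv.wait(guard, [] { return g_state.initialization_done; });
}

int main() {
  InitializeOnce();
  std::printf("initialized\n");
  g_state.background_thread.join();
  return 0;
}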
-Status IsMPIInitialized() { - if (!mpi_global.initialization_done) { - return errors::FailedPrecondition( - "MPI has not been initialized; use tf.contrib.mpi.Session."); - } - return Status::OK(); -} - -// This function (called from the callback set up in MPIAll*Op::ComputeAsync) -// only adds the op's record into the local op queue (to track the op's -// progress), and sends a message to the coordinator indicating that this rank -// is ready to begin. The MPI background thread will handle the MPI message. -void EnqueueTensorCollective(CollectiveOpRecord record, - MPIRequest::RequestType rtype) { - const Tensor* input_tensor = record.in_t; - MPIRequest message; - message.set_request_rank(record.rank); - message.set_tensor_name(record.name); - message.set_tensor_type(record.dtype); - message.set_request_type(rtype); - input_tensor->shape().AsProto(message.mutable_tensor_shape()); - - mutex_lock guard(mpi_global.mu); - mpi_global.tensor_table.emplace(record.name, record); - mpi_global.message_queue.push(message); -} - -} // namespace - -#if GOOGLE_CUDA -cudaStream_t CudaStreamForMPI() { return mpi_global.stream; } -#endif - -// Op to initialize MPI in the current process. The settings used in the -// configuration are the same that must be used for all future MPI ops. -template -class MPIInitOp : public OpKernel { - public: - explicit MPIInitOp(OpKernelConstruction* context) : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - bool on_gpu = IsGPUDevice(); - OP_REQUIRES_OK(context, InitializeMPIOnce(on_gpu)); - } -}; - -REGISTER_KERNEL_BUILDER(Name("MPIInit").Device(DEVICE_CPU), - MPIInitOp); -#if GOOGLE_CUDA -REGISTER_KERNEL_BUILDER(Name("MPIInit").Device(DEVICE_GPU), - MPIInitOp); -#endif - -// Op to get the current MPI Size. -template -class MPISizeOp : public OpKernel { - public: - explicit MPISizeOp(OpKernelConstruction* context) : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - OP_REQUIRES_OK(context, IsMPIInitialized()); - - // Write integer to output tensor - Tensor* output; - OP_REQUIRES_OK(context, - context->allocate_output(0, TensorShape({}), &output)); - - auto flat = output->flat(); - flat(0) = mpi_global.size; - } -}; - -REGISTER_KERNEL_BUILDER(Name("MPISize").Device(DEVICE_CPU), - MPISizeOp); -#if GOOGLE_CUDA -REGISTER_KERNEL_BUILDER(Name("MPISize").Device(DEVICE_GPU).HostMemory("size"), - MPISizeOp); -#endif - -// Op to get the current MPI Rank. -template -class MPIRankOp : public OpKernel { - public: - explicit MPIRankOp(OpKernelConstruction* context) : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - OP_REQUIRES_OK(context, IsMPIInitialized()); - - // Write integer to output tensor - Tensor* output; - OP_REQUIRES_OK(context, - context->allocate_output(0, TensorShape({}), &output)); - - auto flat = output->flat(); - flat(0) = mpi_global.rank; - } -}; - -REGISTER_KERNEL_BUILDER(Name("MPIRank").Device(DEVICE_CPU), - MPIRankOp); -#if GOOGLE_CUDA -REGISTER_KERNEL_BUILDER(Name("MPIRank").Device(DEVICE_GPU).HostMemory("rank"), - MPIRankOp); -#endif - -// Op to get the current local MPI Rank. 
-template -class MPILocalRankOp : public OpKernel { - public: - explicit MPILocalRankOp(OpKernelConstruction* context) : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - OP_REQUIRES_OK(context, IsMPIInitialized()); - - // Write integer to output tensor - Tensor* output; - OP_REQUIRES_OK(context, - context->allocate_output(0, TensorShape({}), &output)); - - auto flat = output->flat(); - flat(0) = mpi_global.local_rank; - } -}; - -REGISTER_KERNEL_BUILDER(Name("MPILocalRank").Device(DEVICE_CPU), - MPILocalRankOp); -#if GOOGLE_CUDA -REGISTER_KERNEL_BUILDER( - Name("MPILocalRank").Device(DEVICE_GPU).HostMemory("rank"), - MPILocalRankOp); -#endif - -template -class MPIAllreduceOp : public AsyncOpKernel { - public: - explicit MPIAllreduceOp(OpKernelConstruction* context) - : AsyncOpKernel(context) {} - - // Although this op is handled asynchronously, the ComputeAsync call is - // very inexpensive. It only sets up a CollectiveOpRecord and places it - // in the table for the background thread to handle. Thus, we do not need - // a TF pool thread to perform the op. - bool IsExpensive() override { return false; } - - void ComputeAsync(OpKernelContext* context, DoneCallback done) override { - OP_REQUIRES_OK_ASYNC(context, IsMPIInitialized(), done); - const Tensor* input_tensor = &context->input(0); - Tensor* output_tensor; - OP_REQUIRES_OK_ASYNC( - context, - context->allocate_output(0, input_tensor->shape(), &output_tensor), - done); - - // Record allocated on stack so op can fail without memory leak - CollectiveOpRecord record; - record.name = name(); - record.context = context; - record.in_t = input_tensor; - record.out_t = output_tensor; - record.on_gpu = IsGPUDevice(); - record.dtype = input_tensor->dtype(); - - const size_t temp_size = - (input_tensor->NumElements() + mpi_global.size - 1) / mpi_global.size; - TensorShape temp_shape; - temp_shape.AddDim(temp_size); - OP_REQUIRES_OK_ASYNC(context, - context->allocate_temp(input_tensor->dtype(), - temp_shape, &record.temp_t), - done); - - auto allreduce_done_callback = [done, context](StatusOr status) { - context->SetStatus(status.status()); - done(); - }; - record.callback = allreduce_done_callback; - - auto allreduce_launch_callback = [record] { - EnqueueTensorCollective(record, MPIRequest::ALLREDUCE); - }; - - // If we are on a CPU, our device context will be null and we can't - // get a stream to enqueue this on. On a CPU this op is called when the - // data is already available, so we can just immediately do the - // allreduce; we don't have to wait for the data to get populated. -#if GOOGLE_CUDA - auto device_context = context->op_device_context(); - if (device_context == nullptr) { - allreduce_launch_callback(); - } else { - auto stream = device_context->stream(); - stream->ThenDoHostCallback(allreduce_launch_callback); - } -#else - allreduce_launch_callback(); -#endif - } -}; - -REGISTER_KERNEL_BUILDER(Name("MPIAllreduce").Device(DEVICE_CPU), - MPIAllreduceOp); -#if GOOGLE_CUDA -REGISTER_KERNEL_BUILDER(Name("MPIAllreduce").Device(DEVICE_GPU), - MPIAllreduceOp); -#endif - -template -class MPIAllgatherOp : public AsyncOpKernel { - public: - explicit MPIAllgatherOp(OpKernelConstruction* context) - : AsyncOpKernel(context) {} - - // Although this op is handled asynchronously, the ComputeAsync call is - // very inexpensive. It only sets up a CollectiveOpRecord and places it - // in the table for the background thread to handle. Thus, we do not need - // a TF pool thread to perform the op. 
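// A small sketch of the "cheap ComputeAsync" pattern shared by the collective
// ops above: the op only enqueues a record plus a completion callback, and a
// background worker performs the heavy work and fires the callback later.
// This is plain C++ with illustrative names; it is not the TensorFlow kernel
// API, and the real ops hand records to the MPI background thread instead.
#include <condition_variable>
#include <cstdio>
#include <functional>
#include <mutex>
#include <queue>
#include <string>
#include <thread>

struct WorkRecord {
  std::string name;
  std::function<void()> done;  // analogous to the op's DoneCallback
};

std::queue<WorkRecord> g_queue;
std::mutex g_mu;
std::condition_variable g_cv;
bool g_shutdown = false;

// Analogous to ComputeAsync: constant-time work on the calling thread.
void EnqueueWork(WorkRecord record) {
  std::lock_guard<std::mutex> guard(g_mu);
  g_queue.push(std::move(record));
  g_cv.notify_one();
}

void WorkerLoop() {
  while (true) {
    WorkRecord record;
    {
      std::unique_lock<std::mutex> guard(g_mu);
      g_cv.wait(guard, [] { return g_shutdown || !g_queue.empty(); });
      if (g_queue.empty()) return;  // shutdown requested, queue drained
      record = std::move(g_queue.front());
      g_queue.pop();
    }
    // ... the actual collective (reduce or gather) would run here ...
    record.done();
  }
}

int main() {
  std::thread worker(WorkerLoop);
  EnqueueWork({"tensor_a", [] { std::printf("tensor_a done\n"); }});
  EnqueueWork({"tensor_b", [] { std::printf("tensor_b done\n"); }});
  {
    std::lock_guard<std::mutex> guard(g_mu);
    g_shutdown = true;
  }
  g_cv.notify_one();
  worker.join();
  return 0;
}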
- bool IsExpensive() override { return false; } - - void ComputeAsync(OpKernelContext* context, DoneCallback done) override { - OP_REQUIRES_OK_ASYNC(context, IsMPIInitialized(), done); - const Tensor* input_tensor = &context->input(0); - const Tensor* sizing_tensor = &context->input(1); - - // Record allocated on stack so op can fail without memory leak - CollectiveOpRecord record; - record.name = name(); - record.context = context; - record.in_t = input_tensor; - record.on_gpu = IsGPUDevice(); - - // Construct the output size from the sizing tensor - size_t output_first_dim = 0; - if (sizing_tensor->shape().dims() == 0) { - // 0-dim sizing_tensor implies that the op is just gathering - // a single element from each rank - output_first_dim = mpi_global.size; - for (int i = 0; i < mpi_global.size; i++) { - record.sizes_vec.push_back(1); - } - } else { - // Collect the total output tensor sizing from the sizing tensor - // NOTE: The sizing tensor is forced to be placed on the CPU by - // declaring the input as HostMemory, so it is valid to read it here. - const int64* sizing_array = - (const int64*)sizing_tensor->tensor_data().data(); - for (int i = 0; i < mpi_global.size; i++) { - record.sizes_vec.push_back(sizing_array[i]); - output_first_dim += sizing_array[i]; - } - } - - TensorShape output_shape; - output_shape.AddDim(output_first_dim); - for (int i = 1; i < input_tensor->shape().dims(); i++) { - output_shape.AddDim(input_tensor->shape().dim_size(i)); - } - - Tensor* output_tensor; - OP_REQUIRES_OK_ASYNC( - context, context->allocate_output(0, output_shape, &output_tensor), - done); - - record.out_t = output_tensor; - record.dtype = input_tensor->dtype(); - - auto allgather_done_callback = [done, context](StatusOr status) { - context->SetStatus(status.status()); - done(); - }; - record.callback = allgather_done_callback; - - auto allgather_launch_callback = [record] { - EnqueueTensorCollective(record, MPIRequest::ALLGATHER); - }; - - // If we are on a CPU, our device context will be null and we can't - // get a stream to enqueue this on. On a CPU this op is called when the - // data is already available, so we can just immediately do the - // allgather; we don't have to wait for the data to get populated. -#if GOOGLE_CUDA - auto device_context = context->op_device_context(); - if (device_context == nullptr) { - allgather_launch_callback(); - } else { - auto stream = device_context->stream(); - stream->ThenDoHostCallback(allgather_launch_callback); - } -#else - allgather_launch_callback(); -#endif - } -}; - -REGISTER_KERNEL_BUILDER( - Name("MPIAllgather").Device(DEVICE_CPU).HostMemory("sizes"), - MPIAllgatherOp); -#if GOOGLE_CUDA -REGISTER_KERNEL_BUILDER( - Name("MPIAllgather").Device(DEVICE_GPU).HostMemory("sizes"), - MPIAllgatherOp); -#endif - -} // namespace mpi_collectives -} // namespace contrib -} // namespace tensorflow - -#endif // TENSORFLOW_USE_MPI diff --git a/tensorflow/contrib/mpi_collectives/kernels/ring.cc b/tensorflow/contrib/mpi_collectives/kernels/ring.cc deleted file mode 100644 index 8970ceb1a20..00000000000 --- a/tensorflow/contrib/mpi_collectives/kernels/ring.cc +++ /dev/null @@ -1,80 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifdef TENSORFLOW_USE_MPI - -#define EIGEN_USE_THREADS - -#include "tensorflow/contrib/mpi_collectives/kernels/ring.h" - -namespace tensorflow { -namespace contrib { -namespace mpi_collectives { - -using CPUDevice = Eigen::ThreadPoolDevice; - -extern template MPI_Datatype MPIType(); -extern template MPI_Datatype MPIType(); -extern template MPI_Datatype MPIType(); -extern template DataType TensorFlowDataType(); -extern template DataType TensorFlowDataType(); -extern template DataType TensorFlowDataType(); - -// Generate all necessary specializations for RingAllreduce. -template Status RingAllreduce(OpKernelContext*, const Tensor*, - Tensor*, Tensor*); -template Status RingAllreduce(OpKernelContext*, - const Tensor*, Tensor*, - Tensor*); -template Status RingAllreduce(OpKernelContext*, const Tensor*, - Tensor*, Tensor*); - -// Generate all necessary specializations for RingAllgather. -template Status RingAllgather(OpKernelContext*, const Tensor*, - const std::vector&, - Tensor*); -template Status RingAllgather(OpKernelContext*, - const Tensor*, - const std::vector&, - Tensor*); -template Status RingAllgather(OpKernelContext*, const Tensor*, - const std::vector&, - Tensor*); - -// Copy data on a CPU using a straight-forward memcpy. -template <> -void CopyTensorData(void* dst, void* src, size_t size) { - std::memcpy(dst, src, size); -}; - -// Accumulate values on a CPU. -#define GENERATE_ACCUMULATE(type) \ - template <> \ - void AccumulateTensorData(type * dst, type * src, \ - size_t size) { \ - for (unsigned int i = 0; i < size; i++) { \ - dst[i] += src[i]; \ - } \ - }; -GENERATE_ACCUMULATE(int); -GENERATE_ACCUMULATE(long long); -GENERATE_ACCUMULATE(float); -#undef GENERATE_ACCUMULATE - -} // namespace mpi_collectives -} // namespace contrib -} // namespace tensorflow - -#endif // TENSORFLOW_USE_MPI diff --git a/tensorflow/contrib/mpi_collectives/kernels/ring.cu.cc b/tensorflow/contrib/mpi_collectives/kernels/ring.cu.cc deleted file mode 100644 index 572e19cb904..00000000000 --- a/tensorflow/contrib/mpi_collectives/kernels/ring.cu.cc +++ /dev/null @@ -1,120 +0,0 @@ -/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#ifdef TENSORFLOW_USE_MPI - -#if GOOGLE_CUDA - -#define EIGEN_USE_GPU - -#include "tensorflow/contrib/mpi_collectives/kernels/ring.h" -#include "tensorflow/core/util/gpu_kernel_helper.h" -#include "tensorflow/core/util/gpu_launch_config.h" - -namespace tensorflow { -namespace contrib { -namespace mpi_collectives { - -using CPUDevice = Eigen::ThreadPoolDevice; - -template <> -MPI_Datatype MPIType() { - return MPI_FLOAT; -}; -template <> -MPI_Datatype MPIType() { - return MPI_INT; -}; -template <> -MPI_Datatype MPIType() { - return MPI_LONG_LONG; -}; - -template <> -DataType TensorFlowDataType() { - return DT_FLOAT; -}; -template <> -DataType TensorFlowDataType() { - return DT_INT32; -}; -template <> -DataType TensorFlowDataType() { - return DT_INT64; -}; - -// Generate all necessary specializations for RingAllreduce. -template Status RingAllreduce(OpKernelContext*, const Tensor*, - Tensor*, Tensor*); -template Status RingAllreduce(OpKernelContext*, - const Tensor*, Tensor*, - Tensor*); -template Status RingAllreduce(OpKernelContext*, const Tensor*, - Tensor*, Tensor*); - -// Generate all necessary specializations for RingAllgather. -template Status RingAllgather(OpKernelContext*, const Tensor*, - const std::vector&, - Tensor*); -template Status RingAllgather(OpKernelContext*, - const Tensor*, - const std::vector&, - Tensor*); -template Status RingAllgather(OpKernelContext*, const Tensor*, - const std::vector&, - Tensor*); - -// Synchronously copy data on the GPU, using a different stream than the default -// and than TensorFlow to avoid synchronizing on operations unrelated to the -// allreduce. -template <> -void CopyTensorData(void* dst, void* src, size_t size) { - auto stream = CudaStreamForMPI(); - cudaMemcpyAsync(dst, src, size, cudaMemcpyDeviceToDevice, stream); - cudaStreamSynchronize(stream); -}; - -// Elementwise accumulation kernel for GPU. -template -__global__ void elemwise_accum(T* out, const T* in, const size_t N) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; - i += blockDim.x * gridDim.x) { - out[i] += in[i]; - } -} - -// Synchronously accumulate tensors on the GPU, using a different stream than -// the default and than TensorFlow to avoid synchronizing on operations -// unrelated to the allreduce. -#define GENERATE_ACCUMULATE(type) \ - template <> \ - void AccumulateTensorData(type * dst, type * src, \ - size_t size) { \ - auto stream = CudaStreamForMPI(); \ - TF_CHECK_OK(GpuLaunchKernel(elemwise_accum, 32, 256, 0, stream, dst, \ - src, size)); \ - cudaStreamSynchronize(stream); \ - }; -GENERATE_ACCUMULATE(int); -GENERATE_ACCUMULATE(long long); -GENERATE_ACCUMULATE(float); -#undef GENERATE_ACCUMULATE - -} // namespace mpi_collectives -} // namespace contrib -} // namespace tensorflow -#endif // GOOGLE_CUDA - -#endif // TENSORFLOW_USE_MPI diff --git a/tensorflow/contrib/mpi_collectives/kernels/ring.h b/tensorflow/contrib/mpi_collectives/kernels/ring.h deleted file mode 100644 index c001615d3ff..00000000000 --- a/tensorflow/contrib/mpi_collectives/kernels/ring.h +++ /dev/null @@ -1,327 +0,0 @@ -/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_CONTRIB_MPI_H_ -#define TENSORFLOW_CONTRIB_MPI_H_ - -#ifdef TENSORFLOW_USE_MPI - -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/shape_inference.h" - -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" -#include "tensorflow/core/framework/tensor_types.h" - -#if GOOGLE_CUDA -#include "cuda_runtime.h" -#endif - -// Needed to avoid header issues with C++-supporting MPI implementations -#define OMPI_SKIP_MPICXX -#include "third_party/mpi/mpi.h" - -#define TAG_TENSOR 12 - -namespace tensorflow { -namespace contrib { -namespace mpi_collectives { - -using CPUDevice = Eigen::ThreadPoolDevice; -using GPUDevice = Eigen::GpuDevice; - -// Convert from templated types to values we can pass to MPI. -template -MPI_Datatype MPIType(); - -// Convert from templated types to TensorFlow data types. -template -DataType TensorFlowDataType(); - -#define MPI_REQUIRES_OK(MPI_STATUS) \ - if ((MPI_STATUS) != MPI_SUCCESS) { \ - return errors::Unknown("MPI operation failed unexpectedly."); \ - } - -// Copy data from one tensor to another tensor. -// This uses a custom CUDA stream on GPU, which is necessary to overlay the -// backpropagation computations with the allreduce. -template -void CopyTensorData(void* destination, void* source, size_t size); - -// Add a tensor into another tensor, accumulating in place. -// This uses a custom CUDA stream on GPU, which is necessary to overlay the -// backpropagation computations with the allreduce. -template -void AccumulateTensorData(T* destination, T* source, size_t size); - -// We need to get the right stream for doing CUDA memory transfers and -// operations, which is possibly different from the standard TensorFlow stream. -#if GOOGLE_CUDA -cudaStream_t CudaStreamForMPI(); -#endif - -/* Perform a ring allreduce on the data. Allocate the necessary output tensor - * and store it in the output parameter. - * - * Assumes that all MPI processes are doing an allreduce of the same tensor, - * with the same dimensions. - * - * A ring allreduce is a bandwidth-optimal way to do an allreduce. To do the - * allreduce, the nodes involved are arranged in a ring: - * - * .--0--. - * / \ - * 3 1 - * \ / - * *--2--* - * - * Each node always sends to the next clockwise node in the ring, and receives - * from the previous one. - * - * The allreduce is done in two parts: a scatter-reduce and an allgather. In - * the scatter reduce, a reduction is done, so that each node ends up with a - * chunk of the final output tensor which has contributions from all other - * nodes. In the allgather, those chunks are distributed among all the nodes, - * so that all nodes have the entire output tensor. - * - * Both of these operations are done by dividing the input tensor into N - * evenly sized chunks (where N is the number of nodes in the ring). - * - * The scatter-reduce is done in N-1 steps. 
In the ith step, node j will send - * the (j - i)th chunk and receive the (j - i - 1)th chunk, adding it in to - * its existing data for that chunk. For example, in the first iteration with - * the ring depicted above, you will have the following transfers: - * - * Segment 0: Node 0 --> Node 1 - * Segment 1: Node 1 --> Node 2 - * Segment 2: Node 2 --> Node 3 - * Segment 3: Node 3 --> Node 0 - * - * In the second iteration, you'll have the following transfers: - * - * Segment 0: Node 1 --> Node 2 - * Segment 1: Node 2 --> Node 3 - * Segment 2: Node 3 --> Node 0 - * Segment 3: Node 0 --> Node 1 - * - * After this iteration, Node 2 has 3 of the four contributions to Segment 0. - * The last iteration has the following transfers: - * - * Segment 0: Node 2 --> Node 3 - * Segment 1: Node 3 --> Node 0 - * Segment 2: Node 0 --> Node 1 - * Segment 3: Node 1 --> Node 2 - * - * After this iteration, Node 3 has the fully accumulated Segment 0; Node 0 - * has the fully accumulated Segment 1; and so on. The scatter-reduce is - * complete. - * - * Next, the allgather distributes these fully accumulated chunks across all - * nodes. Communication proceeds in the same ring, once again in N-1 steps. At - * the ith step, node j will send chunk (j - i + 1) and receive chunk (j - i). - * For example, at the first iteration, the following transfers will occur: - * - * Segment 0: Node 3 --> Node 0 - * Segment 1: Node 0 --> Node 1 - * Segment 2: Node 1 --> Node 2 - * Segment 3: Node 2 --> Node 3 - * - * After the first iteration, Node 0 will have a fully accumulated Segment 0 - * (from Node 3) and Segment 1. In the next iteration, Node 0 will send its - * just-received Segment 0 onward to Node 1, and receive Segment 3 from Node 3. - * After this has continued for N - 1 iterations, all nodes will have a the - * fully accumulated tensor. - * - * Each node will do (N-1) sends for the scatter-reduce and (N-1) sends for the - * allgather. Each send will contain K / N bytes, if there are K bytes in the - * original tensor on every node. Thus, each node sends and receives 2K(N - 1)/N - * bytes of data, and the performance of the allreduce (assuming no latency in - * connections) is constrained by the slowest interconnect between the nodes. 
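// A self-contained MPI demonstration of the ring allreduce just described.
// For simplicity it assumes the element count divides evenly by the number of
// ranks, whereas the kernel that follows spreads the remainder over the first
// segments. As a rough check of the 2K(N-1)/N figure: with K = 1 GB per rank
// and N = 4 ranks, each rank sends (and receives) about 1.5 GB in total.
// Compile with an MPI toolchain, e.g. `mpicxx ring_demo.cc` (name illustrative).
#include <mpi.h>
#include <cstdio>
#include <vector>

void RingAllreduceDemo(std::vector<float>& data) {
  int n, r;
  MPI_Comm_size(MPI_COMM_WORLD, &n);
  MPI_Comm_rank(MPI_COMM_WORLD, &r);

  const int seg = static_cast<int>(data.size()) / n;
  std::vector<float> recv_buf(seg);
  const int send_to = (r + 1) % n;
  const int recv_from = (r - 1 + n) % n;

  // Scatter-reduce: after N-1 steps each rank owns one fully reduced segment.
  for (int i = 0; i < n - 1; ++i) {
    const int send_seg = ((r - i) + n) % n;
    const int recv_seg = ((r - i - 1) + n) % n;
    MPI_Sendrecv(data.data() + send_seg * seg, seg, MPI_FLOAT, send_to, 0,
                 recv_buf.data(), seg, MPI_FLOAT, recv_from, 0,
                 MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    for (int j = 0; j < seg; ++j) data[recv_seg * seg + j] += recv_buf[j];
  }

  // Allgather: circulate the reduced segments so every rank has all of them.
  for (int i = 0; i < n - 1; ++i) {
    const int send_seg = ((r - i + 1) + n) % n;
    const int recv_seg = ((r - i) + n) % n;
    MPI_Sendrecv(data.data() + send_seg * seg, seg, MPI_FLOAT, send_to, 0,
                 data.data() + recv_seg * seg, seg, MPI_FLOAT, recv_from, 0,
                 MPI_COMM_WORLD, MPI_STATUS_IGNORE);
  }
}

int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);
  int n, r;
  MPI_Comm_size(MPI_COMM_WORLD, &n);
  MPI_Comm_rank(MPI_COMM_WORLD, &r);
  // Every element on rank r starts as r+1, so the allreduced value of every
  // element should be n*(n+1)/2.
  std::vector<float> data(8 * n, static_cast<float>(r + 1));
  RingAllreduceDemo(data);
  if (r == 0) std::printf("data[0] after allreduce: %f\n", data[0]);
  MPI_Finalize();
  return 0;
}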
- * - */ -template -Status RingAllreduce(OpKernelContext* context, const Tensor* input, - Tensor* temp, Tensor* output) { - // Acquire MPI size and rank - int n, r; - MPI_REQUIRES_OK(MPI_Comm_size(MPI_COMM_WORLD, &n)); - MPI_REQUIRES_OK(MPI_Comm_rank(MPI_COMM_WORLD, &r)); - - T* buffer = (T*)output->tensor_data().data(); - - CopyTensorData((void*)buffer, (void*)input->tensor_data().data(), - output->tensor_data().size()); - - // Calculate segment sizes and segment ends - const size_t elements_to_reduce = input->NumElements(); - const size_t segment_size = elements_to_reduce / n; - std::vector segment_sizes(n, segment_size); - - const size_t residual = elements_to_reduce % n; - for (size_t i = 0; i < residual; ++i) { - segment_sizes[i]++; - } - - std::vector segment_starts(n); - segment_starts[0] = 0; - for (size_t i = 1; i < segment_starts.size(); ++i) { - segment_starts[i] = segment_starts[i - 1] + segment_sizes[i - 1]; - } - - assert(segment_starts[n - 1] + segment_sizes[n - 1] == elements_to_reduce); - - T* segment_recv = (T*)temp->tensor_data().data(); - - // Receive from your left neighbor with wrap-around - const size_t recv_from = ((r - 1) + n) % n; - - // Send to your right neighbor with wrap-around - const size_t send_to = (r + 1) % n; - - MPI_Status recv_status; - MPI_Request recv_req; - - // Now start ring. At every step, for every rank, we iterate through - // segments with wraparound and send and recv from our neighbors and reduce - // locally. At the i'th iteration, rank r, sends segment (r-i) and receives - // segment (r-i-1). - for (int i = 0; i < n - 1; i++) { - const size_t send_seg_id = ((r - i) + n) % n; - const size_t recv_seg_id = ((r - i - 1) + n) % n; - - T* segment_send = &(buffer[segment_starts[send_seg_id]]); - - MPI_REQUIRES_OK(MPI_Irecv(segment_recv, segment_sizes[recv_seg_id], - MPIType(), recv_from, TAG_TENSOR, - MPI_COMM_WORLD, &recv_req)); - - MPI_REQUIRES_OK(MPI_Send(segment_send, segment_sizes[send_seg_id], - MPIType(), send_to, TAG_TENSOR, - MPI_COMM_WORLD)); - - T* segment_update = &(buffer[segment_starts[recv_seg_id]]); - - // Wait for recv to complete before reduction - MPI_REQUIRES_OK(MPI_Wait(&recv_req, &recv_status)); - - const size_t recv_seg_size = segment_sizes[recv_seg_id]; - AccumulateTensorData(segment_update, segment_recv, - recv_seg_size); - } - - // Now start pipelined ring allgather. At every step, for every rank, we - // iterate through segments with wraparound and send and recv from our - // neighbors. At the i'th iteration, rank r, sends segment (r-i+1) and - // receives segment (r-i). - for (size_t i = 0; i < n - 1; ++i) { - const size_t send_seg_id = ((r - i + 1) + n) % n; - const size_t recv_seg_id = ((r - i) + n) % n; - - // Segment to send - at every iteration we send segment (r-i+1) - T* segment_send = &(buffer[segment_starts[send_seg_id]]); - - // Segment to recv - at every iteration we receive segment (r-i) - T* segment_recv = &(buffer[segment_starts[recv_seg_id]]); - - MPI_REQUIRES_OK(MPI_Sendrecv( - segment_send, segment_sizes[send_seg_id], MPIType(), send_to, - TAG_TENSOR, segment_recv, segment_sizes[recv_seg_id], MPIType(), - recv_from, TAG_TENSOR, MPI_COMM_WORLD, &recv_status)); - } - - return Status::OK(); -} - -// Perform a ring allgather on a Tensor. Other ranks may allgather with a -// tensor which differs in the first dimension only; all other dimensions must -// be the same. -// -// For more information on the ring allgather, read the documentation for the -// ring allreduce, which includes a ring allgather. 
-template -Status RingAllgather(OpKernelContext* context, const Tensor* input, - const std::vector& sizes, Tensor* output) { - // Acquire MPI size and rank - int n, r; - MPI_REQUIRES_OK(MPI_Comm_size(MPI_COMM_WORLD, &n)); - MPI_REQUIRES_OK(MPI_Comm_rank(MPI_COMM_WORLD, &r)); - - assert(sizes.size() == n); - assert(input->dim_size(0) == sizes[r]); - - // Compute number of elements in every "row". We can't compute number of - // elements in every chunks, because those chunks are variable length. - size_t elements_per_row = 1; - for (int i = 1; i < input->shape().dims(); i++) { - elements_per_row *= input->dim_size(i); - } - - // Copy data from input tensor to correct place in output tensor. - std::vector segment_starts(n); - segment_starts[0] = 0; - for (int i = 1; i < n; i++) { - segment_starts[i] = segment_starts[i - 1] + elements_per_row * sizes[i - 1]; - } - size_t offset = segment_starts[r]; - - // Copy data to the right offset for this rank. - T* buffer = (T*)output->tensor_data().data(); - CopyTensorData((void*)(buffer + offset), - (void*)input->tensor_data().data(), - elements_per_row * sizes[r] * sizeof(T)); - - // Receive from your left neighbor with wrap-around - const size_t recv_from = ((r - 1) + n) % n; - - // Send to your right neighbor with wrap-around - const size_t send_to = (r + 1) % n; - - // Perform a ring allgather. At every step, for every rank, we iterate - // through segments with wraparound and send and recv from our neighbors. - // At the i'th iteration, rank r, sends segment (r-i) and receives segment - // (r-1-i). - MPI_Status recv_status; - for (size_t i = 0; i < n - 1; ++i) { - const size_t send_seg_id = ((r - i) + n) % n; - const size_t recv_seg_id = ((r - i - 1) + n) % n; - - // Segment to send - at every iteration we send segment (r-i) - size_t offset_send = segment_starts[send_seg_id]; - size_t rows_send = sizes[send_seg_id]; - T* segment_send = &(buffer[offset_send]); - - // Segment to recv - at every iteration we receive segment (r-1-i) - size_t offset_recv = segment_starts[recv_seg_id]; - size_t rows_recv = sizes[recv_seg_id]; - T* segment_recv = &(buffer[offset_recv]); - - MPI_REQUIRES_OK(MPI_Sendrecv( - segment_send, elements_per_row * rows_send, MPIType(), send_to, - TAG_TENSOR, segment_recv, elements_per_row * rows_recv, MPIType(), - recv_from, TAG_TENSOR, MPI_COMM_WORLD, &recv_status)); - } - - return Status::OK(); -} - -} // namespace mpi_collectives -} // namespace contrib -} // namespace tensorflow - -#endif // TENSORFLOW_USE_MPI - -#undef TENSORFLOW_CONTRIB_MPI_H_ -#endif // TENSORFLOW_CONTRIB_MPI_H_ diff --git a/tensorflow/contrib/mpi_collectives/mpi_allgather_test.py b/tensorflow/contrib/mpi_collectives/mpi_allgather_test.py deleted file mode 100644 index c23dd33d579..00000000000 --- a/tensorflow/contrib/mpi_collectives/mpi_allgather_test.py +++ /dev/null @@ -1,114 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== - - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import numpy as np -import tensorflow as tf -import tensorflow.contrib.mpi_collectives as mpi -from tensorflow.python.platform import test - - -average_allgather = False - - -class AllgatherTest(test.TestCase): - def checkAllgather(self, num_ranks, all_gathered, local_gathered): - # Ensure that indices match. - all_gat_ind = np.sort(all_gathered.indices) - loc_gat_ind = np.sort(local_gathered.indices) - assert(len(loc_gat_ind) == len(all_gat_ind)) - for i in range(len(loc_gat_ind)): - assert(loc_gat_ind[i] == all_gat_ind[i]) - - # For each index, verify same values. - local_checked = [] - for i in range(len(local_gathered.indices)): - local_checked.append(False) - for i in range(len(all_gathered.indices)): - all_index = all_gathered.indices[i] - # TODO(jthestness): Make this lookup quicker using sorting. - loc_index = -1 - for j in range(len(local_gathered.indices)): - if local_gathered.indices[j] == all_index and not local_checked[j]: - loc_index = j - local_checked[j] = True - break - assert(loc_index >= 0) - correct_output = local_gathered.values[loc_index][0] - if average_allgather: - correct_output = correct_output / float(num_ranks) - assert(all_gathered.values[i][0] == correct_output) - - - def test_mpi_allgather(self): - # Get MPI rank - my_rank = int(os.environ['PMI_RANK']) - num_ranks = int(os.environ['PMI_SIZE']) - - indices_per_rank = 100 - tensor_width = 10 - - # Create IndexedSlices for each rank, some with overlapping indices. - to_gather_indices = [] - to_gather_values = [] - to_gather = [] - for rank_id in range(num_ranks): - indices = [] - values = [] - my_multiple = rank_id + 1 - current_index = my_multiple - for i in range(indices_per_rank): - indices.append(current_index) - ones_tensor = tf.ones([tensor_width]) - values.append(tf.multiply(ones_tensor, - tf.fill(ones_tensor.get_shape(), - float(current_index)))) - current_index += my_multiple - concat_ind = tf.stack(indices) - concat_vals = tf.stack(values) - to_gather_indices.append(concat_ind) - to_gather_values.append(concat_vals) - to_gather.append(tf.IndexedSlices(concat_vals, concat_ind)) - - # Collect the local IndexedSlices (indices and values) to create - # correct IndexedSlices output. - correct_gather_indices = tf.concat(to_gather_indices, 0) - correct_gather_values = tf.concat(to_gather_values, 0) - correct_gather = tf.IndexedSlices(correct_gather_values, - correct_gather_indices) - - all_gather = mpi.allreduce(to_gather[my_rank], average_allgather) - - # NOTE: This assumes that device IDs are numbered the same as ranks. - gpu_options = tf.GPUOptions(visible_device_list=str(my_rank)) - config = tf.ConfigProto(gpu_options=gpu_options) - - # MPI Session to test allgather. - with mpi.Session(config=config) as sess: - sess.run(tf.global_variables_initializer()) - - all_gathered, local_gathered = sess.run([all_gather, correct_gather]) - - # Compare all_gathered with local_gathered. - self.checkAllgather(num_ranks, all_gathered, local_gathered) - - -if __name__ == '__main__': - test.main() diff --git a/tensorflow/contrib/mpi_collectives/mpi_allreduce_test.py b/tensorflow/contrib/mpi_collectives/mpi_allreduce_test.py deleted file mode 100644 index 001f9170bc0..00000000000 --- a/tensorflow/contrib/mpi_collectives/mpi_allreduce_test.py +++ /dev/null @@ -1,153 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. 
All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import numpy as np -import tensorflow as tf -import tensorflow.contrib.mpi_collectives as mpi -from tensorflow.python.platform import test - - -average_allreduce = False -max_wrong_count = -1 - - -class AllreduceTest(test.TestCase): - def dumpFailure(self, my_rank, out_loc_red, my_correct, out_all_red, - our_correct): - # Find reduced/allreduced indices that are wrong and print all the - # values from output, slices, reduced, allreduced, so we can debug - # which is incorrect: - wrong_count = 0 - red_dims = out_loc_red.shape - assert(len(red_dims) == 2) - for i in range(red_dims[0]): - for j in range(red_dims[1]): - suffix = "" - if out_loc_red[i][j] != my_correct[i][j] or \ - out_all_red[i][j] != our_correct[i][j]: - suffix = "WRONG" - wrong_count += 1 - print("{}\t{}\t{}\t{}\t{}\t{}" - .format(my_rank, i, j, out_loc_red[i][j], - out_all_red[i][j], suffix), flush=True) - if max_wrong_count > 0 and wrong_count >= max_wrong_count: - return - - def test_mpi_allreduce(self): - # Get MPI rank - my_rank = int(os.environ['PMI_RANK']) - num_ranks = int(os.environ['PMI_SIZE']) - - stages = 13 - batch_size = 1331 - hidden_size = batch_size - out_size = batch_size - - # Input placeholder (batch_size x hidden) - init to 1s - inputs = tf.placeholder(tf.float32, shape=(batch_size, hidden_size), - name="Input") - - # Large matrices (hidden x out_dim) - init random - weights = [] - for i in range(stages): - initer = tf.constant_initializer(pow(2.0, i + 1.0)) - weights.append(tf.get_variable("weights_{}".format(i), - shape=(hidden_size, out_size), - dtype=tf.float32, - initializer=initer)) - - # Calculate output through dependent allreduces - stage_input = inputs - for i in range(stages): - inter_output = tf.add(stage_input, weights[i], - name="add_red_{}".format(i)) - stage_input = mpi.allreduce(inter_output, - average=average_allreduce) - - all_reduced = stage_input - - # Local reduced output for verification - local_input = inputs - for i in range(stages): - inter_output = tf.add(local_input, weights[i], - name="addin_loc_{}".format(i)) - my_reducer = tf.Variable(initial_value=np.ones((hidden_size, out_size)), - dtype=tf.float32, name="loc_redr_{}".format(i)) - for r in range(num_ranks): - my_reducer = tf.add(my_reducer, inter_output, - name="add_loc_{}_{}".format(i, r)) - if average_allreduce: - local_input = tf.div(my_reducer, num_ranks, - name="div_loc_{}".format(i)) - else: - local_input = my_reducer - - local_reduced = local_input - - # NOTE: This assumes that device IDs are numbered the same as ranks - gpu_options = tf.GPUOptions(visible_device_list=str(my_rank)) - config = tf.ConfigProto(gpu_options=gpu_options) - - # MPI Session to test allreduce - with mpi.Session(config=config) as sess: - 
sess.run(tf.global_variables_initializer()) - - input_feed = np.ones((batch_size, hidden_size), dtype=np.float32) - our_output = input_feed[0][0] - spread_var = 100 - input_feed = input_feed + my_rank * spread_var - my_output = input_feed[0][0] - for i in range(stages): - curr_feed = my_output + pow(2.0, i + 1.0) - my_output = curr_feed * num_ranks + 1 - curr_our_feed = our_output + pow(2.0, i + 1.0) - if i == 0: - sum_ranks = num_ranks * (num_ranks - 1) / 2 - our_output = curr_our_feed * num_ranks + \ - spread_var * sum_ranks - else: - our_output = curr_our_feed * num_ranks - - print("rank {}: My output is {}".format(my_rank, my_output)) - my_correct = np.zeros((batch_size, hidden_size), dtype=np.float32) - my_correct = my_correct + my_output - print("rank {}: Our output is {}".format(my_rank, our_output)) - our_correct = np.zeros((batch_size, hidden_size), dtype=np.float32) - our_correct = our_correct + our_output - - for i in range(1000): - if i % 100 == 0: - print("{}: iter {}".format(my_rank, i), flush=True) - feed_dict = {inputs: input_feed} - out_all_red, out_loc_red \ - = sess.run([all_reduced, local_reduced], - feed_dict=feed_dict) - - if not np.allclose(out_loc_red, my_correct) or \ - not np.allclose(out_all_red, our_correct): - print("Test incorrect on iter {}".format(i), flush=True) - self.dumpFailure(my_rank, out_loc_red, my_correct, out_all_red, - our_correct) - assert(np.allclose(out_loc_red, my_correct) and - np.allclose(out_all_red, our_correct)) - - -if __name__ == '__main__': - test.main() diff --git a/tensorflow/contrib/mpi_collectives/mpi_message.proto b/tensorflow/contrib/mpi_collectives/mpi_message.proto deleted file mode 100644 index afbce981ae1..00000000000 --- a/tensorflow/contrib/mpi_collectives/mpi_message.proto +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -syntax = "proto3"; - -package tensorflow.contrib.mpi_collectives; - -import "tensorflow/core/framework/tensor_shape.proto"; -import "tensorflow/core/framework/types.proto"; - -// An MPIRequest is a message sent from a rank greater than zero to the -// coordinator (rank zero), informing the coordinator of an operation that -// the rank wants to do and the tensor that it wants to apply the operation to. -message MPIRequest { - enum RequestType { - ALLREDUCE = 0; - ALLGATHER = 1; - } - - // The request rank is necessary to create a consistent ordering of results, - // for example in the allgather where the order of outputs should be sorted - // by rank. - int32 request_rank = 1; - RequestType request_type = 2; - DataType tensor_type = 3; - string tensor_name = 4; - TensorShapeProto tensor_shape = 5; -}; - -// An MPIResponse is a message sent from the coordinator (rank zero) to a rank -// greater than zero, informing the rank of an operation should be performed -// now. 
If the operation requested would result in an error (for example, due -// to a type or shape mismatch), then the MPIResponse can contain an error and -// an error message instead. Finally, an MPIResponse can be a DONE message (if -// there are no more tensors to reduce on this tick of the background loop) or -// SHUTDOWN if all MPI processes should shut down. -message MPIResponse { - enum ResponseType { - ALLREDUCE = 0; - ALLGATHER = 1; - ERROR = 2; - DONE = 3; - SHUTDOWN = 4; - } - - // Empty if the type is DONE or SHUTDOWN. - ResponseType response_type = 1; - string tensor_name = 2; - - // Empty unless response_type is ERROR. - string error_message = 3; -}; diff --git a/tensorflow/contrib/mpi_collectives/mpi_ops.cc b/tensorflow/contrib/mpi_collectives/mpi_ops.cc deleted file mode 100644 index 475297ca921..00000000000 --- a/tensorflow/contrib/mpi_collectives/mpi_ops.cc +++ /dev/null @@ -1,1236 +0,0 @@ -/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifdef TENSORFLOW_USE_MPI - -#include -#include -#include - -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/types.pb.h" -#include "tensorflow/core/platform/mutex.h" - -#define EIGEN_USE_THREADS - -#if GOOGLE_CUDA -#include -#include "tensorflow/stream_executor/stream.h" -#endif - -#include "tensorflow/stream_executor/lib/statusor.h" - -#define OMPI_SKIP_MPICXX -#include "third_party/mpi/mpi.h" -#include "tensorflow/contrib/mpi_collectives/mpi_message.pb.h" -#include "tensorflow/contrib/mpi_collectives/ring.h" - -/* - * MPI Allreduce and Allgather Ops for TensorFlow. - * - * TensorFlow natively provides inter-device communication through send and - * receive ops and inter-node communication through Distributed TensorFlow, - * based on the same send and receive abstractions. These end up being - * insufficient for synchronous data-parallel training on HPC clusters where - * Infiniband or other high-speed interconnects are available. This module - * implements MPI ops for allgather and allreduce, which do bandwidth-optimal - * gathers and reductions and can take advantage of hardware-optimized - * communication libraries through the MPI implementation. - * - * The primary logic of the allreduce and allgather are in RingAllgather() and - * RingAllreduce(). The background thread which facilitates MPI operations is - * run in BackgroundThreadLoop(). The provided MPI ops are: - * – MPIInit: - * Initialize MPI on a given device (CPU or GPU). - * Should only be run on a single device in every process. - * – MPISize: - * Get the number of MPI processes in the global communicator. - * – MPIRank: - * Get the rank of the current MPI process in the global communicator. - * – MPILocalRank: - * Get the local rank of the current MPI process within its node. 
- * – MPIAllreduce: - * Perform an allreduce on a Tensor, returning the sum - * across all MPI processes in the global communicator. - * – MPIAllgather: - * Perform an allgather on a Tensor, returning the concatenation of - * the tensor on the first dimension across all MPI processes in the - * global communicator. - * - */ - -template -using StatusOr = se::port::StatusOr; - -using CPUDevice = Eigen::ThreadPoolDevice; -using GPUDevice = Eigen::GpuDevice; - -namespace tensorflow { -namespace contrib { -namespace mpi { - -// Make sure template specializations are generated in the ring.cu.cc and the -// ring.cc file, not in this file. -extern template Status RingAllreduce(OpKernelContext*, - const Tensor*, Tensor*, - Tensor*); -extern template Status RingAllreduce(OpKernelContext*, - const Tensor*, - Tensor*, Tensor*); -extern template Status RingAllreduce(OpKernelContext*, - const Tensor*, Tensor*, - Tensor*); -extern template Status RingAllgather(OpKernelContext*, - const Tensor*, - const std::vector&, - Tensor*); -extern template Status RingAllgather( - OpKernelContext*, const Tensor*, const std::vector&, Tensor*); -extern template Status RingAllgather( - OpKernelContext*, const Tensor*, const std::vector&, Tensor*); -extern template Status RingAllreduce(OpKernelContext*, - const Tensor*, Tensor*, - Tensor*); -extern template Status RingAllreduce(OpKernelContext*, - const Tensor*, - Tensor*, Tensor*); -extern template Status RingAllreduce(OpKernelContext*, - const Tensor*, Tensor*, - Tensor*); -extern template Status RingAllgather(OpKernelContext*, - const Tensor*, - const std::vector&, - Tensor*); -extern template Status RingAllgather( - OpKernelContext*, const Tensor*, const std::vector&, Tensor*); -extern template Status RingAllgather( - OpKernelContext*, const Tensor*, const std::vector&, Tensor*); - -namespace { - -// Return true if the templated type is GPUDevice, otherwise false. -template -bool IsGPUDevice(); -template <> -bool IsGPUDevice() { - return true; -}; -template <> -bool IsGPUDevice() { - return false; -}; - -// A callback to call after the MPI communication completes. Since the -// allreduce and allgather ops are asynchronous, this callback is what resumes -// computation after the reduction is completed. -typedef std::function)> CommunicationDoneCallback; - -struct CollectiveOpRecord { - // The rank performing this piece of the op - int rank; - - // The name of the op/tensor to be reduced - std::string name; - - // The op's kernel context - OpKernelContext* context; - - // Data type of the op - DataType dtype; - - // The input tensor - const Tensor* in_t; - - // Allgather: Vector of per-rank first-dimension sizes - std::vector sizes_vec; - - // The temp tensor for intermediate results - Tensor temp_t; - - // The output tensor - Tensor* out_t; - - // Whether to run this op on the gpu - bool on_gpu; - - // The callback to call after the op has completed - CommunicationDoneCallback callback; -}; - -// Table storing Tensors to be reduced, keyed by unique name. -// This table contains everything necessary to do the reduction -typedef std::unordered_map TensorTable; - -// Table for storing Tensor metadata on rank zero. This is used for error -// checking and size calculations, as well as determining when a reduction is -// ready to be done (when all nodes are ready to do it). -typedef std::unordered_map > MessageTable; - -// The global state required for the MPI ops. 
-// -// MPI is a library that stores a lot of global per-program state and often -// requires running on a single thread. As a result, we have to have a single -// background thread responsible for all MPI operations, and communicate with -// that background thread through global state. -struct MPIGlobalState { - // An atomic boolean which is set to true when MPI is initialized. - // This ensures that MPI_Init is never called twice. - std::atomic_flag initialized_flag = ATOMIC_FLAG_INIT; - - // Condition variable to wait for initialization - condition_variable cv; - - // Whether MPI_Init has been completed on the background thread. - bool initialization_done = false; - - // Whether MPI_Init succeeded on the background thread. - Status init_status; - - // A mutex that needs to be used whenever MPI operations touch - // shared structures. - mutex mu; - - // Tensors waiting to be allreduced or allgathered. - TensorTable tensor_table; - - // Queue of MPI requests waiting to be sent to the coordinator node. - std::queue message_queue; - - // Background thread running MPI communication. - std::thread background_thread; - - // Whether the background thread should shutdown. - bool shut_down = false; - - // Only exists on the coordinator node (rank zero). Maintains a count of - // how many nodes are ready to allreduce every tensor (keyed by tensor - // name). - std::unique_ptr message_table; - - // The MPI rank, local rank, and size. - int rank = 0; - int local_rank = 0; - int size = 1; - - // The device that MPI was initialized on. (-1 for no GPU) - int device = -1; - - // The CUDA stream used for data transfers and within-allreduce operations. - // A naive implementation would use the TensorFlow StreamExecutor CUDA - // stream. However, the allreduce and allgather require doing memory copies - // and kernel executions (for accumulation of values on the GPU). However, - // the subsequent operations must wait for those operations to complete, - // otherwise MPI (which uses its own stream internally) will begin the data - // transfers before the CUDA calls are complete. In order to wait for those - // CUDA operations, if we were using the TensorFlow stream, we would have - // to synchronize that stream; however, other TensorFlow threads may be - // submitting more work to that stream, so synchronizing on it can cause - // the allreduce to be delayed, waiting for compute totally unrelated to it - // in other parts of the graph. Overlaying memory transfers and compute - // during backpropagation is crucial for good performance, so we cannot use - // the TensorFlow stream, and must use our own stream. -#if GOOGLE_CUDA - cudaStream_t stream; - std::atomic_flag stream_created_flag = ATOMIC_FLAG_INIT; -#endif - - ~MPIGlobalState() { - // Make sure that the destructor of the background thread is safe to - // call. If a thread is still joinable (not detached or complete) its - // destructor cannot be called. - if (background_thread.joinable()) { - shut_down = true; - background_thread.join(); - } - } -}; - -// All the MPI state that must be stored globally per-process. -static MPIGlobalState mpi_global; - -// For clarify in argument lists. -#define RANK_ZERO 0 - -// A tag used for all coordinator messaging. -#define TAG_NOTIFY 1 - -// Store the MPIRequest for a name, and return whether the total count of -// MPIRequests for that tensor is now equal to the MPI size (and thus we are -// ready to reduce the tensor). 
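[Editor's note] The comment above describes the bookkeeping that IncrementTensorCount performs on the coordinator: requests are grouped by tensor name, and a tensor becomes ready once every rank has submitted a request for it. Before the C++ implementation that follows, here is a minimal, MPI-free Python sketch of that idea; the names are illustrative and are not part of the deleted sources.

from collections import defaultdict

def increment_tensor_count(message_table, request, mpi_size):
    """Record one rank's request for a tensor; return True once every rank
    has asked for it and the tensor is therefore ready to reduce."""
    message_table[request["tensor_name"]].append(request)
    return len(message_table[request["tensor_name"]]) == mpi_size

table = defaultdict(list)
for rank in range(3):
    ready = increment_tensor_count(
        table, {"tensor_name": "grad/layer0", "request_rank": rank}, mpi_size=3)
print(ready)  # True: the third and final rank has just announced the tensor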
-bool IncrementTensorCount(std::unique_ptr& message_table, - MPIRequest msg, int mpi_size) { - auto name = msg.tensor_name(); - auto table_iter = message_table->find(name); - if (table_iter == message_table->end()) { - message_table->emplace(name, std::vector({msg})); - table_iter = message_table->find(name); - } else { - table_iter->second.push_back(msg); - } - - int count = table_iter->second.size(); - return count == mpi_size; -} - -// Once a tensor is ready to be reduced, the coordinator sends an MPIResponse -// instructing all ranks to start the reduction to all ranks. The MPIResponse -// also contains error messages in case the submitted MPIRequests were not -// valid (for example, contained mismatched shapes or types). -// -// Constructing the MPIResponse, thus, requires a whole lot of error checking. -MPIResponse ConstructMPIResponse(std::unique_ptr& message_table, - std::string name) { - bool error = false; - auto it = message_table->find(name); - assert(it != message_table->end()); - - std::vector requests = it->second; - assert(requests.size() > 0); - - std::ostringstream error_message_stream; - - // Check that all data types being reduced or gathered are identical - auto data_type = requests[0].tensor_type(); - for (unsigned int i = 1; i < requests.size(); i++) { - auto request_type = requests[i].tensor_type(); - if (data_type != request_type) { - error = true; - error_message_stream << "Mismatched data types: One rank had type " - << DataType_Name(data_type) - << ", but another rank had type " - << DataType_Name(request_type) << "."; - break; - } - } - - // Check that all requested operations are the same - auto message_type = requests[0].request_type(); - for (unsigned int i = 1; i < requests.size(); i++) { - if (error) { - break; - } - - auto request_type = requests[i].request_type(); - if (message_type != request_type) { - error = true; - error_message_stream << "Mismatched MPI operations: One rank did an " - << message_type << ", but another rank did an " - << request_type << "."; - break; - } - } - - // If we are doing an allreduce, check that all tensor shapes - // are identical - if (message_type == MPIRequest::ALLREDUCE) { - TensorShape tensor_shape = requests[0].tensor_shape(); - for (unsigned int i = 1; i < requests.size(); i++) { - if (error) { - break; - } - - TensorShape request_shape = requests[i].tensor_shape(); - if (tensor_shape != request_shape) { - error = true; - error_message_stream << "Mismatched allreduce tensor shapes: " - << "One rank reduced a tensor of shape " - << tensor_shape.DebugString() - << ", but another rank sent a tensor of shape " - << request_shape.DebugString() << "."; - break; - } - } - } - - // If we are doing an allgather, make sure all but the first dimension are - // the same. The first dimension may be different and the output tensor is - // the sum of the first dimension. Collect the sizes by rank. 
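[Editor's note] As the comment above notes, an allgather only requires the non-leading dimensions to agree across ranks, whereas an allreduce requires identical shapes. A small illustrative Python check of the allgather rule, under the same assumptions as the shape validation that follows (helper name is made up, not from the deleted file):

def shapes_compatible_for_allgather(shapes):
    """Every rank must send a tensor of the same rank whose dimensions agree
    everywhere except dimension 0, which may differ; the gathered result's
    first dimension is then the sum of the per-rank first dimensions."""
    first = shapes[0]
    return all(len(s) == len(first) and s[1:] == first[1:] for s in shapes[1:])

print(shapes_compatible_for_allgather([(4, 17, 3), (2, 17, 3)]))  # True
print(shapes_compatible_for_allgather([(4, 17, 3), (4, 16, 3)]))  # False
print(sum(s[0] for s in [(4, 17, 3), (2, 17, 3)]))                # 6 rows gathered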
- if (message_type == MPIRequest::ALLGATHER) { - TensorShape tensor_shape = requests[0].tensor_shape(); - - if (tensor_shape.dims() == 0) { - error = true; - error_message_stream << "Rank zero tried to gather a rank-zero tensor."; - } - - for (unsigned int i = 1; i < requests.size(); i++) { - if (error) { - break; - } - - TensorShape request_shape = requests[i].tensor_shape(); - if (tensor_shape.dims() != request_shape.dims()) { - error = true; - error_message_stream << "Mismatched allgather tensor shapes: " - << "One rank gathered a tensor of rank " - << tensor_shape.dims() - << ", but another rank sent a tensor of rank " - << request_shape.dims() << "."; - break; - } - - for (unsigned int dim = 1; dim < tensor_shape.dims(); dim++) { - if (tensor_shape.dim_size(dim) != request_shape.dim_size(dim)) { - error = true; - error_message_stream - << "Mismatched allgather tensor shapes: " - << "One rank gathered a tensor with dimension " << dim - << " equal to " << tensor_shape.dim_size(dim) - << ", but another rank sent a tensor with dimension " << dim - << " equal to " << request_shape.dim_size(dim) << "."; - break; - } - } - } - } - - MPIResponse response; - response.set_tensor_name(name); - if (error) { - std::string error_message = error_message_stream.str(); - response.set_response_type(MPIResponse::ERROR); - response.set_error_message(error_message); - } else { - auto response_type = MPIResponse::ERROR; - if (message_type == MPIRequest::ALLREDUCE) { - response_type = MPIResponse::ALLREDUCE; - } else { - response_type = MPIResponse::ALLGATHER; - } - response.set_response_type(response_type); - } - - // Clear all queued up requests for this name. They are now taken care of - // by the constructed MPI response. - message_table->erase(it); - - return response; -} - -// Process an MPIResponse by doing a reduction, a gather, or raising an error. -void PerformCollectiveOp(TensorTable& tensor_table, MPIResponse response) { - OpKernelContext* context; - const Tensor* input_tensor; - std::vector sizes_vec; - Tensor temp_tensor; - Tensor* output_tensor; - CommunicationDoneCallback callback; - bool on_gpu; - { - // Lock on the tensor table. - mutex_lock guard(mpi_global.mu); - - // We should never fail at finding this key in the tensor table. - auto name = response.tensor_name(); - auto iter = tensor_table.find(name); - assert(iter != tensor_table.end()); - - assert(response.response_type() == MPIResponse::ALLREDUCE || - response.response_type() == MPIResponse::ALLGATHER || - response.response_type() == MPIResponse::ERROR); - - CollectiveOpRecord record = iter->second; - context = record.context; - input_tensor = record.in_t; - sizes_vec = record.sizes_vec; - temp_tensor = record.temp_t; - output_tensor = record.out_t; - on_gpu = record.on_gpu; - callback = record.callback; - - // Clear the tensor table of this tensor and its callbacks; the rest of - // this function takes care of it. - tensor_table.erase(iter); - } - - // Use CPUDevice instead of GPUDevice if no CUDA, to ensure we don't - // link to non-existent symbols. -#if GOOGLE_CUDA -#define GPU_DEVICE_IF_CUDA GPUDevice -#else -#define GPU_DEVICE_IF_CUDA CPUDevice -#endif - - Status status; - auto dtype = input_tensor->dtype(); - if (response.response_type() == MPIResponse::ALLGATHER) { - if (dtype == DT_FLOAT) { - status = on_gpu ? RingAllgather( - context, input_tensor, sizes_vec, output_tensor) - : RingAllgather( - context, input_tensor, sizes_vec, output_tensor); - } else if (dtype == DT_INT32) { - status = on_gpu ? 
RingAllgather( - context, input_tensor, sizes_vec, output_tensor) - : RingAllgather(context, input_tensor, - sizes_vec, output_tensor); - } else if (dtype == DT_INT64) { - status = on_gpu ? RingAllgather( - context, input_tensor, sizes_vec, output_tensor) - : RingAllgather( - context, input_tensor, sizes_vec, output_tensor); - } else { - status = errors::Unknown("Invalid tensor type for MPI allgather."); - } - } else if (response.response_type() == MPIResponse::ALLREDUCE) { - if (dtype == DT_FLOAT) { - status = on_gpu ? RingAllreduce( - context, input_tensor, &temp_tensor, output_tensor) - : RingAllreduce( - context, input_tensor, &temp_tensor, output_tensor); - } else if (dtype == DT_INT32) { - status = on_gpu ? RingAllreduce( - context, input_tensor, &temp_tensor, output_tensor) - : RingAllreduce( - context, input_tensor, &temp_tensor, output_tensor); - } else if (dtype == DT_INT64) { - status = on_gpu ? RingAllreduce( - context, input_tensor, &temp_tensor, output_tensor) - : RingAllreduce( - context, input_tensor, &temp_tensor, output_tensor); - } else { - status = errors::Unknown("Invalid tensor type for MPI allreduce."); - } - } else if (response.response_type() == MPIResponse::ERROR) { - status = errors::FailedPrecondition(response.error_message()); - } - - if (status.ok()) { - callback(StatusOr(*output_tensor)); - } else { - callback(StatusOr(status)); - } -} - -// The MPI background thread loop coordinates all the MPI processes and the -// tensor reductions. The design of the communicator mechanism is limited by a -// few considerations: -// -// 1. Some MPI implementations require all MPI calls to happen from a -// single thread. Since TensorFlow may use several threads for graph -// processing, this means we must have our own dedicated thread for -// dealing with MPI. -// 2. We want to gracefully handle errors, when MPI processes do not -// properly agree upon what should happen (such as mismatched types or -// shapes). To do so requires the MPI processes to know about the shapes -// and types of the relevant tensors on the other processes. -// 3. The MPI reductions and gathers should be able to happen in parallel -// with other ongoing operations. Since MPI uses an internal -// (inaccessible) GPU stream separate from the TF GPUDevice streams, we -// cannot explicitly synchronize memcpys or kernels with it. As a result, -// MPIAllreduce and MPIAllgather must be AsyncOpKernels to ensure proper -// ordering of memcpys and kernels with respect to TF streams. -// 4. NOTE: We cannot guarantee that all the MPI processes reduce their -// tensors in the same order. Thus, there must be a way to ensure the -// reduction memcpys and kernels occur for correct tensors across all -// ranks at the same time. We choose to use a coordinator (rank ID 0) to -// gather and trigger the reduction operations that are ready to execute. -// -// The coordinator currently follows a master-worker paradigm. Rank zero acts -// as the master (the "coordinator"), whereas all other ranks are simply -// workers. Each rank runs its own background thread which progresses in ticks. -// In each tick, the following actions happen: -// -// a) The workers send any available MPIRequests to the coordinator. These -// MPIRequests indicate what the worker would like to do (i.e. which -// tensor they would like to gather or reduce, as well as their shape and -// type). They repeat this for every tensor that they would like to -// operate on after that tensor's collective op has executed ComputeAsync. 
-// -// b) The workers send an empty "DONE" message to the coordinator to -// indicate that there are no more tensors they wish to operate on. -// -// c) The coordinator receives the MPIRequests from the workers, as well -// as from its own TensorFlow ops, and stores them in a request table. The -// coordinator continues to receive MPIRequest messages until it has -// received MPI_SIZE number of empty "DONE" messages. -// -// d) The coordinator finds all tensors that are ready to be reduced, -// gathered, or all operations that result in an error. For each of those, -// it sends an MPIResponse to all the workers. When no more MPIResponses -// are available, it sends a "DONE" response to the workers. If the -// process is being shutdown, it instead sends a "SHUTDOWN" response. -// -// e) The workers listen for MPIResponse messages, processing each one by -// doing the required reduce or gather, until they receive a "DONE" -// response from the coordinator. At that point, the tick ends. -// If instead of "DONE" they receive "SHUTDOWN", they exit their -// background loop. -// TODO: Use the global mpi_global state variable instead of a local one -void BackgroundThreadLoop() { -#if GOOGLE_CUDA - // Set the device, so that this thread uses the same GPU context as the - // calling thread. - // TODO: Ensure that this is operating correctly. The background thread - // needs to be able to control all GPUs that the rank has access to, and - // might be more than 1 GPU. Tensors could be resident in any of the - // GPUs, so the background thread's accumulate and copy kernels might need - // to correctly set the device and it might be necessary for the background - // thread to manage multiple streams. - cudaSetDevice(mpi_global.device); - cudaStreamCreate(&mpi_global.stream); -#endif - - // Initialize MPI. This must happen on the background thread, since not all - // MPI implementations support being called from multiple threads. - auto init_result = MPI_Init(NULL, NULL); - if (init_result != MPI_SUCCESS) { - mpi_global.init_status = - errors::Unknown("Could not initialize MPI; MPI_Init() failed."); - mpi_global.initialization_done = true; - mpi_global.cv.notify_all(); - return; - } else { - mpi_global.init_status = Status::OK(); - } - - // Get MPI rank to determine if we are rank zero. - int rank; - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - bool is_coordinator = rank == 0; - - // Get MPI size to determine how many tensors to wait for before reducing. - int size; - MPI_Comm_size(MPI_COMM_WORLD, &size); - - // Determine local rank by querying the local communicator. - MPI_Comm local_comm; - MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, - &local_comm); - int local_rank; - MPI_Comm_rank(local_comm, &local_rank); - - mpi_global.rank = rank; - mpi_global.local_rank = local_rank; - mpi_global.size = size; - mpi_global.initialization_done = true; - - // Notify calling thread that initialization is complete - mpi_global.cv.notify_all(); - - // TODO: MOVE MESSAGE TABLE INITIALIZATION TO LIBRARY LOAD! - // Initialize the tensor count table. No tensors are available yet. - if (is_coordinator) { - mpi_global.message_table = - std::unique_ptr(new MessageTable()); - } - - // The coordinator sends a SHUTDOWN message to trigger shutdown. - bool should_shut_down = false; - do { - // TODO: Eliminate the need for thread sleep by making all activity - // depend on other activity (e.g. condition or MPI waits). 
- std::this_thread::sleep_for(std::chrono::milliseconds(1)); - - // Copy the data structures from global state under this lock. - // However, don't keep the lock for the rest of the loop, so that - // enqueued stream callbacks can continue. - std::queue message_queue; - { - mutex_lock guard(mpi_global.mu); - while (!mpi_global.message_queue.empty()) { - MPIRequest message = mpi_global.message_queue.front(); - mpi_global.message_queue.pop(); - message_queue.push(message); - } - } - - // Collect all tensors that are ready to be reduced. Record them in the - // tensor count table (rank zero) or send them to rank zero to be - // recorded (everyone else). - std::vector ready_to_reduce; - while (!message_queue.empty()) { - // Pop the first available message message - MPIRequest message = message_queue.front(); - message_queue.pop(); - - if (is_coordinator) { - bool reduce = - IncrementTensorCount(mpi_global.message_table, message, size); - if (reduce) { - ready_to_reduce.push_back(message.tensor_name()); - } - } else { - std::string encoded_message; - message.SerializeToString(&encoded_message); - MPI_Send(encoded_message.c_str(), encoded_message.length() + 1, - MPI_BYTE, RANK_ZERO, TAG_NOTIFY, MPI_COMM_WORLD); - } - } - - // Rank zero has put all its own tensors in the tensor count table. - // Now, it should count all the tensors that are coming from other - // ranks at this tick. It should keep getting tensors until it gets a - // DONE message from all the other ranks. - if (is_coordinator) { - // Count of DONE messages. Keep receiving messages until the number - // of messages is equal to the number of processes. Initialize to - // one since the coordinator is effectively done. - int completed_ranks = 1; - while (completed_ranks != size) { - MPI_Status status; - MPI_Probe(MPI_ANY_SOURCE, TAG_NOTIFY, MPI_COMM_WORLD, &status); - - // Find number of characters in message (including zero byte). - int source_rank = status.MPI_SOURCE; - int msg_length; - MPI_Get_count(&status, MPI_BYTE, &msg_length); - - // If the length is zero, this is a DONE message. - if (msg_length == 0) { - completed_ranks++; - MPI_Recv(NULL, 0, MPI_BYTE, source_rank, TAG_NOTIFY, MPI_COMM_WORLD, - &status); - continue; - } - - // Get tensor name from MPI into an std::string. - char* buffer = new char[msg_length]; - MPI_Recv(buffer, msg_length, MPI_BYTE, source_rank, TAG_NOTIFY, - MPI_COMM_WORLD, &status); - std::string received_data(buffer); - delete[] buffer; - - MPIRequest received_message; - received_message.ParseFromString(received_data); - auto received_name = received_message.tensor_name(); - - bool reduce = IncrementTensorCount(mpi_global.message_table, - received_message, size); - if (reduce) { - ready_to_reduce.push_back(received_name); - } - } - - // At this point, rank zero should have a fully updated tensor - // count table and should know all the tensors that need to be - // reduced or gathered, and everyone else should have sent all - // their information to rank zero. We can now do reductions and - // gathers; rank zero will choose which ones and in what order, - // and will notify the other ranks before doing each reduction. 
- for (int i = 0; i < ready_to_reduce.size(); i++) { - // Notify all nodes which tensor we'd like to reduce now - auto name = ready_to_reduce[i]; - MPIResponse response = - ConstructMPIResponse(mpi_global.message_table, name); - - std::string encoded_response; - response.SerializeToString(&encoded_response); - for (int r = 1; r < size; r++) { - MPI_Send(encoded_response.c_str(), encoded_response.length() + 1, - MPI_BYTE, r, TAG_NOTIFY, MPI_COMM_WORLD); - } - - // Perform the reduction. All nodes should end up performing - // the same reduction. - PerformCollectiveOp(mpi_global.tensor_table, response); - } - - // Notify all nodes that we are done with the reductions for this - // tick. - MPIResponse done_response; - should_shut_down = mpi_global.shut_down; - done_response.set_response_type( - mpi_global.shut_down ? MPIResponse::SHUTDOWN : MPIResponse::DONE); - std::string encoded_response; - done_response.SerializeToString(&encoded_response); - for (int r = 1; r < size; r++) { - MPI_Send(encoded_response.c_str(), encoded_response.length() + 1, - MPI_BYTE, r, TAG_NOTIFY, MPI_COMM_WORLD); - } - } else { - // Notify the coordinator that this node is done sending messages. - // A DONE message is encoded as a zero-length message. - MPI_Send(NULL, 0, MPI_BYTE, RANK_ZERO, TAG_NOTIFY, MPI_COMM_WORLD); - - // Receive names for tensors to reduce from rank zero. Once we - // receive a empty DONE message, stop waiting for more names. - while (true) { - MPI_Status status; - MPI_Probe(0, TAG_NOTIFY, MPI_COMM_WORLD, &status); - - // Find number of characters in message (including zero byte). - int msg_length; - MPI_Get_count(&status, MPI_BYTE, &msg_length); - - // Get tensor name from MPI into an std::string. - char* buffer = new char[msg_length]; - MPI_Recv(buffer, msg_length, MPI_BYTE, 0, TAG_NOTIFY, MPI_COMM_WORLD, - &status); - std::string received_message(buffer); - delete[] buffer; - - MPIResponse response; - response.ParseFromString(received_message); - if (response.response_type() == MPIResponse::DONE) { - // No more messages this tick - break; - } else if (response.response_type() == MPIResponse::SHUTDOWN) { - // No more messages this tick, and the background thread - // should shut down - should_shut_down = true; - break; - } else { - // Process the current message - PerformCollectiveOp(mpi_global.tensor_table, response); - } - } - } - } while (!should_shut_down); - - MPI_Finalize(); -} - -// Initialize MPI and start the MPI background thread. Ensure that this is -// only done once no matter how many times this function is called. -Status InitializeMPIOnce(bool gpu) { - // Ensure MPI is only initialized once. - if (mpi_global.initialized_flag.test_and_set()) return mpi_global.init_status; - - mpi_global.device = -1; -#if GOOGLE_CUDA - if (gpu) { - cudaGetDevice(&mpi_global.device); - } -#endif - - // Start the MPI background thread, which assumes MPI is initialized - // TODO: Change this to a Tensorflow thread - mpi_global.background_thread = std::thread(BackgroundThreadLoop); - - // Wait to ensure that the background thread has finished initializing MPI - mutex_lock guard(mpi_global.mu); - mpi_global.cv.wait(guard); - if (!mpi_global.initialization_done) { - mpi_global.init_status = - errors::Unknown("Failed to wait for MPI initialization."); - } - - return mpi_global.init_status; -} - -// Check that MPI is initialized. 
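[Editor's note] The background-thread loop above realizes the coordinator protocol laid out in the earlier comment block: workers send requests plus a DONE marker, and rank zero answers with a response for every tensor that all ranks have announced, followed by DONE or SHUTDOWN. The following single-process Python sketch condenses one tick of that exchange with plain lists in place of MPI messages; it is an illustration of the control flow only, not a reimplementation of the deleted op.

from collections import defaultdict

def coordinator_tick(per_rank_requests, mpi_size, shutting_down=False):
    """One tick at rank zero: count the requests received from every rank and
    answer with a response per tensor that all ranks announced, then DONE or
    SHUTDOWN, mirroring steps (c) and (d) of the protocol comment above."""
    counts = defaultdict(int)
    for requests in per_rank_requests:
        for name in requests:
            counts[name] += 1
    responses = [("ALLREDUCE", name) for name, c in counts.items() if c == mpi_size]
    responses.append(("SHUTDOWN" if shutting_down else "DONE", None))
    return responses

# Only "a" was announced by all three ranks this tick, so only "a" is reduced.
print(coordinator_tick([["a", "b"], ["a", "b"], ["a"]], mpi_size=3))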
-Status IsMPIInitialized() { - if (!mpi_global.initialization_done) { - return errors::FailedPrecondition( - "MPI has not been initialized; use tf.contrib.mpi.Session."); - } - return Status::OK(); -} - -// This function (called from the callback set up in MPIAll*Op::ComputeAsync) -// only adds the op's record into the local op queue (to track the op's -// progress), and sends a message to the coordinator indicating that this rank -// is ready to begin. The MPI background thread will handle the MPI message. -void EnqueueTensorCollective(CollectiveOpRecord record, - MPIRequest::RequestType rtype) { - const Tensor* input_tensor = record.in_t; - MPIRequest message; - message.set_request_rank(record.rank); - message.set_tensor_name(record.name); - message.set_tensor_type(record.dtype); - message.set_request_type(rtype); - input_tensor->shape().AsProto(message.mutable_tensor_shape()); - - mutex_lock guard(mpi_global.mu); - mpi_global.tensor_table.emplace(record.name, record); - mpi_global.message_queue.push(message); -} - -} // namespace - -#if GOOGLE_CUDA -cudaStream_t CudaStreamForMPI() { return mpi_global.stream; } -#endif - -// Op to initialize MPI in the current process. The settings used in the -// configuration are the same that must be used for all future MPI ops. -template -class MPIInitOp : public OpKernel { - public: - explicit MPIInitOp(OpKernelConstruction* context) : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - bool on_gpu = IsGPUDevice(); - OP_REQUIRES_OK(context, InitializeMPIOnce(on_gpu)); - } -}; - -REGISTER_KERNEL_BUILDER(Name("MPIInit").Device(DEVICE_CPU), - MPIInitOp); -#if GOOGLE_CUDA -REGISTER_KERNEL_BUILDER(Name("MPIInit").Device(DEVICE_GPU), - MPIInitOp); -#endif - -REGISTER_OP("MPIInit").Doc(R"doc( -Initialize MPI for the current process. - -If this is run on a GPU, then that GPU must be used for all future MPI -operations. If it is run on CPU, then all future MPI operations must also -run on CPU. -)doc"); - -// Op to get the current MPI Size. -template -class MPISizeOp : public OpKernel { - public: - explicit MPISizeOp(OpKernelConstruction* context) : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - OP_REQUIRES_OK(context, IsMPIInitialized()); - - // Write integer to output tensor - Tensor* output; - OP_REQUIRES_OK(context, - context->allocate_output(0, TensorShape({}), &output)); - - auto flat = output->flat(); - flat(0) = mpi_global.size; - } -}; - -REGISTER_KERNEL_BUILDER(Name("MPISize").Device(DEVICE_CPU), - MPISizeOp); -#if GOOGLE_CUDA -REGISTER_KERNEL_BUILDER(Name("MPISize").Device(DEVICE_GPU).HostMemory("size"), - MPISizeOp); -#endif - -REGISTER_OP("MPISize") - .Output("size: int32") - .SetShapeFn([](shape_inference::InferenceContext* c) { - c->set_output(0, c->Scalar()); - return Status::OK(); - }) - .Doc(R"doc( -Returns the number of running MPI processes. - -More precisely, returns the number of MPI processes in the group associated -with the MPI_COMM_WORLD communicator. - -size: Size of the MPI group. -)doc"); - -// Op to get the current MPI Rank. 
-template -class MPIRankOp : public OpKernel { - public: - explicit MPIRankOp(OpKernelConstruction* context) : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - OP_REQUIRES_OK(context, IsMPIInitialized()); - - // Write integer to output tensor - Tensor* output; - OP_REQUIRES_OK(context, - context->allocate_output(0, TensorShape({}), &output)); - - auto flat = output->flat(); - flat(0) = mpi_global.rank; - } -}; - -REGISTER_KERNEL_BUILDER(Name("MPIRank").Device(DEVICE_CPU), - MPIRankOp); -#if GOOGLE_CUDA -REGISTER_KERNEL_BUILDER(Name("MPIRank").Device(DEVICE_GPU).HostMemory("rank"), - MPIRankOp); -#endif - -REGISTER_OP("MPIRank") - .Output("rank: int32") - .SetShapeFn([](shape_inference::InferenceContext* c) { - c->set_output(0, c->Scalar()); - return Status::OK(); - }) - .Doc(R"doc( -Returns the index of the current process in the MPI group. - -More precisely, returns the rank of the calling process in the MPI_COMM_WORLD -communicator. - -rank: Rank of the calling process. -)doc"); - -// Op to get the current local MPI Rank. -template -class MPILocalRankOp : public OpKernel { - public: - explicit MPILocalRankOp(OpKernelConstruction* context) : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - OP_REQUIRES_OK(context, IsMPIInitialized()); - - // Write integer to output tensor - Tensor* output; - OP_REQUIRES_OK(context, - context->allocate_output(0, TensorShape({}), &output)); - - auto flat = output->flat(); - flat(0) = mpi_global.local_rank; - } -}; - -REGISTER_KERNEL_BUILDER(Name("MPILocalRank").Device(DEVICE_CPU), - MPILocalRankOp); -#if GOOGLE_CUDA -REGISTER_KERNEL_BUILDER( - Name("MPILocalRank").Device(DEVICE_GPU).HostMemory("rank"), - MPILocalRankOp); -#endif - -REGISTER_OP("MPILocalRank") - .Output("rank: int32") - .SetShapeFn([](shape_inference::InferenceContext* c) { - c->set_output(0, c->Scalar()); - return Status::OK(); - }) - .Doc(R"doc( -Returns the index of the current process in the node it is on. - -More precisely, returns the rank of the calling process in communicator that -only spans the MPI processes running on that node. - -rank: Rank of the calling process on the node it is on. -)doc"); - -template -class MPIAllreduceOp : public AsyncOpKernel { - public: - explicit MPIAllreduceOp(OpKernelConstruction* context) - : AsyncOpKernel(context) {} - - // Although this op is handled asynchronously, the ComputeAsync call is - // very inexpensive. It only sets up a CollectiveOpRecord and places it - // in the table for the background thread to handle. Thus, we do not need - // a TF pool thread to perform the op. 
- bool IsExpensive() override { return false; } - - void ComputeAsync(OpKernelContext* context, DoneCallback done) override { - OP_REQUIRES_OK_ASYNC(context, IsMPIInitialized(), done); - const Tensor* input_tensor = &context->input(0); - Tensor* output_tensor; - OP_REQUIRES_OK_ASYNC( - context, - context->allocate_output(0, input_tensor->shape(), &output_tensor), - done); - - // Record allocated on stack so op can fail without memory leak - CollectiveOpRecord record; - record.name = name(); - record.context = context; - record.in_t = input_tensor; - record.out_t = output_tensor; - record.on_gpu = IsGPUDevice(); - record.dtype = input_tensor->dtype(); - - const size_t temp_size = - (input_tensor->NumElements() + mpi_global.size - 1) / mpi_global.size; - TensorShape temp_shape; - temp_shape.AddDim(temp_size); - OP_REQUIRES_OK_ASYNC(context, - context->allocate_temp(input_tensor->dtype(), - temp_shape, &record.temp_t), - done); - - auto allreduce_done_callback = [done, context](StatusOr status) { - context->SetStatus(status.status()); - done(); - }; - record.callback = allreduce_done_callback; - - auto allreduce_launch_callback = [record] { - EnqueueTensorCollective(record, MPIRequest::ALLREDUCE); - }; - - // If we are on a CPU, our device context will be null and we can't - // get a stream to enqueue this on. On a CPU this op is called when the - // data is already available, so we can just immediately do the - // allreduce; we don't have to wait for the data to get populated. -#if GOOGLE_CUDA - auto device_context = context->op_device_context(); - if (device_context == nullptr) { - allreduce_launch_callback(); - } else { - auto stream = device_context->stream(); - stream->ThenDoHostCallback(allreduce_launch_callback); - } -#else - allreduce_launch_callback(); -#endif - } -}; - -REGISTER_KERNEL_BUILDER(Name("MPIAllreduce").Device(DEVICE_CPU), - MPIAllreduceOp); -#if GOOGLE_CUDA -REGISTER_KERNEL_BUILDER(Name("MPIAllreduce").Device(DEVICE_GPU), - MPIAllreduceOp); -#endif - -REGISTER_OP("MPIAllreduce") - .Attr("T: {int32, int64, float32}") - .Input("tensor: T") - .Output("sum: T") - .SetShapeFn([](shape_inference::InferenceContext* c) { - c->set_output(0, c->input(0)); - return Status::OK(); - }) - .Doc(R"doc( -Perform an MPI Allreduce on a tensor. All other processes that do a reduction -on a tensor with the same name must have the same dimension for that tensor. -Tensors are reduced with other tensors that have the same node name for the -allreduce. - -Arguments - tensor: A tensor to reduce. - -Output - sum: A tensor with the same shape as `tensor`, summed across all - MPI processes. -)doc"); - -template -class MPIAllgatherOp : public AsyncOpKernel { - public: - explicit MPIAllgatherOp(OpKernelConstruction* context) - : AsyncOpKernel(context) {} - - // Although this op is handled asynchronously, the ComputeAsync call is - // very inexpensive. It only sets up a CollectiveOpRecord and places it - // in the table for the background thread to handle. Thus, we do not need - // a TF pool thread to perform the op. 
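[Editor's note] Both MPIAllreduceOp above and the MPIAllgatherOp that follows are AsyncOpKernels whose ComputeAsync only files a CollectiveOpRecord and returns; the background thread later performs the collective and fires the stored callback to resume the graph. A generic Python sketch of that hand-off pattern, using only the standard library and illustrative names (the summation stands in for the real ring allreduce):

import queue
import threading

work_queue = queue.Queue()

def background_loop():
    """Drain enqueued records and fire each record's completion callback,
    the way the MPI background thread resumes a pending async op."""
    while True:
        record = work_queue.get()
        if record is None:        # shutdown marker
            break
        values, on_done = record
        on_done(sum(values))      # stand-in for the actual ring allreduce

worker = threading.Thread(target=background_loop)
worker.start()

finished = threading.Event()

def on_done(result):
    print("reduced:", result)
    finished.set()

work_queue.put(([1, 2, 3], on_done))   # what ComputeAsync would enqueue
finished.wait()
work_queue.put(None)
worker.join()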
- bool IsExpensive() override { return false; } - - void ComputeAsync(OpKernelContext* context, DoneCallback done) override { - OP_REQUIRES_OK_ASYNC(context, IsMPIInitialized(), done); - const Tensor* input_tensor = &context->input(0); - const Tensor* sizing_tensor = &context->input(1); - - // Record allocated on stack so op can fail without memory leak - CollectiveOpRecord record; - record.name = name(); - record.context = context; - record.in_t = input_tensor; - record.on_gpu = IsGPUDevice(); - - // Construct the output size from the sizing tensor - size_t output_first_dim = 0; - if (sizing_tensor->shape().dims() == 0) { - // 0-dim sizing_tensor implies that the op is just gathering - // a single element from each rank - output_first_dim = mpi_global.size; - for (int i = 0; i < mpi_global.size; i++) { - record.sizes_vec.push_back(1); - } - } else { - // Collect the total output tensor sizing from the sizing tensor - // NOTE: The sizing tensor is forced to be placed on the CPU by - // declaring the input as HostMemory, so it is valid to read it here. - const int64* sizing_array = - (const int64*)sizing_tensor->tensor_data().data(); - for (int i = 0; i < mpi_global.size; i++) { - record.sizes_vec.push_back(sizing_array[i]); - output_first_dim += sizing_array[i]; - } - } - - TensorShape output_shape; - output_shape.AddDim(output_first_dim); - for (int i = 1; i < input_tensor->shape().dims(); i++) { - output_shape.AddDim(input_tensor->shape().dim_size(i)); - } - - Tensor* output_tensor; - OP_REQUIRES_OK_ASYNC( - context, context->allocate_output(0, output_shape, &output_tensor), - done); - - record.out_t = output_tensor; - record.dtype = input_tensor->dtype(); - - auto allgather_done_callback = [done, context](StatusOr status) { - context->SetStatus(status.status()); - done(); - }; - record.callback = allgather_done_callback; - - auto allgather_launch_callback = [record] { - EnqueueTensorCollective(record, MPIRequest::ALLGATHER); - }; - - // If we are on a CPU, our device context will be null and we can't - // get a stream to enqueue this on. On a CPU this op is called when the - // data is already available, so we can just immediately do the - // allgather; we don't have to wait for the data to get populated. -#if GOOGLE_CUDA - auto device_context = context->op_device_context(); - if (device_context == nullptr) { - allgather_launch_callback(); - } else { - auto stream = device_context->stream(); - stream->ThenDoHostCallback(allgather_launch_callback); - } -#else - allgather_launch_callback(); -#endif - } -}; - -REGISTER_OP("MPIAllgather") - .Attr("T: {int32, int64, float32}") - .Attr("S: {int64}") - .Input("tensor: T") - .Input("sizes: S") - .Output("gathered: T") - .SetShapeFn([](shape_inference::InferenceContext* c) { - shape_inference::ShapeHandle output; - TF_RETURN_IF_ERROR( - c->ReplaceDim(c->input(0), 0, c->UnknownDim(), &output)); - c->set_output(0, output); - return Status::OK(); - }) - .Doc(R"doc( -Perform an MPI Allgather on a tensor. All other processes that do a gather on a -tensor with the same name must have the same rank for that tensor, and have the -same dimension on all but the first dimension. - -Arguments - tensor: A tensor to gather. - sizes: A tensor containing the first-dimension sizes of tensors to be - gathered from other ranks - -Output - gathered: A tensor with the same shape as `tensor` except for the first - dimension, which is the sum of dimensions in `sizes`. 
-)doc"); - -REGISTER_KERNEL_BUILDER( - Name("MPIAllgather").Device(DEVICE_CPU).HostMemory("sizes"), - MPIAllgatherOp); -#if GOOGLE_CUDA -REGISTER_KERNEL_BUILDER( - Name("MPIAllgather").Device(DEVICE_GPU).HostMemory("sizes"), - MPIAllgatherOp); -#endif - -} // namespace mpi -} // namespace contrib -} // namespace tensorflow - -#endif // TENSORFLOW_USE_MPI diff --git a/tensorflow/contrib/mpi_collectives/mpi_ops.py b/tensorflow/contrib/mpi_collectives/mpi_ops.py deleted file mode 100644 index bd7096d9cee..00000000000 --- a/tensorflow/contrib/mpi_collectives/mpi_ops.py +++ /dev/null @@ -1,163 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================= -"""Inter-process communication using MPI.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow as tf - -from tensorflow.python.framework import errors -from tensorflow.python.framework import load_library -from tensorflow.python.framework import ops -from tensorflow.python.platform import resource_loader -from tensorflow.python.platform import tf_logging as logging - - -def _load_library(name, op_list=None): - """Loads a .so file containing the specified operators. - - Args: - name: The name of the .so file to load. - op_list: A list of names of operators that the library should have. If None - then the .so file's contents will not be verified. - - Raises: - NameError if one of the required ops is missing. - """ - try: - filename = resource_loader.get_path_to_datafile(name) - library = load_library.load_op_library(filename) - for expected_op in (op_list or []): - for lib_op in library.OP_LIST.op: - if lib_op.name == expected_op: - break - else: - raise NameError('Could not find operator %s in dynamic library %s' % - (expected_op, name)) - return library - except errors.NotFoundError: - logging.warning('%s file could not be loaded.', name) - - -MPI_LIB = _load_library( - 'mpi_collectives.so', - ['MPISize', 'MPIRank', 'MPILocalRank', 'MPIAllgather', 'MPIAllreduce']) - - -def size(name=None): - """An op which returns the number of MPI processes. - - This is equivalent to running `MPI_Comm_size(MPI_COMM_WORLD, ...)` to get the - size of the global communicator. - - Returns: - An integer scalar containing the number of MPI processes. - """ - return MPI_LIB.mpi_size(name=name) - - -ops.NotDifferentiable('MPISize') - - -def rank(name=None): - """An op which returns the MPI rank of the calling process. - - This is equivalent to running `MPI_Comm_rank(MPI_COMM_WORLD, ...)` to get the - rank of the current process in the global communicator. - - Returns: - An integer scalar with the MPI rank of the calling process. - """ - return MPI_LIB.mpi_rank(name=name) - - -ops.NotDifferentiable('MPIRank') - - -def init(name=None): - """An op which initializes MPI on the device on which it is run. 
- - All future MPI ops must be run on the same device that the `init` op was run - on. - """ - return MPI_LIB.mpi_init(name=name) - - -ops.NotDifferentiable('MPIInit') - - -def local_rank(name=None): - """An op which returns the local MPI rank of the calling process, within the - node that it is running on. For example, if there are seven processes running - on a node, their local ranks will be zero through six, inclusive. - - This is equivalent to running `MPI_Comm_rank(...)` on a new communicator - which only includes processes on the same node. - - Returns: - An integer scalar with the local MPI rank of the calling process. - """ - return MPI_LIB.mpi_local_rank(name=name) - - -ops.NotDifferentiable('MPILocalRank') - - -def _allreduce(tensor, name=None): - """An op which sums an input tensor over all the MPI processes. - - The reduction operation is keyed by the name of the op. The tensor type and - shape must be the same on all MPI processes for a given name. The reduction - will not start until all processes are ready to send and receive the tensor. - - Returns: - A tensor of the same shape and type as `tensor`, summed across all - processes. - """ - return MPI_LIB.mpi_allreduce(tensor, name=name) - - -ops.NotDifferentiable('MPIAllreduce') - - -def allgather(tensor, name=None): - """An op which concatenates the input tensor with the same input tensor on - all other MPI processes. - - The concatenation is done on the first dimension, so the input tensors on the - different processes must have the same rank and shape, except for the first - dimension, which is allowed to be different. - - Returns: - A tensor of the same type as `tensor`, concatenated on dimension zero - across all processes. The shape is identical to the input shape, except for - the first dimension, which may be greater and is the sum of all first - dimensions of the tensors in different MPI processes. - """ - # Specify that first allgather is to collect the tensor gather sizes, - # indicated by passing in a scalar (0-D tensor) of value 0 - sizes_flag = tf.constant(0, dtype=tf.int64, name='size_flag_const') - my_size = tf.slice( - tf.shape(tensor, out_type=tf.int64), [0], [1], name='size_slice') - if name is None: - name = 'allgather' - sizing_name = '{}_sizing'.format(name) - sizes = MPI_LIB.mpi_allgather(my_size, sizes_flag, name=sizing_name) - return MPI_LIB.mpi_allgather(tensor, sizes, name=name) - - -ops.NotDifferentiable('MPIAllgather') diff --git a/tensorflow/contrib/mpi_collectives/mpi_ops_test.py b/tensorflow/contrib/mpi_collectives/mpi_ops_test.py deleted file mode 100644 index 48e5c0a0c70..00000000000 --- a/tensorflow/contrib/mpi_collectives/mpi_ops_test.py +++ /dev/null @@ -1,296 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================= - -"""Tests for tensorflow.contrib.mpi_collectives.mpi_ops.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os.path -import itertools - -import tensorflow as tf - -import tensorflow.contrib.mpi_collectives as mpi - - -def mpi_env_rank_and_size(): - """Get MPI rank and size from environment variables and return them as a - tuple of integers. - - Most MPI implementations have an `mpirun` or `mpiexec` command that will - run an MPI executable and set up all communication necessary between the - different processors. As part of that set up, they will set environment - variables that contain the rank and size of the MPI_COMM_WORLD - communicator. We can read those environment variables from Python in order - to ensure that `mpi.rank()` and `mpi.size()` return the expected values. - - Since MPI is just a standard, not an implementation, implementations - typically choose their own environment variable names. This function tries - to support several different implementation, but really it only needs to - support whatever implementation we want to use for the TensorFlow test - suite. - - If this is not running under MPI, then defaults of rank zero and size one - are returned. (This is appropriate because when you call MPI_Init in an - application not started with mpirun, it will create a new independent - communicator with only one process in it.) - """ - rank_env = "PMI_RANK OMPI_COMM_WORLD_RANK".split() - size_env = "PMI_SIZE OMPI_COMM_WORLD_SIZE".split() - - for rank_var, size_var in zip(rank_env, size_env): - rank = os.environ.get(rank_var) - size = os.environ.get(size_var) - if rank is not None and size is not None: - return int(rank), int(size) - - # Default to rank zero and size one if there are no environment variables - return 0, 1 - - -class MPITests(tf.test.TestCase): - """ - Tests for MPI ops in tensorflow.contrib.mpi_collectives. - """ - - def test_mpi_rank(self): - """Test that the rank returned by mpi.rank() is correct.""" - true_rank, _ = mpi_env_rank_and_size() - with self.test_session() as session: - rank = session.run(mpi.rank()) - self.assertEqual(true_rank, rank) - - def test_mpi_size(self): - """Test that the size returned by mpi.size() is correct.""" - _, true_size = mpi_env_rank_and_size() - with self.test_session() as session: - size = session.run(mpi.size()) - self.assertEqual(true_size, size) - - def test_mpi_allreduce_cpu(self): - """Test on CPU that the allreduce correctly sums 1D, 2D, 3D tensors.""" - with self.test_session() as session: - size = session.run(mpi.size()) - - dtypes = [tf.int32, tf.float32] - dims = [1, 2, 3] - for dtype, dim in itertools.product(dtypes, dims): - tf.set_random_seed(1234) - tensor = tf.random_uniform([17] * dim, -100, 100, dtype=dtype) - summed = mpi.allreduce(tensor, average=False) - multiplied = tensor * size - max_difference = tf.reduce_max(tf.abs(summed - multiplied)) - - # Threshold for floating point equality depends on number of - # ranks, since we're comparing against precise multiplication. - if size <= 3: - threshold = 0 - elif size < 10: - threshold = 1e-4 - elif size < 15: - threshold = 5e-4 - else: - break - - diff = session.run(max_difference) - self.assertTrue(diff <= threshold, - "mpi.allreduce produces incorrect results") - - def test_mpi_allreduce_gpu(self): - """Test that the allreduce works on GPUs. 
- - This test will crash badly if used with an MPI implementation that does - not support GPU memory transfers directly, as it will call MPI_Send on - a GPU data pointer.""" - # Only do this test if there are GPUs available. - if not tf.test.is_gpu_available(cuda_only=True): - return - - no_gpus = tf.GPUOptions(visible_device_list="") - cpu_config = tf.ConfigProto(gpu_options=no_gpus) - with self.test_session(config=cpu_config) as session: - local_rank = session.run(mpi.local_rank()) - - one_gpu = tf.GPUOptions(visible_device_list=str(local_rank)) - gpu_config = tf.ConfigProto(gpu_options=one_gpu) - with self.test_session(config=gpu_config) as session: - size = session.run(mpi.size()) - - dtype = tf.float32 - dim = 3 - with tf.device("/gpu:0"): - tf.set_random_seed(1234) - tensor = tf.random_uniform([17] * dim, -100, 100, dtype=dtype) - summed = mpi.allreduce(tensor, average=False) - multiplied = tensor * size - max_difference = tf.reduce_max(tf.abs(summed - multiplied)) - - # Threshold for floating point equality depends on number of - # ranks, since we're comparing against precise multiplication. - if size <= 3: - threshold = 0 - elif size < 10: - threshold = 1e-4 - elif size < 15: - threshold = 5e-4 - else: - return - - diff = session.run(max_difference) - self.assertTrue(diff <= threshold, - "mpi.allreduce on GPU produces incorrect results") - - def test_mpi_allreduce_error(self): - """Test that the allreduce raises an error if different ranks try to - send tensors of different rank or dimension.""" - with self.test_session() as session: - rank = session.run(mpi.rank()) - size = session.run(mpi.size()) - - # This test does not apply if there is only one worker. - if size == 1: - return - - # Same rank, different dimension - tf.set_random_seed(1234) - dims = [17 + rank] * 3 - tensor = tf.random_uniform(dims, -1.0, 1.0) - with self.assertRaises(tf.errors.FailedPreconditionError): - session.run(mpi.allreduce(tensor)) - - # Same number of elements, different rank - tf.set_random_seed(1234) - if rank == 0: - dims = [17, 23 * 57] - else: - dims = [17, 23, 57] - tensor = tf.random_uniform(dims, -1.0, 1.0) - with self.assertRaises(tf.errors.FailedPreconditionError): - session.run(mpi.allreduce(tensor)) - - def test_mpi_allreduce_type_error(self): - """Test that the allreduce raises an error if different ranks try to - send tensors of different type.""" - with self.test_session() as session: - rank = session.run(mpi.rank()) - size = session.run(mpi.size()) - - # This test does not apply if there is only one worker. 
- if size == 1: - return - - # Same rank, different dimension - dims = [17] * 3 - tensor = tf.ones(dims, dtype=tf.int32 if rank % 2 == 0 else tf.float32) - with self.assertRaises(tf.errors.FailedPreconditionError): - session.run(mpi.allreduce(tensor)) - - def test_mpi_allgather(self): - """Test that the allgather correctly gathers 1D, 2D, 3D tensors.""" - with self.test_session() as session: - size = session.run(mpi.size()) - rank = session.run(mpi.rank()) - - dtypes = tf.int32, tf.float32 - dims = 1, 2, 3 - for dtype, dim in itertools.product(dtypes, dims): - tensor = tf.ones([17] * dim, dtype=dtype) * rank - gathered = mpi.allgather(tensor) - - gathered_tensor = session.run(gathered) - self.assertEqual(list(gathered_tensor.shape), - [17 * size] + [17] * (dim - 1)) - - for i in range(size): - rank_tensor = tf.slice(gathered_tensor, [i * 17] + [0] * (dim - 1), - [17] + [-1] * (dim - 1)) - self.assertEqual(list(rank_tensor.shape), [17] * dim) - self.assertTrue(session.run(tf.reduce_all(tf.equal(rank_tensor, i))), - "mpi.allgather produces incorrect gathered tensor") - - def test_mpi_allgather_variable_size(self): - """Test that the allgather correctly gathers 1D, 2D, 3D tensors, - even if those tensors have different sizes along the first dim.""" - with self.test_session() as session: - size = session.run(mpi.size()) - rank = session.run(mpi.rank()) - - dtypes = tf.int32, tf.float32 - dims = 1, 2, 3 - for dtype, dim in itertools.product(dtypes, dims): - # Support tests up to MPI Size of 35 - if size > 35: - break - - tensor_sizes = [17, 32, 81, 12, 15, 23, 22] * 5 - tensor_sizes = tensor_sizes[:size] - - tensor = tf.ones([tensor_sizes[rank]] + [17] * (dim - 1), - dtype=dtype) * rank - gathered = mpi.allgather(tensor) - - gathered_tensor = session.run(gathered) - expected_size = sum(tensor_sizes) - self.assertEqual(list(gathered_tensor.shape), - [expected_size] + [17] * (dim - 1)) - - for i in range(size): - rank_size = [tensor_sizes[i]] + [17] * (dim - 1) - rank_tensor = tf.slice(gathered, - [sum(tensor_sizes[:i])] + [0] * (dim - 1), - rank_size) - self.assertEqual(list(rank_tensor.shape), rank_size) - self.assertTrue(session.run(tf.reduce_all(tf.equal(rank_tensor, i))), - "mpi.allgather produces incorrect gathered tensor") - - def test_mpi_allgather_error(self): - """Test that the allgather returns an error if any dimension besides - the first is different among the tensors being gathered.""" - with self.test_session() as session: - rank = session.run(mpi.rank()) - size = session.run(mpi.size()) - - # This test does not apply if there is only one worker. - if size == 1: - return - - tensor_size = [17] * 3 - tensor_size[1] = 10 * (rank + 1) - tensor = tf.ones(tensor_size, dtype=tf.float32) * rank - with self.assertRaises(tf.errors.FailedPreconditionError): - session.run(mpi.allgather(tensor)) - - def test_mpi_allgather_type_error(self): - """Test that the allgather returns an error if the types being gathered - differ among the processes""" - with self.test_session() as session: - rank = session.run(mpi.rank()) - size = session.run(mpi.size()) - - # This test does not apply if there is only one worker. 
- if size == 1: - return - - tensor_size = [17] * 3 - dtype = tf.int32 if rank % 2 == 0 else tf.float32 - tensor = tf.ones(tensor_size, dtype=dtype) * rank - with self.assertRaises(tf.errors.FailedPreconditionError): - session.run(mpi.allgather(tensor)) - - -if __name__ == '__main__': - tf.test.main() diff --git a/tensorflow/contrib/mpi_collectives/ops/mpi_ops.cc b/tensorflow/contrib/mpi_collectives/ops/mpi_ops.cc deleted file mode 100644 index 18e6bb61cff..00000000000 --- a/tensorflow/contrib/mpi_collectives/ops/mpi_ops.cc +++ /dev/null @@ -1,132 +0,0 @@ -/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifdef TENSORFLOW_USE_MPI - -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" - -namespace tensorflow { -namespace contrib { -namespace mpi_collectives { - -REGISTER_OP("MPIInit").Doc(R"doc( -Initialize MPI for the current process. - -If this is run on a GPU, then that GPU must be used for all future MPI -operations. If it is run on CPU, then all future MPI operations must also -run on CPU. -)doc"); - -REGISTER_OP("MPISize") - .Output("size: int32") - .SetShapeFn([](shape_inference::InferenceContext* c) { - c->set_output(0, c->Scalar()); - return Status::OK(); - }) - .Doc(R"doc( -Returns the number of running MPI processes. - -More precisely, returns the number of MPI processes in the group associated -with the MPI_COMM_WORLD communicator. - -size: Size of the MPI group. -)doc"); - -REGISTER_OP("MPIRank") - .Output("rank: int32") - .SetShapeFn([](shape_inference::InferenceContext* c) { - c->set_output(0, c->Scalar()); - return Status::OK(); - }) - .Doc(R"doc( -Returns the index of the current process in the MPI group. - -More precisely, returns the rank of the calling process in the MPI_COMM_WORLD -communicator. - -rank: Rank of the calling process. -)doc"); - -REGISTER_OP("MPILocalRank") - .Output("rank: int32") - .SetShapeFn([](shape_inference::InferenceContext* c) { - c->set_output(0, c->Scalar()); - return Status::OK(); - }) - .Doc(R"doc( -Returns the index of the current process in the node it is on. - -More precisely, returns the rank of the calling process in communicator that -only spans the MPI processes running on that node. - -rank: Rank of the calling process on the node it is on. -)doc"); - -REGISTER_OP("MPIAllreduce") - .Attr("T: {int32, int64, float32}") - .Input("tensor: T") - .Output("sum: T") - .SetShapeFn([](shape_inference::InferenceContext* c) { - c->set_output(0, c->input(0)); - return Status::OK(); - }) - .Doc(R"doc( -Perform an MPI Allreduce on a tensor. All other processes that do a reduction -on a tensor with the same name must have the same dimension for that tensor. -Tensors are reduced with other tensors that have the same node name for the -allreduce. - -Arguments - tensor: A tensor to reduce. - -Output - sum: A tensor with the same shape as `tensor`, summed across all - MPI processes. 
-)doc"); - -REGISTER_OP("MPIAllgather") - .Attr("T: {int32, int64, float32}") - .Attr("S: {int64}") - .Input("tensor: T") - .Input("sizes: S") - .Output("gathered: T") - .SetShapeFn([](shape_inference::InferenceContext* c) { - shape_inference::ShapeHandle output; - TF_RETURN_IF_ERROR( - c->ReplaceDim(c->input(0), 0, c->UnknownDim(), &output)); - c->set_output(0, output); - return Status::OK(); - }) - .Doc(R"doc( -Perform an MPI Allgather on a tensor. All other processes that do a gather on a -tensor with the same name must have the same rank for that tensor, and have the -same dimension on all but the first dimension. - -Arguments - tensor: A tensor to gather. - sizes: A tensor containing the first-dimension sizes of tensors to be - gathered from other ranks - -Output - gathered: A tensor with the same shape as `tensor` except for the first - dimension, which is the sum of dimensions in `sizes`. -)doc"); - -} // namespace mpi_collectives -} // namespace contrib -} // namespace tensorflow - -#endif // TENSORFLOW_USE_MPI diff --git a/tensorflow/contrib/mpi_collectives/python/ops/mpi_ops.py b/tensorflow/contrib/mpi_collectives/python/ops/mpi_ops.py deleted file mode 100644 index 2fbefef0d36..00000000000 --- a/tensorflow/contrib/mpi_collectives/python/ops/mpi_ops.py +++ /dev/null @@ -1,134 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================= -"""Inter-process communication using MPI.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow as tf - -from tensorflow.contrib.mpi_collectives.ops import gen_mpi_ops -from tensorflow.contrib.util import loader -from tensorflow.python.framework import ops -from tensorflow.python.platform import resource_loader - -_mpi_ops_so = loader.load_op_library( - resource_loader.get_path_to_datafile('_mpi_ops.so')) - - -def size(name=None): - """An op which returns the number of MPI processes. - - This is equivalent to running `MPI_Comm_size(MPI_COMM_WORLD, ...)` to get the - size of the global communicator. - - Returns: - An integer scalar containing the number of MPI processes. - """ - return gen_mpi_ops.mpi_size(name=name) - - -ops.NotDifferentiable('MPISize') - - -def rank(name=None): - """An op which returns the MPI rank of the calling process. - - This is equivalent to running `MPI_Comm_rank(MPI_COMM_WORLD, ...)` to get the - rank of the current process in the global communicator. - - Returns: - An integer scalar with the MPI rank of the calling process. - """ - return gen_mpi_ops.mpi_rank(name=name) - - -ops.NotDifferentiable('MPIRank') - - -def init(name=None): - """An op which initializes MPI on the device on which it is run. - - All future MPI ops must be run on the same device that the `init` op was run - on. 
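The MPIAllgather registration above takes an explicit `sizes` input and produces an output whose first dimension is the sum of those sizes; the Python `allgather()` wrapper further down obtains `sizes` itself with a preliminary allgather of each rank's first-dimension length. A NumPy sketch of that shape arithmetic, using hypothetical per-rank sizes borrowed from the variable-size test:

```python
import numpy as np

# Hypothetical first-dimension sizes on four ranks.
local_rows = [17, 32, 81, 12]

# Step 1: gather the sizes so every rank knows how much data to expect.
sizes = np.array(local_rows, dtype=np.int64)

# Step 2: gather the payloads and concatenate along dimension zero.
payloads = [np.full((rows, 5), rank, dtype=np.float32)
            for rank, rows in enumerate(local_rows)]
gathered = np.concatenate(payloads, axis=0)
assert gathered.shape == (int(sizes.sum()), 5)
```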
- """ - return gen_mpi_ops.mpi_init(name=name) - - -ops.NotDifferentiable('MPIInit') - - -def local_rank(name=None): - """An op which returns the local MPI rank of the calling process, within the - node that it is running on. For example, if there are seven processes running - on a node, their local ranks will be zero through six, inclusive. - - This is equivalent to running `MPI_Comm_rank(...)` on a new communicator - which only includes processes on the same node. - - Returns: - An integer scalar with the local MPI rank of the calling process. - """ - return gen_mpi_ops.mpi_local_rank(name=name) - - -ops.NotDifferentiable('MPILocalRank') - - -def _allreduce(tensor, name=None): - """An op which sums an input tensor over all the MPI processes. - - The reduction operation is keyed by the name of the op. The tensor type and - shape must be the same on all MPI processes for a given name. The reduction - will not start until all processes are ready to send and receive the tensor. - - Returns: - A tensor of the same shape and type as `tensor`, summed across all - processes. - """ - return gen_mpi_ops.mpi_allreduce(tensor, name=name) - - -ops.NotDifferentiable('MPIAllreduce') - - -def allgather(tensor, name=None): - """An op which concatenates the input tensor with the same input tensor on - all other MPI processes. - - The concatenation is done on the first dimension, so the input tensors on the - different processes must have the same rank and shape, except for the first - dimension, which is allowed to be different. - - Returns: - A tensor of the same type as `tensor`, concatenated on dimension zero - across all processes. The shape is identical to the input shape, except for - the first dimension, which may be greater and is the sum of all first - dimensions of the tensors in different MPI processes. - """ - # Specify that first allgather is to collect the tensor gather sizes, - # indicated by passing in a scalar (0-D tensor) of value 0 - sizes_flag = tf.constant(0, dtype=tf.int64, name='size_flag_const') - my_size = tf.slice( - tf.shape(tensor, out_type=tf.int64), [0], [1], name='size_slice') - if name is None: - name = 'allgather' - sizing_name = '{}_sizing'.format(name) - sizes = gen_mpi_ops.mpi_allgather(my_size, sizes_flag, name=sizing_name) - return gen_mpi_ops.mpi_allgather(tensor, sizes, name=name) - - -ops.NotDifferentiable('MPIAllgather') diff --git a/tensorflow/contrib/mpi_collectives/ring.cc b/tensorflow/contrib/mpi_collectives/ring.cc deleted file mode 100644 index d93233eb210..00000000000 --- a/tensorflow/contrib/mpi_collectives/ring.cc +++ /dev/null @@ -1,80 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#ifdef TENSORFLOW_USE_MPI - -#define EIGEN_USE_THREADS - -#include "tensorflow/contrib/mpi_collectives/ring.h" - -namespace tensorflow { -namespace contrib { -namespace mpi { - -using CPUDevice = Eigen::ThreadPoolDevice; - -extern template MPI_Datatype MPIType(); -extern template MPI_Datatype MPIType(); -extern template MPI_Datatype MPIType(); -extern template DataType TensorFlowDataType(); -extern template DataType TensorFlowDataType(); -extern template DataType TensorFlowDataType(); - -// Generate all necessary specializations for RingAllreduce. -template Status RingAllreduce(OpKernelContext*, const Tensor*, - Tensor*, Tensor*); -template Status RingAllreduce(OpKernelContext*, - const Tensor*, Tensor*, - Tensor*); -template Status RingAllreduce(OpKernelContext*, const Tensor*, - Tensor*, Tensor*); - -// Generate all necessary specializations for RingAllgather. -template Status RingAllgather(OpKernelContext*, const Tensor*, - const std::vector&, - Tensor*); -template Status RingAllgather(OpKernelContext*, - const Tensor*, - const std::vector&, - Tensor*); -template Status RingAllgather(OpKernelContext*, const Tensor*, - const std::vector&, - Tensor*); - -// Copy data on a CPU using a straight-forward memcpy. -template <> -void CopyTensorData(void* dst, void* src, size_t size) { - std::memcpy(dst, src, size); -}; - -// Accumulate values on a CPU. -#define GENERATE_ACCUMULATE(type) \ - template <> \ - void AccumulateTensorData(type * dst, type * src, \ - size_t size) { \ - for (unsigned int i = 0; i < size; i++) { \ - dst[i] += src[i]; \ - } \ - }; -GENERATE_ACCUMULATE(int); -GENERATE_ACCUMULATE(long long); -GENERATE_ACCUMULATE(float); -#undef GENERATE_ACCUMULATE - -} // namespace mpi -} // namespace contrib -} // namespace tensorflow - -#endif // TENSORFLOW_USE_MPI diff --git a/tensorflow/contrib/mpi_collectives/ring.cu.cc b/tensorflow/contrib/mpi_collectives/ring.cu.cc deleted file mode 100644 index 401d1caa514..00000000000 --- a/tensorflow/contrib/mpi_collectives/ring.cu.cc +++ /dev/null @@ -1,118 +0,0 @@ -/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifdef TENSORFLOW_USE_MPI - -#if GOOGLE_CUDA - -#define EIGEN_USE_GPU - -#include "tensorflow/contrib/mpi_collectives/ring.h" - -namespace tensorflow { -namespace contrib { -namespace mpi { - -using CPUDevice = Eigen::ThreadPoolDevice; - -template <> -MPI_Datatype MPIType() { - return MPI_FLOAT; -}; -template <> -MPI_Datatype MPIType() { - return MPI_INT; -}; -template <> -MPI_Datatype MPIType() { - return MPI_LONG_LONG; -}; - -template <> -DataType TensorFlowDataType() { - return DT_FLOAT; -}; -template <> -DataType TensorFlowDataType() { - return DT_INT32; -}; -template <> -DataType TensorFlowDataType() { - return DT_INT64; -}; - -// Generate all necessary specializations for RingAllreduce. 
-template Status RingAllreduce<GPUDevice, int>(OpKernelContext*, const Tensor*,
-                                              Tensor*, Tensor*);
-template Status RingAllreduce<GPUDevice, long long>(OpKernelContext*,
-                                                    const Tensor*, Tensor*,
-                                                    Tensor*);
-template Status RingAllreduce<GPUDevice, float>(OpKernelContext*, const Tensor*,
-                                                Tensor*, Tensor*);
-
-// Generate all necessary specializations for RingAllgather.
-template Status RingAllgather<GPUDevice, int>(OpKernelContext*, const Tensor*,
-                                              const std::vector<size_t>&,
-                                              Tensor*);
-template Status RingAllgather<GPUDevice, long long>(OpKernelContext*,
-                                                    const Tensor*,
-                                                    const std::vector<size_t>&,
-                                                    Tensor*);
-template Status RingAllgather<GPUDevice, float>(OpKernelContext*, const Tensor*,
-                                                const std::vector<size_t>&,
-                                                Tensor*);
-
-// Synchronously copy data on the GPU, using a different stream than the default
-// and than TensorFlow to avoid synchronizing on operations unrelated to the
-// allreduce.
-template <>
-void CopyTensorData<GPUDevice>(void* dst, void* src, size_t size) {
-  auto stream = CudaStreamForMPI();
-  cudaMemcpyAsync(dst, src, size, cudaMemcpyDeviceToDevice, stream);
-  cudaStreamSynchronize(stream);
-};
-
-// Elementwise accumulation kernel for GPU.
-template <typename T>
-__global__ void elemwise_accum(T* out, const T* in, const size_t N) {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
-       i += blockDim.x * gridDim.x) {
-    out[i] += in[i];
-  }
-}
-
-// Synchronously accumulate tensors on the GPU, using a different stream than
-// the default and than TensorFlow to avoid synchronizing on operations
-// unrelated to the allreduce.
-#define GENERATE_ACCUMULATE(type)                                          \
-  template <>                                                              \
-  void AccumulateTensorData<GPUDevice, type>(type * dst, type * src,       \
-                                             size_t size) {                \
-    auto stream = CudaStreamForMPI();                                      \
-    TF_CHECK_OK(GpuLaunchKernel(elemwise_accum<type>, 32, 256, 0, stream,  \
-                                dst, src, size));                          \
-    cudaStreamSynchronize(stream);                                         \
-  };
-GENERATE_ACCUMULATE(int);
-GENERATE_ACCUMULATE(long long);
-GENERATE_ACCUMULATE(float);
-#undef GENERATE_ACCUMULATE
-
-}  // namespace mpi
-}  // namespace contrib
-}  // namespace tensorflow
-#endif  // GOOGLE_CUDA
-
-#endif  // TENSORFLOW_USE_MPI
diff --git a/tensorflow/contrib/mpi_collectives/ring.h b/tensorflow/contrib/mpi_collectives/ring.h
deleted file mode 100644
index 9b5d52e1b64..00000000000
--- a/tensorflow/contrib/mpi_collectives/ring.h
+++ /dev/null
@@ -1,327 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/ - -#ifndef TENSORFLOW_CONTRIB_MPI_H_ -#define TENSORFLOW_CONTRIB_MPI_H_ - -#ifdef TENSORFLOW_USE_MPI - -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/shape_inference.h" - -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" -#include "tensorflow/core/framework/tensor_types.h" - -#if GOOGLE_CUDA -#include "cuda_runtime.h" -#endif - -// Needed to avoid header issues with C++-supporting MPI implementations -#define OMPI_SKIP_MPICXX -#include "third_party/mpi/mpi.h" - -#define TAG_TENSOR 12 - -namespace tensorflow { -namespace contrib { -namespace mpi { - -using CPUDevice = Eigen::ThreadPoolDevice; -using GPUDevice = Eigen::GpuDevice; - -// Convert from templated types to values we can pass to MPI. -template -MPI_Datatype MPIType(); - -// Convert from templated types to TensorFlow data types. -template -DataType TensorFlowDataType(); - -#define MPI_REQUIRES_OK(MPI_STATUS) \ - if ((MPI_STATUS) != MPI_SUCCESS) { \ - return errors::Unknown("MPI operation failed unexpectedly."); \ - } - -// Copy data from one tensor to another tensor. -// This uses a custom CUDA stream on GPU, which is necessary to overlay the -// backpropagation computations with the allreduce. -template -void CopyTensorData(void* destination, void* source, size_t size); - -// Add a tensor into another tensor, accumulating in place. -// This uses a custom CUDA stream on GPU, which is necessary to overlay the -// backpropagation computations with the allreduce. -template -void AccumulateTensorData(T* destination, T* source, size_t size); - -// We need to get the right stream for doing CUDA memory transfers and -// operations, which is possibly different from the standard TensorFlow stream. -#if GOOGLE_CUDA -cudaStream_t CudaStreamForMPI(); -#endif - -/* Perform a ring allreduce on the data. Allocate the necessary output tensor - * and store it in the output parameter. - * - * Assumes that all MPI processes are doing an allreduce of the same tensor, - * with the same dimensions. - * - * A ring allreduce is a bandwidth-optimal way to do an allreduce. To do the - * allreduce, the nodes involved are arranged in a ring: - * - * .--0--. - * / \ - * 3 1 - * \ / - * *--2--* - * - * Each node always sends to the next clockwise node in the ring, and receives - * from the previous one. - * - * The allreduce is done in two parts: a scatter-reduce and an allgather. In - * the scatter reduce, a reduction is done, so that each node ends up with a - * chunk of the final output tensor which has contributions from all other - * nodes. In the allgather, those chunks are distributed among all the nodes, - * so that all nodes have the entire output tensor. - * - * Both of these operations are done by dividing the input tensor into N - * evenly sized chunks (where N is the number of nodes in the ring). - * - * The scatter-reduce is done in N-1 steps. In the ith step, node j will send - * the (j - i)th chunk and receive the (j - i - 1)th chunk, adding it in to - * its existing data for that chunk. 
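The scatter-reduce schedule described in the comment above is mechanical, so the transfer tables that follow in this comment can be regenerated from the (j - i) / (j - i - 1) rule. A short sketch, assuming N = 4 nodes as in the ring diagram:

```python
N = 4  # number of nodes in the ring, as in the diagram above

# Scatter-reduce: at step i, node j sends segment (j - i) mod N to its right
# neighbor and receives segment (j - i - 1) mod N from its left neighbor.
for i in range(N - 1):
    print("iteration", i)
    for j in range(N):
        send_seg = (j - i) % N
        recv_seg = (j - i - 1) % N
        print("  node %d sends segment %d to node %d, receives segment %d"
              % (j, send_seg, (j + 1) % N, recv_seg))
```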
For example, in the first iteration with - * the ring depicted above, you will have the following transfers: - * - * Segment 0: Node 0 --> Node 1 - * Segment 1: Node 1 --> Node 2 - * Segment 2: Node 2 --> Node 3 - * Segment 3: Node 3 --> Node 0 - * - * In the second iteration, you'll have the following transfers: - * - * Segment 0: Node 1 --> Node 2 - * Segment 1: Node 2 --> Node 3 - * Segment 2: Node 3 --> Node 0 - * Segment 3: Node 0 --> Node 1 - * - * After this iteration, Node 2 has 3 of the four contributions to Segment 0. - * The last iteration has the following transfers: - * - * Segment 0: Node 2 --> Node 3 - * Segment 1: Node 3 --> Node 0 - * Segment 2: Node 0 --> Node 1 - * Segment 3: Node 1 --> Node 2 - * - * After this iteration, Node 3 has the fully accumulated Segment 0; Node 0 - * has the fully accumulated Segment 1; and so on. The scatter-reduce is - * complete. - * - * Next, the allgather distributes these fully accumulated chunks across all - * nodes. Communication proceeds in the same ring, once again in N-1 steps. At - * the ith step, node j will send chunk (j - i + 1) and receive chunk (j - i). - * For example, at the first iteration, the following transfers will occur: - * - * Segment 0: Node 3 --> Node 0 - * Segment 1: Node 0 --> Node 1 - * Segment 2: Node 1 --> Node 2 - * Segment 3: Node 2 --> Node 3 - * - * After the first iteration, Node 0 will have a fully accumulated Segment 0 - * (from Node 3) and Segment 1. In the next iteration, Node 0 will send its - * just-received Segment 0 onward to Node 1, and receive Segment 3 from Node 3. - * After this has continued for N - 1 iterations, all nodes will have a the - * fully accumulated tensor. - * - * Each node will do (N-1) sends for the scatter-reduce and (N-1) sends for the - * allgather. Each send will contain K / N bytes, if there are K bytes in the - * original tensor on every node. Thus, each node sends and receives 2K(N - 1)/N - * bytes of data, and the performance of the allreduce (assuming no latency in - * connections) is constrained by the slowest interconnect between the nodes. - * - */ -template -Status RingAllreduce(OpKernelContext* context, const Tensor* input, - Tensor* temp, Tensor* output) { - // Acquire MPI size and rank - int n, r; - MPI_REQUIRES_OK(MPI_Comm_size(MPI_COMM_WORLD, &n)); - MPI_REQUIRES_OK(MPI_Comm_rank(MPI_COMM_WORLD, &r)); - - T* buffer = (T*)output->tensor_data().data(); - - CopyTensorData((void*)buffer, (void*)input->tensor_data().data(), - output->tensor_data().size()); - - // Calculate segment sizes and segment ends - const size_t elements_to_reduce = input->NumElements(); - const size_t segment_size = elements_to_reduce / n; - std::vector segment_sizes(n, segment_size); - - const size_t residual = elements_to_reduce % n; - for (size_t i = 0; i < residual; ++i) { - segment_sizes[i]++; - } - - std::vector segment_starts(n); - segment_starts[0] = 0; - for (size_t i = 1; i < segment_starts.size(); ++i) { - segment_starts[i] = segment_starts[i - 1] + segment_sizes[i - 1]; - } - - assert(segment_starts[n - 1] + segment_sizes[n - 1] == elements_to_reduce); - - T* segment_recv = (T*)temp->tensor_data().data(); - - // Receive from your left neighbor with wrap-around - const size_t recv_from = ((r - 1) + n) % n; - - // Send to your right neighbor with wrap-around - const size_t send_to = (r + 1) % n; - - MPI_Status recv_status; - MPI_Request recv_req; - - // Now start ring. 
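A single-process NumPy sketch of the arithmetic in the scatter-reduce and allgather loops that follow, including the uneven segment split where the residual goes to the first segments. It models only the buffer updates, not the MPI transfers, and checks the result against a plain sum:

```python
import numpy as np

def ring_allreduce_sim(rank_data):
    """Simulate the scatter-reduce and allgather phases on one process."""
    n = len(rank_data)
    count = rank_data[0].size
    # Segment sizes: count // n each, with the residual spread over the first segments.
    sizes = [count // n + (1 if i < count % n else 0) for i in range(n)]
    starts = [0]
    for size in sizes[:-1]:
        starts.append(starts[-1] + size)
    buf = [d.astype(np.float64).copy() for d in rank_data]  # one buffer per rank

    def seg(r, s):
        return buf[r][starts[s]:starts[s] + sizes[s]]

    # Phase 1: scatter-reduce. At step i, rank r accumulates the segment
    # (r - i - 1) it receives from its left neighbor into its own copy.
    for i in range(n - 1):
        incoming = [seg((r - 1) % n, (r - i - 1) % n).copy() for r in range(n)]
        for r in range(n):
            seg(r, (r - i - 1) % n)[:] += incoming[r]

    # Phase 2: allgather. At step i, rank r overwrites its segment (r - i)
    # with the fully reduced copy arriving from its left neighbor.
    for i in range(n - 1):
        incoming = [seg((r - 1) % n, (r - i) % n).copy() for r in range(n)]
        for r in range(n):
            seg(r, (r - i) % n)[:] = incoming[r]
    return buf

data = [np.arange(10, dtype=np.float64) * (r + 1) for r in range(4)]
assert all(np.allclose(b, np.sum(data, axis=0)) for b in ring_allreduce_sim(data))
```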
At every step, for every rank, we iterate through - // segments with wraparound and send and recv from our neighbors and reduce - // locally. At the i'th iteration, rank r, sends segment (r-i) and receives - // segment (r-i-1). - for (int i = 0; i < n - 1; i++) { - const size_t send_seg_id = ((r - i) + n) % n; - const size_t recv_seg_id = ((r - i - 1) + n) % n; - - T* segment_send = &(buffer[segment_starts[send_seg_id]]); - - MPI_REQUIRES_OK(MPI_Irecv(segment_recv, segment_sizes[recv_seg_id], - MPIType(), recv_from, TAG_TENSOR, - MPI_COMM_WORLD, &recv_req)); - - MPI_REQUIRES_OK(MPI_Send(segment_send, segment_sizes[send_seg_id], - MPIType(), send_to, TAG_TENSOR, - MPI_COMM_WORLD)); - - T* segment_update = &(buffer[segment_starts[recv_seg_id]]); - - // Wait for recv to complete before reduction - MPI_REQUIRES_OK(MPI_Wait(&recv_req, &recv_status)); - - const size_t recv_seg_size = segment_sizes[recv_seg_id]; - AccumulateTensorData(segment_update, segment_recv, - recv_seg_size); - } - - // Now start pipelined ring allgather. At every step, for every rank, we - // iterate through segments with wraparound and send and recv from our - // neighbors. At the i'th iteration, rank r, sends segment (r-i+1) and - // receives segment (r-i). - for (size_t i = 0; i < n - 1; ++i) { - const size_t send_seg_id = ((r - i + 1) + n) % n; - const size_t recv_seg_id = ((r - i) + n) % n; - - // Segment to send - at every iteration we send segment (r-i+1) - T* segment_send = &(buffer[segment_starts[send_seg_id]]); - - // Segment to recv - at every iteration we receive segment (r-i) - T* segment_recv = &(buffer[segment_starts[recv_seg_id]]); - - MPI_REQUIRES_OK(MPI_Sendrecv( - segment_send, segment_sizes[send_seg_id], MPIType(), send_to, - TAG_TENSOR, segment_recv, segment_sizes[recv_seg_id], MPIType(), - recv_from, TAG_TENSOR, MPI_COMM_WORLD, &recv_status)); - } - - return Status::OK(); -} - -// Perform a ring allgather on a Tensor. Other ranks may allgather with a -// tensor which differs in the first dimension only; all other dimensions must -// be the same. -// -// For more information on the ring allgather, read the documentation for the -// ring allreduce, which includes a ring allgather. -template -Status RingAllgather(OpKernelContext* context, const Tensor* input, - const std::vector& sizes, Tensor* output) { - // Acquire MPI size and rank - int n, r; - MPI_REQUIRES_OK(MPI_Comm_size(MPI_COMM_WORLD, &n)); - MPI_REQUIRES_OK(MPI_Comm_rank(MPI_COMM_WORLD, &r)); - - assert(sizes.size() == n); - assert(input->dim_size(0) == sizes[r]); - - // Compute number of elements in every "row". We can't compute number of - // elements in every chunks, because those chunks are variable length. - size_t elements_per_row = 1; - for (int i = 1; i < input->shape().dims(); i++) { - elements_per_row *= input->dim_size(i); - } - - // Copy data from input tensor to correct place in output tensor. - std::vector segment_starts(n); - segment_starts[0] = 0; - for (int i = 1; i < n; i++) { - segment_starts[i] = segment_starts[i - 1] + elements_per_row * sizes[i - 1]; - } - size_t offset = segment_starts[r]; - - // Copy data to the right offset for this rank. 
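The offset computation at this point of RingAllgather (a prefix sum of per-rank row counts, scaled by the number of elements per row) is easy to check in NumPy; the sizes below are hypothetical:

```python
import numpy as np

# Hypothetical per-rank first-dimension sizes; all other dimensions must match.
sizes = [3, 1, 4, 2]
elements_per_row = 5

# The deleted code computes flat element offsets:
#   segment_starts[i] = segment_starts[i - 1] + elements_per_row * sizes[i - 1]
# which is just the row prefix sum scaled by elements_per_row.
row_starts = np.cumsum([0] + sizes[:-1])
flat_starts = row_starts * elements_per_row  # the `offset = segment_starts[r]` values

output = np.empty((sum(sizes), elements_per_row))
for r, rows in enumerate(sizes):
    chunk = np.full((rows, elements_per_row), r, dtype=np.float64)  # rank r's tensor
    output[row_starts[r]:row_starts[r] + rows] = chunk

# Rank r's block starts at row sum(sizes[:r]).
assert [int(output[s, 0]) for s in row_starts] == [0, 1, 2, 3]
```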
- T* buffer = (T*)output->tensor_data().data(); - CopyTensorData((void*)(buffer + offset), - (void*)input->tensor_data().data(), - elements_per_row * sizes[r] * sizeof(T)); - - // Receive from your left neighbor with wrap-around - const size_t recv_from = ((r - 1) + n) % n; - - // Send to your right neighbor with wrap-around - const size_t send_to = (r + 1) % n; - - // Perform a ring allgather. At every step, for every rank, we iterate - // through segments with wraparound and send and recv from our neighbors. - // At the i'th iteration, rank r, sends segment (r-i) and receives segment - // (r-1-i). - MPI_Status recv_status; - for (size_t i = 0; i < n - 1; ++i) { - const size_t send_seg_id = ((r - i) + n) % n; - const size_t recv_seg_id = ((r - i - 1) + n) % n; - - // Segment to send - at every iteration we send segment (r-i) - size_t offset_send = segment_starts[send_seg_id]; - size_t rows_send = sizes[send_seg_id]; - T* segment_send = &(buffer[offset_send]); - - // Segment to recv - at every iteration we receive segment (r-1-i) - size_t offset_recv = segment_starts[recv_seg_id]; - size_t rows_recv = sizes[recv_seg_id]; - T* segment_recv = &(buffer[offset_recv]); - - MPI_REQUIRES_OK(MPI_Sendrecv( - segment_send, elements_per_row * rows_send, MPIType(), send_to, - TAG_TENSOR, segment_recv, elements_per_row * rows_recv, MPIType(), - recv_from, TAG_TENSOR, MPI_COMM_WORLD, &recv_status)); - } - - return Status::OK(); -} - -} // namespace mpi -} // namespace contrib -} // namespace tensorflow - -#endif // TENSORFLOW_USE_MPI - -#undef TENSORFLOW_CONTRIB_MPI_H_ -#endif // TENSORFLOW_CONTRIB_MPI_H_ diff --git a/tensorflow/contrib/nn/python/ops/alpha_dropout.py b/tensorflow/contrib/nn/python/ops/alpha_dropout.py index 2b64a78c223..ad9f223f302 100644 --- a/tensorflow/contrib/nn/python/ops/alpha_dropout.py +++ b/tensorflow/contrib/nn/python/ops/alpha_dropout.py @@ -19,12 +19,11 @@ from __future__ import print_function import numbers from tensorflow.python.framework import ops -from tensorflow.python.framework import tensor_shape from tensorflow.python.framework import tensor_util from tensorflow.python.ops import array_ops -from tensorflow.python.ops import random_ops from tensorflow.python.ops import gen_math_ops from tensorflow.python.ops import math_ops +from tensorflow.python.ops import random_ops def alpha_dropout(x, keep_prob, noise_shape=None, seed=None, name=None): # pylint: disable=invalid-name @@ -61,7 +60,7 @@ def alpha_dropout(x, keep_prob, noise_shape=None, seed=None, name=None): # pylin keep_prob = ops.convert_to_tensor(keep_prob, dtype=x.dtype, name="keep_prob") - keep_prob.get_shape().assert_is_compatible_with(tensor_shape.scalar()) + keep_prob.get_shape().assert_has_rank(0) # Do nothing if we know keep_prob == 1 if tensor_util.constant_value(keep_prob) == 1: diff --git a/tensorflow/contrib/opt/python/training/lars_optimizer.py b/tensorflow/contrib/opt/python/training/lars_optimizer.py index bc18177b6d0..0c06f4d7f36 100644 --- a/tensorflow/contrib/opt/python/training/lars_optimizer.py +++ b/tensorflow/contrib/opt/python/training/lars_optimizer.py @@ -113,28 +113,30 @@ class LARSOptimizer(optimizer.Optimizer): (g_norm + self._weight_decay * w_norm + self._epsilon)), 1.0), 1.0) scaled_lr = self._learning_rate * trust_ratio - return scaled_lr + # Add the weight regularization gradient + grad = grad + self._weight_decay * var + return scaled_lr, grad def _apply_dense(self, grad, var): - scaled_lr = self.compute_lr(grad, var) + scaled_lr, grad = self.compute_lr(grad, var) mom = 
self.get_slot(var, "momentum") return training_ops.apply_momentum( var, mom, - scaled_lr, - grad, + math_ops.cast(1.0, var.dtype.base_dtype), + grad * scaled_lr, self._momentum, use_locking=False, use_nesterov=self._use_nesterov) def _resource_apply_dense(self, grad, var): - scaled_lr = self.compute_lr(grad, var) + scaled_lr, grad = self.compute_lr(grad, var) mom = self.get_slot(var, "momentum") return training_ops.resource_apply_momentum( var.handle, mom.handle, - scaled_lr, - grad, + math_ops.cast(1.0, var.dtype.base_dtype), + grad * scaled_lr, self._momentum, use_locking=False, use_nesterov=self._use_nesterov) diff --git a/tensorflow/contrib/opt/python/training/lars_optimizer_test.py b/tensorflow/contrib/opt/python/training/lars_optimizer_test.py index b76db763da0..8c135a21bc2 100644 --- a/tensorflow/contrib/opt/python/training/lars_optimizer_test.py +++ b/tensorflow/contrib/opt/python/training/lars_optimizer_test.py @@ -67,9 +67,10 @@ class LARSOptimizerTest(test.TestCase): g_norm = np.linalg.norm(grad_np.flatten(), ord=2) trust_ratio = eeta * w_norm / (g_norm + wd_np * w_norm + ep_np) scaled_lr = lr_np * trust_ratio + grad_np = grad_np + wd_np * var_np - vel_np = m_np * vel_np + grad_np - var_np -= scaled_lr * vel_np + vel_np = m_np * vel_np + scaled_lr * grad_np + var_np -= vel_np self.assertAllClose(var_np, post_var) self.assertAllClose(vel_np, post_vel) @@ -115,9 +116,10 @@ class LARSOptimizerTest(test.TestCase): g_norm = np.linalg.norm(grad_np.flatten(), ord=2) trust_ratio = eeta * w_norm / (g_norm + wd_np * w_norm + ep_np) scaled_lr = lr_np * trust_ratio + grad_np = grad_np + wd_np * var_np - vel_np = m_np * vel_np + grad_np - var_np -= scaled_lr * vel_np + vel_np = m_np * vel_np + scaled_lr * grad_np + var_np -= vel_np self.assertAllClose(var_np, post_var) self.assertAllClose(vel_np, post_vel) diff --git a/tensorflow/contrib/opt/python/training/nadam_optimizer.py b/tensorflow/contrib/opt/python/training/nadam_optimizer.py index 960826407b6..046c6ee83fd 100644 --- a/tensorflow/contrib/opt/python/training/nadam_optimizer.py +++ b/tensorflow/contrib/opt/python/training/nadam_optimizer.py @@ -24,14 +24,37 @@ from tensorflow.python.ops import math_ops from tensorflow.python.ops import state_ops from tensorflow.python.training import adam from tensorflow.python.training import training_ops +from tensorflow.python.util import deprecation class NadamOptimizer(adam.AdamOptimizer): """Optimizer that implements the Nadam algorithm. See [Dozat, T., 2015](http://cs229.stanford.edu/proj2015/054_report.pdf). + + WARNING: due to a known issue this optimizer does not use nesterov momentum + on TPUs or when using XLA in general. This is deprecated; instead prefer + tf.keras.optimizers.Nadam which does the right thing. """ + @deprecation.deprecated( + None, "WARNING: wrong behavior with XLA. 
Use tf.keras.optimizers.Nadam.") + def __init__( + self, + learning_rate=0.001, + beta1=0.9, + beta2=0.999, + epsilon=1e-08, + use_locking=False, + name="Adam"): + super(NadamOptimizer, self).__init__( + learning_rate=learning_rate, + beta1=beta1, + beta2=beta2, + epsilon=epsilon, + use_locking=use_locking, + name=name) + def _apply_dense(self, grad, var): m = self.get_slot(var, "m") v = self.get_slot(var, "v") diff --git a/tensorflow/contrib/opt/python/training/weight_decay_optimizers.py b/tensorflow/contrib/opt/python/training/weight_decay_optimizers.py index e2bcee51130..233503b911e 100644 --- a/tensorflow/contrib/opt/python/training/weight_decay_optimizers.py +++ b/tensorflow/contrib/opt/python/training/weight_decay_optimizers.py @@ -356,10 +356,10 @@ class MomentumWOptimizer(DecoupledWeightDecayExtension, class AdamWOptimizer(DecoupledWeightDecayExtension, adam.AdamOptimizer): """Optimizer that implements the Adam algorithm with weight decay. - This is an implementation of the AdamW optimizer described in "Fixing - Weight Decay Regularization in Adam" by Loshchilov & Hutter + This is an implementation of the AdamW optimizer described in ["Fixing + Weight Decay Regularization in Adam" by Loshchilov & Hutter] (https://arxiv.org/abs/1711.05101) - ([pdf])(https://arxiv.org/pdf/1711.05101.pdf). + ([pdf](https://arxiv.org/pdf/1711.05101.pdf)). It computes the update step of `train.AdamOptimizer` and additionally decays the variable. Note that this is different from adding L2 regularization on diff --git a/tensorflow/contrib/quantize/python/quantize.py b/tensorflow/contrib/quantize/python/quantize.py index f61e28bbc7e..a90647deed0 100644 --- a/tensorflow/contrib/quantize/python/quantize.py +++ b/tensorflow/contrib/quantize/python/quantize.py @@ -39,7 +39,8 @@ _RELU_TYPES = {'Relu', 'Relu6'} _QUANTIZATION_OP = {'FakeQuantWithMinMaxVars'} _VALID_SRC_OP = {'Add', 'AddV2', 'Mul'} _INTERMEDIATE_OP = {'Add', 'AddV2', 'Mul'} -_PASS_THROUGH_OP = {'Reshape', 'Identity', 'BatchToSpaceND', 'SpaceToBatchND'} +_PASS_THROUGH_OP = {'Reshape', 'Identity', 'BatchToSpaceND', 'SpaceToBatchND', + 'MaxPool', 'Max'} _VALID_ACTIVATION_OP = {'Relu', 'Relu6'} diff --git a/tensorflow/contrib/reduce_slice_ops/BUILD b/tensorflow/contrib/reduce_slice_ops/BUILD index c98ae649f3e..aeb2c67317e 100644 --- a/tensorflow/contrib/reduce_slice_ops/BUILD +++ b/tensorflow/contrib/reduce_slice_ops/BUILD @@ -1,7 +1,7 @@ load("//tensorflow:tensorflow.bzl", "tf_cc_test", "tf_custom_op_library", "tf_gen_op_libs", "tf_gen_op_wrapper_py", "tf_kernel_library") load("//tensorflow:tensorflow.bzl", "cuda_py_test") load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library") -load("//tensorflow/core:platform/default/build_config.bzl", "tf_kernel_tests_linkstatic") +load("//tensorflow/core/platform:default/build_config.bzl", "tf_kernel_tests_linkstatic") package( licenses = ["notice"], # Apache 2.0 diff --git a/tensorflow/contrib/rnn/python/ops/lstm_ops.py b/tensorflow/contrib/rnn/python/ops/lstm_ops.py index 4f8186c7394..78ea6374220 100644 --- a/tensorflow/contrib/rnn/python/ops/lstm_ops.py +++ b/tensorflow/contrib/rnn/python/ops/lstm_ops.py @@ -227,9 +227,6 @@ def _block_lstm(seq_len_max, # pylint: enable=invalid-name -_lstm_block_cell_grad_outputs = ["cs_prev_grad", "dicfo"] - - @ops.RegisterGradient("LSTMBlockCell") def _LSTMBlockCellGrad(op, *grad): """Gradient for LSTMBlockCell.""" @@ -247,7 +244,7 @@ def _LSTMBlockCellGrad(op, *grad): if cell_size is None: raise ValueError("cell_size from `cs_prev` should not be None.") - 
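Returning to the LARS optimizer hunks earlier in this diff: the change folds the weight-decay term into the gradient and moves the scaled learning rate into the velocity update, so `apply_momentum` is now called with a learning rate of 1.0. A NumPy sketch of the updated rule, mirroring the revised test (the clamping applied to the trust ratio in `compute_lr` is omitted here):

```python
import numpy as np

def lars_step(var, grad, vel, lr, eeta, wd, eps, momentum):
    """One dense LARS step as exercised by the updated test (a sketch)."""
    w_norm = np.linalg.norm(var.flatten(), ord=2)
    g_norm = np.linalg.norm(grad.flatten(), ord=2)
    trust_ratio = eeta * w_norm / (g_norm + wd * w_norm + eps)
    scaled_lr = lr * trust_ratio
    grad = grad + wd * var                    # weight-decay gradient, added by this change
    vel = momentum * vel + scaled_lr * grad   # scaled_lr now lives in the velocity
    var = var - vel                           # apply_momentum is called with lr == 1.0
    return var, vel

var = np.array([1.0, 2.0])
grad = np.array([0.1, -0.2])
vel = np.zeros(2)
var, vel = lars_step(var, grad, vel, lr=0.1, eeta=0.001, wd=1e-4, eps=1e-5, momentum=0.9)
```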
(cs_prev_grad, dicfo, wci_grad, wcf_grad, + (cs_prev_grad, dgates, wci_grad, wcf_grad, wco_grad) = gen_rnn_ops.lstm_block_cell_grad( x=x, cs_prev=cs_prev, @@ -267,8 +264,8 @@ def _LSTMBlockCellGrad(op, *grad): h_grad=h_grad, use_peephole=op.get_attr("use_peephole")) - # Backprop from dicfo to xh. - xh_grad = math_ops.matmul(dicfo, w, transpose_b=True) + # Backprop from dgates to xh. + xh_grad = math_ops.matmul(dgates, w, transpose_b=True) x_grad = array_ops.slice(xh_grad, (0, 0), (batch_size, input_size)) x_grad.get_shape().merge_with(x.get_shape()) @@ -277,13 +274,13 @@ def _LSTMBlockCellGrad(op, *grad): (batch_size, cell_size)) h_prev_grad.get_shape().merge_with(h_prev.get_shape()) - # Backprop from dicfo to w. + # Backprop from dgates to w. xh = array_ops.concat([x, h_prev], 1) - w_grad = math_ops.matmul(xh, dicfo, transpose_a=True) + w_grad = math_ops.matmul(xh, dgates, transpose_a=True) w_grad.get_shape().merge_with(w.get_shape()) - # Backprop from dicfo to b. - b_grad = nn_ops.bias_add_grad(dicfo) + # Backprop from dgates to b. + b_grad = nn_ops.bias_add_grad(dgates) b_grad.get_shape().merge_with(b.get_shape()) return (x_grad, cs_prev_grad, h_prev_grad, w_grad, wci_grad, wcf_grad, diff --git a/tensorflow/contrib/rnn/python/ops/rnn_cell.py b/tensorflow/contrib/rnn/python/ops/rnn_cell.py index 75710ea4190..c0939c84c44 100644 --- a/tensorflow/contrib/rnn/python/ops/rnn_cell.py +++ b/tensorflow/contrib/rnn/python/ops/rnn_cell.py @@ -1948,7 +1948,9 @@ class PhasedLSTMCell(rnn_cell_impl.RNNCell): in an existing scope. If not `True`, and the existing scope already has the given variables, an error is raised. """ - super(PhasedLSTMCell, self).__init__(_reuse=reuse) + # We pass autocast=False because this layer can accept inputs of different + # dtypes, so we do not want to automatically cast them to the same dtype. + super(PhasedLSTMCell, self).__init__(_reuse=reuse, autocast=False) self._num_units = num_units self._use_peepholes = use_peepholes self._leak = leak diff --git a/tensorflow/contrib/rpc/BUILD b/tensorflow/contrib/rpc/BUILD index a037be78387..f092af17a90 100644 --- a/tensorflow/contrib/rpc/BUILD +++ b/tensorflow/contrib/rpc/BUILD @@ -1,4 +1,4 @@ -load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static") +load("//tensorflow/core/platform:default/build_config_root.bzl", "if_static") package( default_visibility = ["//visibility:public"], diff --git a/tensorflow/contrib/rpc/python/kernel_tests/BUILD b/tensorflow/contrib/rpc/python/kernel_tests/BUILD index 47413aa8692..db197d10cd8 100644 --- a/tensorflow/contrib/rpc/python/kernel_tests/BUILD +++ b/tensorflow/contrib/rpc/python/kernel_tests/BUILD @@ -1,7 +1,7 @@ load("//tensorflow:tensorflow.bzl", "tf_py_test") load("//tensorflow:tensorflow.bzl", "tf_cc_shared_object") -load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static") -load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library") +load("//tensorflow/core/platform:default/build_config_root.bzl", "if_static") +load("//tensorflow/core/platform:default/build_config.bzl", "tf_proto_library") # Placeholder for loading internal BUILD rule. 
package( diff --git a/tensorflow/contrib/seq2seq/BUILD b/tensorflow/contrib/seq2seq/BUILD index 6d8c50177d4..3f9400a6748 100644 --- a/tensorflow/contrib/seq2seq/BUILD +++ b/tensorflow/contrib/seq2seq/BUILD @@ -251,6 +251,7 @@ cuda_py_test( "//tensorflow/python:variable_scope", "//tensorflow/python:variables", ], + xla_enable_strict_auto_jit = False, ) cuda_py_test( diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_v2_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_v2_test.py index 66a464dc218..824c8dad43d 100644 --- a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_v2_test.py +++ b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_v2_test.py @@ -149,7 +149,8 @@ class AttentionMechanismTest(test.TestCase, parameterized.TestCase): x_test = np.random.randint(vocab, size=(self.batch, self.timestep)) y = np.random.randn(self.batch, self.timestep) model = keras.models.Model([inputs, query, state], score) - model.compile("rmsprop", "mse") + # TODO(b/138592586): Run with single-execution-path + model.compile("rmsprop", "mse", experimental_run_tf_function=False) model.fit([x, self.query, self.state], (y, y)) y_ref = model.predict_on_batch([x_test, self.query, self.state]) @@ -159,6 +160,9 @@ class AttentionMechanismTest(test.TestCase, parameterized.TestCase): config, custom_objects={attention_cls.__name__: attention_cls}) loaded_model.set_weights(weights) + # TODO(b/138592586): Run with single-execution-path + loaded_model.compile("rmsprop", "mse", experimental_run_tf_function=False) + y = loaded_model.predict_on_batch([x_test, self.query, self.state]) self.assertAllClose(y_ref, y) @@ -405,11 +409,13 @@ class AttentionWrapperV2Test(test.TestCase, parameterized.TestCase): memory_sequence_length=self.encoder_sequence_length, normalize=True, dtype=dtype) - cell = keras.layers.LSTMCell(self.units, recurrent_activation="sigmoid") - cell = wrapper.AttentionWrapper(cell, attention_mechanism) + cell = keras.layers.LSTMCell(self.units, recurrent_activation="sigmoid", + dtype=dtype) + cell = wrapper.AttentionWrapper(cell, attention_mechanism, dtype=dtype) sampler = sampler_py.TrainingSampler() - my_decoder = basic_decoder.BasicDecoderV2(cell=cell, sampler=sampler) + my_decoder = basic_decoder.BasicDecoderV2(cell=cell, sampler=sampler, + dtype=dtype) final_outputs, final_state, _ = my_decoder( decoder_inputs, @@ -432,11 +438,13 @@ class AttentionWrapperV2Test(test.TestCase, parameterized.TestCase): scale=True, dtype=dtype, ) - cell = keras.layers.LSTMCell(self.units, recurrent_activation="sigmoid") - cell = wrapper.AttentionWrapper(cell, attention_mechanism) + cell = keras.layers.LSTMCell(self.units, recurrent_activation="sigmoid", + dtype=dtype) + cell = wrapper.AttentionWrapper(cell, attention_mechanism, dtype=dtype) sampler = sampler_py.TrainingSampler() - my_decoder = basic_decoder.BasicDecoderV2(cell=cell, sampler=sampler) + my_decoder = basic_decoder.BasicDecoderV2(cell=cell, sampler=sampler, + dtype=dtype) final_outputs, final_state, _ = my_decoder( decoder_inputs, diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py index 6360d1cfdc1..343e5f4be69 100644 --- a/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py +++ b/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py @@ -407,8 +407,8 @@ class TestLargeBeamStep(test.TestCase): log_prob_neg_inf = array_ops.ones( 
[self.batch_size, self.beam_width], dtype=dtypes.float32) * -np.Inf - log_probs = array_ops.where(log_prob_mask, log_prob_zeros, - log_prob_neg_inf) + log_probs = array_ops.where_v2(log_prob_mask, log_prob_zeros, + log_prob_neg_inf) return log_probs log_probs = get_probs() diff --git a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py index a9215e88000..0e19d1e3205 100644 --- a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py +++ b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py @@ -2147,7 +2147,8 @@ class AttentionWrapper(rnn_cell_impl.RNNCell): initial_cell_state=None, name=None, attention_layer=None, - attention_fn=None): + attention_fn=None, + dtype=None): """Construct the `AttentionWrapper`. **NOTE** If you are using the `BeamSearchDecoder` with a cell wrapped in @@ -2224,6 +2225,7 @@ class AttentionWrapper(rnn_cell_impl.RNNCell): (attention_mechanism, cell_output, attention_state, attention_layer) and outputs (attention, alignments, next_attention_state). If provided, the attention_layer_size should be the size of the outputs of attention_fn. + dtype: The cell dtype Raises: TypeError: `attention_layer_size` is not None and (`attention_mechanism` @@ -2232,7 +2234,7 @@ class AttentionWrapper(rnn_cell_impl.RNNCell): is a list, and its length does not match that of `attention_layer_size`; if `attention_layer_size` and `attention_layer` are set simultaneously. """ - super(AttentionWrapper, self).__init__(name=name) + super(AttentionWrapper, self).__init__(name=name, dtype=dtype) rnn_cell_impl.assert_like_rnncell("cell", cell) if isinstance(attention_mechanism, (list, tuple)): self._is_multi = True diff --git a/tensorflow/contrib/session_bundle/BUILD b/tensorflow/contrib/session_bundle/BUILD index 5e4f5f53cd7..737d6866283 100644 --- a/tensorflow/contrib/session_bundle/BUILD +++ b/tensorflow/contrib/session_bundle/BUILD @@ -10,7 +10,7 @@ load( "py_test", "tf_cc_test", ) -load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library") +load("//tensorflow/core/platform:default/build_config.bzl", "tf_proto_library") # Placeholder: load("//tensorflow:tensorflow.bzl", "tf_portable_proto_lib") package( diff --git a/tensorflow/contrib/session_bundle/bundle_shim_test.cc b/tensorflow/contrib/session_bundle/bundle_shim_test.cc index 815beb73a02..121fc2239dd 100644 --- a/tensorflow/contrib/session_bundle/bundle_shim_test.cc +++ b/tensorflow/contrib/session_bundle/bundle_shim_test.cc @@ -47,11 +47,11 @@ void ValidateHalfPlusTwo(const SavedModelBundle& saved_model_bundle, const string& input_tensor_name, const string& output_tensor_name) { // Validate the half plus two behavior. - std::vector serialized_examples; + std::vector serialized_examples; for (float x : {0, 1, 2, 3}) { serialized_examples.push_back(MakeSerializedExample(x)); } - Tensor input = test::AsTensor(serialized_examples, TensorShape({4})); + Tensor input = test::AsTensor(serialized_examples, TensorShape({4})); std::vector outputs; TF_ASSERT_OK(saved_model_bundle.session->Run( diff --git a/tensorflow/contrib/session_bundle/session_bundle.cc b/tensorflow/contrib/session_bundle/session_bundle.cc index a690d9b129a..996e4ce0b80 100644 --- a/tensorflow/contrib/session_bundle/session_bundle.cc +++ b/tensorflow/contrib/session_bundle/session_bundle.cc @@ -72,7 +72,7 @@ Status GetMetaGraphDefFromExport(const StringPiece export_dir, // Creates a string tensor. 
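Several hunks in this diff (beam_search_decoder_test above, and later learning_test, tensor_forest, sampling_ops, sequence_queueing_state_saver) migrate `array_ops.where` to `array_ops.where_v2`, whose documented behavior is to broadcast the two branches the way `np.where` does, which is why a scalar branch such as the `1.0` in learning_test can be passed directly. A NumPy sketch of that semantics (not TensorFlow code):

```python
import numpy as np

cond = np.array([True, False, True])

# where_v2 semantics match np.where: branch values broadcast against cond,
# so a scalar per branch is enough.
print(np.where(cond, 0.0, -np.inf))        # [  0. -inf   0.]

# Broadcasting also works for mixed shapes.
x = np.zeros((3, 2))
print(np.where(cond[:, None], x, -1.0))    # rows 0 and 2 kept, row 1 set to -1
```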
Tensor CreateStringTensor(const string& value) { Tensor tensor(DT_STRING, TensorShape({})); - tensor.scalar()() = value; + tensor.scalar()() = value; return tensor; } diff --git a/tensorflow/contrib/session_bundle/session_bundle_test.cc b/tensorflow/contrib/session_bundle/session_bundle_test.cc index 9e4b1c72195..108806e3328 100644 --- a/tensorflow/contrib/session_bundle/session_bundle_test.cc +++ b/tensorflow/contrib/session_bundle/session_bundle_test.cc @@ -97,11 +97,11 @@ void CheckRegressionSignature(const Signatures& signatures, const string output_name = regression_signature.output().tensor_name(); // Validate the half plus two behavior. - std::vector serialized_examples; + std::vector serialized_examples; for (float x : {0, 1, 2, 3}) { serialized_examples.push_back(MakeSerializedExample(x)); } - Tensor input = test::AsTensor(serialized_examples, TensorShape({4})); + Tensor input = test::AsTensor(serialized_examples, TensorShape({4})); std::vector outputs; TF_ASSERT_OK( bundle.session->Run({{input_name, input}}, {output_name}, {}, &outputs)); @@ -146,13 +146,13 @@ void CheckSessionBundle(const string& export_path, ASSERT_EQ(2, path_outputs.size()); // Validate the two asset file tensors are set by the init_op and include the // base_path and asset directory. - test::ExpectTensorEqual( - test::AsTensor({io::JoinPath(asset_path, "hello1.txt")}, - TensorShape({})), + test::ExpectTensorEqual( + test::AsTensor({io::JoinPath(asset_path, "hello1.txt")}, + TensorShape({})), path_outputs[0]); - test::ExpectTensorEqual( - test::AsTensor({io::JoinPath(asset_path, "hello2.txt")}, - TensorShape({})), + test::ExpectTensorEqual( + test::AsTensor({io::JoinPath(asset_path, "hello2.txt")}, + TensorShape({})), path_outputs[1]); Signatures signatures; diff --git a/tensorflow/contrib/slim/python/slim/data/parallel_reader_test.py b/tensorflow/contrib/slim/python/slim/data/parallel_reader_test.py index c457d44e07b..dec5cbc6d22 100644 --- a/tensorflow/contrib/slim/python/slim/data/parallel_reader_test.py +++ b/tensorflow/contrib/slim/python/slim/data/parallel_reader_test.py @@ -144,14 +144,16 @@ class ParallelReaderTest(test.TestCase): capacity=55, min_after_dequeue=28, dtypes=[dtypes_lib.string, dtypes_lib.string], - shapes=[tensor_shape.scalar(), tensor_shape.scalar()]) + shapes=[tensor_shape.TensorShape([]), + tensor_shape.TensorShape([])]) self._verify_read_up_to_out(shared_queue) def testReadUpToFromFIFOQueue(self): shared_queue = data_flow_ops.FIFOQueue( capacity=99, dtypes=[dtypes_lib.string, dtypes_lib.string], - shapes=[tensor_shape.scalar(), tensor_shape.scalar()]) + shapes=[tensor_shape.TensorShape([]), + tensor_shape.TensorShape([])]) self._verify_read_up_to_out(shared_queue) diff --git a/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py b/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py index 8fca63292e6..381d5941e5a 100644 --- a/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py +++ b/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py @@ -443,11 +443,9 @@ class Image(ItemHandler): """Decodes a raw image.""" return parsing_ops.decode_raw(image_buffer, out_type=self._dtype) - pred_fn_pairs = { - math_ops.logical_or( - math_ops.equal(image_format, 'raw'), - math_ops.equal(image_format, 'RAW')): decode_raw, - } + pred_fn_pairs = [(math_ops.logical_or( + math_ops.equal(image_format, 'raw'), + math_ops.equal(image_format, 'RAW')), decode_raw)] image = control_flow_ops.case( pred_fn_pairs, default=check_jpeg, exclusive=True) diff --git 
a/tensorflow/contrib/slim/python/slim/learning_test.py b/tensorflow/contrib/slim/python/slim/learning_test.py index 5db4fe02b8e..aefc07696b9 100644 --- a/tensorflow/contrib/slim/python/slim/learning_test.py +++ b/tensorflow/contrib/slim/python/slim/learning_test.py @@ -197,7 +197,8 @@ class MultiplyGradientsTest(test.TestCase): gradient = constant_op.constant(self._grad_vec, dtype=dtypes.float32) variable = variables_lib.Variable(array_ops.zeros_like(gradient)) multiplier_flag = variables_lib.Variable(True) - tensor_multiplier = array_ops.where(multiplier_flag, self._multiplier, 1.0) + tensor_multiplier = array_ops.where_v2(multiplier_flag, self._multiplier, + 1.0) grad_to_var = (gradient, variable) gradient_multipliers = {variable: tensor_multiplier} diff --git a/tensorflow/contrib/sparsemax/BUILD b/tensorflow/contrib/sparsemax/BUILD index 69cbb120ef8..7bb73f5a415 100644 --- a/tensorflow/contrib/sparsemax/BUILD +++ b/tensorflow/contrib/sparsemax/BUILD @@ -9,7 +9,7 @@ load( "tf_py_test", ) load( - "//tensorflow/core:platform/default/build_config.bzl", + "//tensorflow/core/platform:default/build_config.bzl", "tf_kernel_tests_linkstatic", ) diff --git a/tensorflow/contrib/tensor_forest/BUILD b/tensorflow/contrib/tensor_forest/BUILD index fdd7e1e1ee3..ca246f912be 100644 --- a/tensorflow/contrib/tensor_forest/BUILD +++ b/tensorflow/contrib/tensor_forest/BUILD @@ -8,7 +8,7 @@ load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py") load("//tensorflow:tensorflow.bzl", "tf_custom_op_library") load("//tensorflow:tensorflow.bzl", "tf_kernel_library") load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library") -load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static") +load("//tensorflow/core/platform:default/build_config_root.bzl", "if_static") package( default_visibility = ["//visibility:public"], diff --git a/tensorflow/contrib/tensor_forest/hybrid/python/hybrid_model.py b/tensorflow/contrib/tensor_forest/hybrid/python/hybrid_model.py index 926e4dda916..a8a5b574691 100644 --- a/tensorflow/contrib/tensor_forest/hybrid/python/hybrid_model.py +++ b/tensorflow/contrib/tensor_forest/hybrid/python/hybrid_model.py @@ -17,8 +17,6 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import collections - from tensorflow.contrib import layers from tensorflow.contrib.framework.python.ops import variables as framework_variables @@ -29,6 +27,7 @@ from tensorflow.python.ops import nn_ops from tensorflow.python.ops import variables from tensorflow.python.training import adagrad +from tensorflow.python.util.compat import collections_abc class HybridModel(object): @@ -66,7 +65,7 @@ class HybridModel(object): # If this is a collection of layers, return the mean of their inference # results. - if isinstance(layer, collections.Iterable): + if isinstance(layer, collections_abc.Iterable): return math_ops.reduce_mean( array_ops.stack([l.inference_graph(data) for l in layer]), 0) # If this is a single layer, return its inference result. 
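The hybrid_model hunk above replaces `collections.Iterable` with `collections_abc.Iterable` from `tensorflow.python.util.compat`. Outside of TensorFlow, the equivalent compatibility shim is the usual try/except import; a minimal sketch:

```python
try:
    # Python 3: the abstract base classes live in collections.abc
    # (the old aliases in the collections module were later removed).
    from collections import abc as collections_abc
except ImportError:
    # Python 2 fallback.
    import collections as collections_abc

print(isinstance([1, 2, 3], collections_abc.Iterable))  # True
print(isinstance(42, collections_abc.Iterable))         # False
```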
diff --git a/tensorflow/contrib/tensor_forest/kernels/model_ops.cc b/tensorflow/contrib/tensor_forest/kernels/model_ops.cc index 94650fe108b..5f997c2fba0 100644 --- a/tensorflow/contrib/tensor_forest/kernels/model_ops.cc +++ b/tensorflow/contrib/tensor_forest/kernels/model_ops.cc @@ -52,7 +52,7 @@ class CreateTreeVariableOp : public OpKernel { auto* result = new DecisionTreeResource(param_proto_); if (!ParseProtoUnlimited(result->mutable_decision_tree(), - tree_config_t->scalar()())) { + tree_config_t->scalar()())) { result->Unref(); OP_REQUIRES(context, false, errors::InvalidArgument("Unable to parse tree config.")); @@ -85,7 +85,7 @@ class TreeSerializeOp : public OpKernel { Tensor* output_config_t = nullptr; OP_REQUIRES_OK( context, context->allocate_output(0, TensorShape(), &output_config_t)); - output_config_t->scalar()() = + output_config_t->scalar()() = decision_tree_resource->decision_tree().SerializeAsString(); } }; @@ -116,7 +116,7 @@ class TreeDeserializeOp : public OpKernel { decision_trees::Model* config = decision_tree_resource->mutable_decision_tree(); OP_REQUIRES(context, - ParseProtoUnlimited(config, tree_config_t->scalar()()), + ParseProtoUnlimited(config, tree_config_t->scalar()()), errors::InvalidArgument("Unable to parse tree config.")); decision_tree_resource->MaybeInitialize(); } @@ -224,7 +224,7 @@ class TreePredictionsV4Op : public OpKernel { : 0); OP_REQUIRES_OK(context, context->allocate_output(1, output_paths_shape, &output_tree_paths)); - auto out_paths = output_tree_paths->unaligned_flat(); + auto out_paths = output_tree_paths->unaligned_flat(); // TODO(gilberth): If this slows down inference too much, consider having // a filter that only serializes paths for the predicted label that we're diff --git a/tensorflow/contrib/tensor_forest/kernels/reinterpret_string_to_float_op.cc b/tensorflow/contrib/tensor_forest/kernels/reinterpret_string_to_float_op.cc index b21a9179777..fcea240dee9 100644 --- a/tensorflow/contrib/tensor_forest/kernels/reinterpret_string_to_float_op.cc +++ b/tensorflow/contrib/tensor_forest/kernels/reinterpret_string_to_float_op.cc @@ -38,7 +38,7 @@ float Convert(const string& in) { void Evaluate(const Tensor& input_data, Tensor output_data, int32 start, int32 end) { auto out_data = output_data.unaligned_flat(); - const auto in_data = input_data.unaligned_flat(); + const auto in_data = input_data.unaligned_flat(); for (int32 i = start; i < end; ++i) { out_data(i) = Convert(in_data(i)); diff --git a/tensorflow/contrib/tensor_forest/kernels/stats_ops.cc b/tensorflow/contrib/tensor_forest/kernels/stats_ops.cc index ede6e1abc9f..e4693cf68dc 100644 --- a/tensorflow/contrib/tensor_forest/kernels/stats_ops.cc +++ b/tensorflow/contrib/tensor_forest/kernels/stats_ops.cc @@ -56,7 +56,7 @@ class CreateFertileStatsVariableOp : public OpKernel { errors::InvalidArgument("Stats config must be a scalar.")); auto* result = new FertileStatsResource(param_proto_); FertileStats stats; - if (!ParseProtoUnlimited(&stats, stats_config_t->scalar()())) { + if (!ParseProtoUnlimited(&stats, stats_config_t->scalar()())) { result->Unref(); OP_REQUIRES(context, false, errors::InvalidArgument("Unable to parse stats config.")); @@ -98,7 +98,7 @@ class FertileStatsSerializeOp : public OpKernel { FertileStats stats; fertile_stats_resource->PackToProto(&stats); - output_config_t->scalar()() = stats.SerializeAsString(); + output_config_t->scalar()() = stats.SerializeAsString(); } private: @@ -128,9 +128,10 @@ class FertileStatsDeserializeOp : public OpKernel { // Deallocate all 
the previous objects on the resource. fertile_stats_resource->Reset(); FertileStats stats; - OP_REQUIRES(context, - ParseProtoUnlimited(&stats, stats_config_t->scalar()()), - errors::InvalidArgument("Unable to parse stats config.")); + OP_REQUIRES( + context, + ParseProtoUnlimited(&stats, stats_config_t->scalar()()), + errors::InvalidArgument("Unable to parse stats config.")); fertile_stats_resource->ExtractFromProto(stats); fertile_stats_resource->MaybeInitialize(); diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/BUILD b/tensorflow/contrib/tensor_forest/kernels/v4/BUILD index d205b255402..71bfa5bbb8c 100644 --- a/tensorflow/contrib/tensor_forest/kernels/v4/BUILD +++ b/tensorflow/contrib/tensor_forest/kernels/v4/BUILD @@ -1,7 +1,7 @@ # TensorFlow code for training random forests. load("//tensorflow:tensorflow.bzl", "tf_cc_test") -load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static") +load("//tensorflow/core/platform:default/build_config_root.bzl", "if_static") package( default_visibility = ["//visibility:public"], diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/candidate_graph_runner.cc b/tensorflow/contrib/tensor_forest/kernels/v4/candidate_graph_runner.cc index f4a7058ddb8..417cb6f7420 100644 --- a/tensorflow/contrib/tensor_forest/kernels/v4/candidate_graph_runner.cc +++ b/tensorflow/contrib/tensor_forest/kernels/v4/candidate_graph_runner.cc @@ -103,7 +103,7 @@ float CandidateGraphRunner::SplitScore() { void CandidateGraphRunner::GetSplit(decision_trees::BinaryNode* node) { std::vector outputs; RunOp(kNoOp, TensorNameValueList(), {kGetSplitName}, &outputs); - ParseProtoUnlimited(node, outputs[0].unaligned_flat()(0)); + ParseProtoUnlimited(node, outputs[0].unaligned_flat()(0)); const auto& oblique = split_.inequality_left_child_test().oblique(); auto* new_split = node->mutable_inequality_left_child_test()->mutable_oblique(); diff --git a/tensorflow/contrib/tensor_forest/proto/BUILD b/tensorflow/contrib/tensor_forest/proto/BUILD index efa696fffe6..702dbed7fc0 100644 --- a/tensorflow/contrib/tensor_forest/proto/BUILD +++ b/tensorflow/contrib/tensor_forest/proto/BUILD @@ -1,4 +1,4 @@ -load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library") +load("//tensorflow/core/platform:default/build_config.bzl", "tf_proto_library") package( default_visibility = ["//visibility:public"], diff --git a/tensorflow/contrib/tensor_forest/python/tensor_forest.py b/tensorflow/contrib/tensor_forest/python/tensor_forest.py index df10997d633..623e52ca0b6 100644 --- a/tensorflow/contrib/tensor_forest/python/tensor_forest.py +++ b/tensorflow/contrib/tensor_forest/python/tensor_forest.py @@ -461,7 +461,7 @@ class RandomForestGraphs(object): mask = math_ops.less( r, array_ops.ones_like(r) * self.params.bagging_fraction) - gather_indices = array_ops.squeeze(array_ops.where(mask), axis=[1]) + gather_indices = array_ops.squeeze(array_ops.where_v2(mask), axis=[1]) # TODO(thomaswc): Calculate out-of-bag data and labels, and store # them for use in calculating statistics later. tree_data = array_ops.gather(processed_dense_features, gather_indices) diff --git a/tensorflow/contrib/tensorboard/BUILD b/tensorflow/contrib/tensorboard/BUILD index e5efe4b16d8..801fe67b069 100644 --- a/tensorflow/contrib/tensorboard/BUILD +++ b/tensorflow/contrib/tensorboard/BUILD @@ -2,7 +2,7 @@ # TensorBoard module containing volatile or experimental code. 
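The tensor_forest hunk above builds bagging indices with `where_v2` followed by `squeeze(..., axis=[1])`. The same index arithmetic in NumPy, with a hypothetical row count and bagging fraction:

```python
import numpy as np

rng = np.random.default_rng(0)
num_rows, bagging_fraction = 10, 0.6

r = rng.uniform(size=num_rows)
mask = r < bagging_fraction
# where on a boolean vector yields [num_selected, 1] indices; dropping the
# trailing axis is what squeeze(..., axis=[1]) does in the hunk.
gather_indices = np.argwhere(mask).squeeze(axis=1)

data = rng.normal(size=(num_rows, 4))
tree_data = data[gather_indices]            # the gather
assert tree_data.shape == (int(mask.sum()), 4)
```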
# For platform specific build config -load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library") +load("//tensorflow/core/platform:default/build_config.bzl", "tf_proto_library") load("//tensorflow:tensorflow.bzl", "py_test") package( diff --git a/tensorflow/contrib/text/kernels/skip_gram_kernels.cc b/tensorflow/contrib/text/kernels/skip_gram_kernels.cc index 3cd0b5f72b5..198388599e8 100644 --- a/tensorflow/contrib/text/kernels/skip_gram_kernels.cc +++ b/tensorflow/contrib/text/kernels/skip_gram_kernels.cc @@ -128,7 +128,7 @@ class SkipGramGenerateCandidatesOp : public OpKernel { .TypeConstraint("T"), \ SkipGramGenerateCandidatesOp) -REGISTER_KERNEL(string); +REGISTER_KERNEL(tstring); REGISTER_KERNEL(int64); REGISTER_KERNEL(int32); REGISTER_KERNEL(int16); diff --git a/tensorflow/contrib/training/BUILD b/tensorflow/contrib/training/BUILD index 94a51abb762..017d08f5f60 100644 --- a/tensorflow/contrib/training/BUILD +++ b/tensorflow/contrib/training/BUILD @@ -1,7 +1,6 @@ # Description: # contains parts of TensorFlow that are experimental or unstable and which are not supported. - -load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library") +load("//tensorflow/core/platform:default/build_config.bzl", "tf_proto_library") load("//tensorflow:tensorflow.bzl", "py_test") package( @@ -174,7 +173,7 @@ py_test( py_test( name = "sampling_ops_test", - size = "small", + size = "medium", srcs = ["python/training/sampling_ops_test.py"], python_version = "PY2", srcs_version = "PY2AND3", diff --git a/tensorflow/contrib/training/python/training/bucket_ops.py b/tensorflow/contrib/training/python/training/bucket_ops.py index 10f3f88f3eb..fddcf1e4f62 100644 --- a/tensorflow/contrib/training/python/training/bucket_ops.py +++ b/tensorflow/contrib/training/python/training/bucket_ops.py @@ -212,7 +212,7 @@ def bucket(tensors, else static_batch_size) bucket_shapes = [ - tensor_shape.vector(maybe_static_batch_size).concatenate(s) + tensor_shape.TensorShape([maybe_static_batch_size]).concatenate(s) for s in bucket_queues[0].shapes ] # top_queue is a PaddingFIFOQueue even if the bucket queues are regular FIFO @@ -222,7 +222,7 @@ def bucket(tensors, top_queue = data_flow_ops.PaddingFIFOQueue( capacity=capacity, dtypes=[dtypes.int32] + types, - shapes=[tensor_shape.scalar()] + bucket_shapes, + shapes=[tensor_shape.TensorShape([])] + bucket_shapes, shared_name=shared_name, name="top_queue") @@ -399,11 +399,11 @@ def bucket_by_sequence_length(input_length, conditions_c = math_ops.logical_and( math_ops.less_equal(buckets_min, input_length), math_ops.less(input_length, buckets_max)) - which_bucket = math_ops.reduce_min(array_ops.where(conditions_c)) + which_bucket = math_ops.reduce_min(array_ops.where_v2(conditions_c)) which_bucket = math_ops.cast(which_bucket, dtypes.int32) if shapes is not None: - shapes = [tensor_shape.scalar()] + shapes + shapes = [tensor_shape.TensorShape([])] + shapes _, dequeued = bucket( tensors=[input_length] + tensor_list, diff --git a/tensorflow/contrib/training/python/training/sampling_ops.py b/tensorflow/contrib/training/python/training/sampling_ops.py index 849b77d6095..257cc4fce21 100644 --- a/tensorflow/contrib/training/python/training/sampling_ops.py +++ b/tensorflow/contrib/training/python/training/sampling_ops.py @@ -417,7 +417,7 @@ def _calculate_acceptance_probabilities(init_probs, target_probs): ratio_l = target_probs / init_probs # Replace NaNs with 0s. 
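Several hunks here (parallel_reader_test, bucket_ops above, and related changes) replace the deprecated `tensor_shape.scalar()` and `tensor_shape.vector(n)` helpers with explicit `TensorShape` constructors; the replacements are value-for-value equivalent. A quick sketch, assuming a TensorFlow installation:

```python
import tensorflow as tf

scalar_shape = tf.TensorShape([])     # what tensor_shape.scalar() used to return
vector_shape = tf.TensorShape([32])   # what tensor_shape.vector(32) used to return

print(scalar_shape)                   # ()
print(vector_shape)                   # (32,)
print(vector_shape.concatenate([5]))  # (32, 5), as used for bucket_shapes above
```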
- ratio_l = array_ops.where( + ratio_l = array_ops.where_v2( math_ops.is_nan(ratio_l), array_ops.zeros_like(ratio_l), ratio_l) # Calculate list of acceptance probabilities. diff --git a/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py b/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py index e44c4f8c0ef..02baf4e071e 100644 --- a/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py +++ b/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py @@ -594,7 +594,7 @@ class NextQueuedSequenceBatch(object): # unless we explicitly tie them to CPU. with ops.colocate_with(self._state_saver._capacity_queue.queue_ref): indices_where_not_done = array_ops.reshape( - array_ops.where( + array_ops.where_v2( math_ops.logical_not(self._state_saver._sequence_is_done)), [-1]) keeping_next_key = array_ops.gather( diff --git a/tensorflow/contrib/util/convert_graphdef_memmapped_format_test.cc b/tensorflow/contrib/util/convert_graphdef_memmapped_format_test.cc index 096ca0f0cf9..1207a338f39 100644 --- a/tensorflow/contrib/util/convert_graphdef_memmapped_format_test.cc +++ b/tensorflow/contrib/util/convert_graphdef_memmapped_format_test.cc @@ -98,10 +98,10 @@ TEST(ConvertGraphdefMemmappedFormatTest, NotSupportedTypesConvert) { constexpr int kTensorHeight = 100; const TensorShape kTestTensorShape({kTensorWidth, kTensorHeight}); Tensor test_tensor1(DT_STRING, kTestTensorShape); - test::FillFn<string>(&test_tensor1, [](int) -> string { return "ABC"; }); + test::FillFn<tstring>(&test_tensor1, [](int) -> string { return "ABC"; }); Tensor test_tensor2(DT_STRING, kTestTensorShape); - test::FillFn<string>(&test_tensor2, [](int) -> string { return "XYZ"; }); + test::FillFn<tstring>(&test_tensor2, [](int) -> string { return "XYZ"; }); auto root = Scope::NewRootScope().ExitOnError(); Output m = ops::Add(root, test_tensor1, test_tensor2); const string result_name = m.node()->name(); diff --git a/tensorflow/contrib/verbs/BUILD b/tensorflow/contrib/verbs/BUILD index fac783b7d5f..b0035269d40 100644 --- a/tensorflow/contrib/verbs/BUILD +++ b/tensorflow/contrib/verbs/BUILD @@ -5,7 +5,7 @@ load("//tensorflow:tensorflow.bzl", "tf_cuda_library") # For platform specific build config load( - "//tensorflow/core:platform/default/build_config.bzl", + "//tensorflow/core/platform:default/build_config.bzl", "tf_proto_library_cc", ) diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index ca158b3486b..aa607fa8257 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -7,7 +7,7 @@ # ":protos_all_cc" - exports all core TensorFlow protos # ":protos_all_py" - py_proto_library version (Google-internal) # ":lib" - exports the public non-test headers for: -# platform/: Platform-specific code and external dependencies +# //third_party/tensorflow/core/platform:: Platform-specific code and external dependencies # lib/: Low-level libraries that are not TensorFlow-specific # ":test" - test equivalent of ":lib".
# This is currently public, but may be made internal in the @@ -104,7 +104,7 @@ load("//tensorflow:tensorflow.bzl", "tf_version_info_genrule") # For platform specific build config load( - ":platform/default/build_config.bzl", + "//tensorflow/core/platform:default/build_config.bzl", "tf_additional_all_protos", "tf_additional_cloud_kernel_deps", "tf_additional_cloud_op_deps", @@ -112,36 +112,25 @@ load( "tf_additional_cupti_wrapper_deps", "tf_additional_device_tracer_cuda_deps", "tf_additional_device_tracer_deps", - "tf_additional_device_tracer_srcs", "tf_additional_device_tracer_test_flags", "tf_additional_gdr_lib_defines", "tf_additional_human_readable_json_deps", "tf_additional_lib_defines", "tf_additional_lib_deps", - "tf_additional_lib_hdrs", - "tf_additional_lib_srcs", "tf_additional_libdevice_data", "tf_additional_libdevice_deps", - "tf_additional_libdevice_srcs", "tf_additional_minimal_lib_srcs", "tf_additional_monitoring_hdrs", - "tf_additional_monitoring_srcs", - "tf_additional_mpi_lib_defines", "tf_additional_numa_copts", "tf_additional_numa_deps", "tf_additional_numa_lib_defines", - "tf_additional_proto_hdrs", - "tf_additional_proto_srcs", "tf_additional_test_deps", - "tf_additional_test_srcs", "tf_additional_verbs_lib_defines", "tf_grpc_service_all", "tf_jspb_proto_library", "tf_kernel_tests_linkstatic", "tf_lib_proto_compiler_deps", "tf_lib_proto_parsing_deps", - "tf_platform_hdrs", - "tf_platform_srcs", "tf_proto_library", "tf_proto_library_cc", "tf_protos_all", @@ -151,10 +140,11 @@ load( "tf_pyclif_proto_library", ) load( - ":platform/default/build_config_root.bzl", + "//tensorflow/core/platform:default/build_config_root.bzl", "if_dynamic_kernels", "if_static", "tf_cuda_tests_tags", + "tf_gpu_tests_tags", ) load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda") load("@local_config_tensorrt//:build_defs.bzl", "if_tensorrt") @@ -179,6 +169,7 @@ package_group( name = "dependency_whitelist", packages = [ "//learning/freud/topic_models/tensorflow/...", + "//perftools/accelerators/xprof/api/...", "//quality/webanswers/brain/tokenization/custom_tf_ops/kernels/...", ], ) @@ -242,7 +233,7 @@ COMMON_PROTO_SRCS = [ ] ERROR_CODES_PROTO_SRCS = [ - "lib/core/error_codes.proto", + "//tensorflow/core/lib/core:error_codes.proto", ] # LINT.ThenChange(//tensorflow/core/android_proto_config.asciipb) @@ -277,11 +268,27 @@ tf_proto_library( make_default_target_header_only = True, protodeps = [ ":protos_all_proto", - ":error_codes_proto", + "//tensorflow/core/lib/core:error_codes_proto", ], visibility = ["//visibility:public"], ) +tf_generate_proto_text_sources( + name = "attr_value_proto_text", + srcs = [ + "framework/attr_value.proto", + "framework/resource_handle.proto", + "framework/tensor.proto", + "framework/tensor_shape.proto", + "framework/types.proto", + ], + srcs_relative_dir = "tensorflow/core/", + deps = [ + ":lib_internal", + ":protos_all_proto_cc", + ], +) + tf_jspb_proto_library( name = "protos_all_jspb_proto", visibility = ["//visibility:public"], @@ -321,45 +328,34 @@ tf_proto_library( visibility = ["//visibility:public"], ) -# Minimal lib to detect platform -cc_library( - name = "lib_platform", - hdrs = [ - "platform/platform.h", - ], -) - filegroup( name = "platform_base_hdrs", srcs = [ - "platform/byte_order.h", - "platform/cord.h", - "platform/env_time.h", - "platform/logging.h", - "platform/macros.h", - "platform/platform_strings.h", - "platform/types.h", + "//tensorflow/core/platform:byte_order.h", + "//tensorflow/core/platform:cord.h", + 
"//tensorflow/core/platform:env_time.h", + "//tensorflow/core/platform:logging.h", + "//tensorflow/core/platform:macros.h", + "//tensorflow/core/platform:platform_strings.h", + "//tensorflow/core/platform:tstring.h", + "//tensorflow/core/platform:types.h", ], visibility = ["//visibility:private"], ) cc_library( name = "platform_base", - srcs = tf_platform_hdrs([ - "integral_types.h", - "logging.h", - ]) + tf_platform_srcs([ - "logging.cc", - "env_time.cc", - ]) + [ - "platform/env_time.cc", - ], hdrs = [":platform_base_hdrs"], copts = tf_copts(), tags = ["avoid_dep"], visibility = [":__subpackages__"], deps = [ - ":lib_platform", + "//tensorflow/core/platform", + "//tensorflow/core/platform:byte_order", + "//tensorflow/core/platform:env_time", + "//tensorflow/core/platform:logging", + "//tensorflow/core/platform:macros", + "//tensorflow/core/platform:types", "//tensorflow/core/platform/default/build_config:base", "@com_google_absl//absl/base", "@com_google_absl//absl/strings", @@ -379,13 +375,13 @@ cc_library( filegroup( name = "platform_port_hdrs", srcs = [ - "platform/cpu_info.h", - "platform/dynamic_annotations.h", - "platform/init_main.h", - "platform/mem.h", - "platform/mutex.h", - "platform/numa.h", - "platform/thread_annotations.h", + "//tensorflow/core/platform:cpu_info.h", + "//tensorflow/core/platform:dynamic_annotations.h", + "//tensorflow/core/platform:init_main.h", + "//tensorflow/core/platform:mem.h", + "//tensorflow/core/platform:mutex.h", + "//tensorflow/core/platform:numa.h", + "//tensorflow/core/platform:thread_annotations.h", ], visibility = ["//visibility:private"], ) @@ -394,24 +390,18 @@ filegroup( filegroup( name = "platform_port_internal_hdrs", srcs = [ - "platform/demangle.h", - "platform/host_info.h", - "platform/snappy.h", + "//tensorflow/core/platform:demangle.h", + "//tensorflow/core/platform:host_info.h", + "//tensorflow/core/platform:snappy.h", ], visibility = ["//visibility:private"], ) cc_library( name = "platform_port", - srcs = tf_platform_hdrs([ - "cpu_info.h", - "dynamic_annotations.h", - "thread_annotations.h", - "mutex.h", - ]) + tf_platform_srcs([ - "port.cc", - ]) + [ - "platform/cpu_info.cc", + srcs = [ + "//tensorflow/core/platform:cpu_info.cc", + "//tensorflow/core/platform:legacy_platform_port_srcs", ], hdrs = [ ":platform_port_hdrs", @@ -420,7 +410,7 @@ cc_library( copts = tf_copts() + tf_additional_numa_copts(), visibility = [":__subpackages__"], deps = [ - ":lib_platform", + "//tensorflow/core/platform:platform", ":platform_base", "@com_google_absl//absl/base", "//tensorflow/core/platform/default/build_config:port", @@ -431,7 +421,7 @@ cc_library( filegroup( name = "platform_protobuf_hdrs", srcs = [ - "platform/protobuf.h", + "//tensorflow/core/platform:protobuf.h", ], visibility = ["//visibility:private"], ) @@ -440,19 +430,18 @@ filegroup( filegroup( name = "platform_protobuf_internal_hdrs", srcs = [ - "platform/protobuf_internal.h", + "//tensorflow/core/platform:protobuf_internal.h", ], visibility = ["//visibility:private"], ) cc_library( name = "platform_protobuf", - srcs = tf_platform_hdrs([ - "protobuf.h", - ]) + [ - "platform/protobuf.cc", - "platform/protobuf_util.cc", - "lib/core/status.h", + srcs = [ + "//tensorflow/core/lib/core:legacy_lib_core_status_header", + "//tensorflow/core/platform:protobuf.cc", + "//tensorflow/core/platform:protobuf.h", + "//tensorflow/core/platform:protobuf_util.cc", ], hdrs = [ ":platform_protobuf_hdrs", @@ -461,9 +450,9 @@ cc_library( copts = tf_copts(), visibility = [":__subpackages__"], deps = [ - 
":lib_platform", ":platform_base", ":platform_port", + "//tensorflow/core/platform", "//tensorflow/core/platform/default/build_config:protobuf", "@com_google_protobuf//:protobuf", ], @@ -473,7 +462,7 @@ cc_library( name = "grpc_services", srcs = [], hdrs = [ - "platform/grpc_services.h", + "//tensorflow/core/platform:grpc_services.h", ], copts = tf_copts(), visibility = ["//visibility:public"], @@ -482,8 +471,8 @@ cc_library( cc_library( name = "human_readable_json", - srcs = tf_platform_srcs(["human_readable_json.cc"]), - hdrs = ["platform/human_readable_json.h"], + srcs = ["//tensorflow/core/platform:legacy_human_readable_json_src"], + hdrs = ["//tensorflow/core/platform:human_readable_json.h"], copts = tf_copts(), visibility = ["//visibility:public"], deps = [ @@ -494,8 +483,8 @@ cc_library( cc_library( name = "logger", - srcs = ["platform/logger.cc"], - hdrs = ["platform/logger.h"], + srcs = ["//tensorflow/core/platform:logger.cc"], + hdrs = ["//tensorflow/core/platform:logger.h"], copts = tf_copts(), visibility = ["//visibility:public"], deps = [ @@ -509,9 +498,9 @@ cc_library( filegroup( name = "platform_env_hdrs", srcs = [ - "platform/env.h", - "platform/file_statistics.h", - "platform/file_system.h", + "//tensorflow/core/platform:env.h", + "//tensorflow/core/platform:file_statistics.h", + "//tensorflow/core/platform:file_system.h", ], visibility = ["//visibility:private"], ) @@ -520,21 +509,17 @@ filegroup( filegroup( name = "platform_env_internal_hdrs", srcs = [ - "platform/load_library.h", + "//tensorflow/core/platform:load_library.h", ], visibility = ["//visibility:private"], ) cc_library( name = "platform_env", - srcs = tf_platform_srcs([ - "env.cc", - "load_library.cc", - ]) + tf_platform_hdrs([ - "wide_char.h", - ]) + [ - "platform/env.cc", - "platform/file_system.cc", + srcs = [ + "//tensorflow/core/platform:env.cc", + "//tensorflow/core/platform:file_system.cc", + "//tensorflow/core/platform:legacy_platform_env_srcs", ], hdrs = [ ":platform_env_hdrs", @@ -546,13 +531,13 @@ cc_library( "//tensorflow/c:__subpackages__", ], deps = [ - ":error_codes_proto_cc", ":lib", ":lib_internal", - ":lib_platform", ":platform_base", ":platform_port", ":platform_protobuf", + "//tensorflow/core/lib/core:error_codes_proto_cc", + "//tensorflow/core/platform", "//tensorflow/core/platform/default/build_config:env", "//tensorflow/core/platform/default/build_config:port", ], @@ -561,19 +546,17 @@ cc_library( filegroup( name = "platform_file_system_hdrs", srcs = [ - "platform/file_system_helper.h", - "platform/null_file_system.h", + "//tensorflow/core/platform:file_system_helper.h", + "//tensorflow/core/platform:null_file_system.h", ], visibility = ["//visibility:private"], ) cc_library( name = "platform_file_system", - srcs = tf_platform_srcs([ - ]) + tf_platform_hdrs([ - "windows_file_system.h", - ]) + [ - "platform/file_system_helper.cc", + srcs = [ + "//tensorflow/core/platform:file_system_helper.cc", + "//tensorflow/core/platform:legacy_file_system_hdrs", ], hdrs = [ ":platform_file_system_hdrs", @@ -582,83 +565,71 @@ cc_library( visibility = [":__subpackages__"], deps = [ ":lib", - ":lib_platform", ":platform_env", + "//tensorflow/core/platform", ], ) -cc_library( - name = "platform_strings", - srcs = tf_platform_srcs([ - "platform/platform_strings.cc", - "platform/platform_strings_computed.h", - ]), - hdrs = [ - "platform/platform_strings.h", - ], - visibility = [":__subpackages__"], - deps = [":lib"], -) - filegroup( name = "platform_other_hdrs", srcs = [ - "platform/abi.h", - 
"platform/context.h", - "platform/cpu_feature_guard.h", - "platform/error.h", - "platform/fingerprint.h", - "platform/monitoring.h", - "platform/net.h", - "platform/notification.h", - "platform/prefetch.h", - "platform/profile_utils/android_armv7a_cpu_utils_helper.h", - "platform/profile_utils/clock_cycle_profiler.h", - "platform/profile_utils/cpu_utils.h", - "platform/profile_utils/i_cpu_utils_helper.h", - "platform/stacktrace.h", - "platform/stacktrace_handler.h", - "platform/strong_hash.h", - "platform/subprocess.h", + "//tensorflow/core/platform:abi.h", + "//tensorflow/core/platform:context.h", + "//tensorflow/core/platform:cpu_feature_guard.h", + "//tensorflow/core/platform:error.h", + "//tensorflow/core/platform:fingerprint.h", + "//tensorflow/core/platform:monitoring.h", + "//tensorflow/core/platform:net.h", + "//tensorflow/core/platform:notification.h", + "//tensorflow/core/platform:prefetch.h", + "//tensorflow/core/platform:profile_utils/android_armv7a_cpu_utils_helper.h", + "//tensorflow/core/platform:profile_utils/clock_cycle_profiler.h", + "//tensorflow/core/platform:profile_utils/cpu_utils.h", + "//tensorflow/core/platform:profile_utils/i_cpu_utils_helper.h", + "//tensorflow/core/platform:stacktrace.h", + "//tensorflow/core/platform:stacktrace_handler.h", + "//tensorflow/core/platform:strong_hash.h", + "//tensorflow/core/platform:subprocess.h", ] + tf_additional_monitoring_hdrs(), visibility = ["//visibility:private"], ) +tf_cc_test( + name = "platform_unbounded_work_queue_test", + srcs = ["//tensorflow/core/platform:unbounded_work_queue_test.cc"], + deps = [ + ":framework", + ":lib", + ":lib_internal", + ":lib_test_internal", + ":test", + ":test_main", + "@com_google_absl//absl/memory", + ], +) + # Headers that are not exported as part of ":lib". 
filegroup( name = "platform_other_internal_hdrs", srcs = [ - "platform/denormal.h", - "platform/setround.h", - "platform/tracing.h", + "//tensorflow/core/platform:denormal.h", + "//tensorflow/core/platform:setround.h", + "//tensorflow/core/platform:tracing.h", ], visibility = ["//visibility:private"], ) cc_library( name = "platform_other", - srcs = tf_platform_srcs([ - "subprocess.cc", - "net.cc", - "tracing.cc", - ]) + tf_platform_hdrs([ - "tracing.h", - "error.h", - "context.h", - "fingerprint.h", - "notification.h", - "stacktrace.h", - "strong_hash.h", - "subprocess.h", - "tracing_impl.h", - ]) + [ - "platform/cpu_feature_guard.cc", - "platform/setround.cc", - "platform/tracing.cc", - "platform/denormal.cc", - "platform/profile_utils/android_armv7a_cpu_utils_helper.cc", - "platform/profile_utils/clock_cycle_profiler.cc", - "platform/profile_utils/cpu_utils.cc", + srcs = [ + "//tensorflow/core/platform:cpu_feature_guard.cc", + "//tensorflow/core/platform:denormal.cc", + "//tensorflow/core/platform:legacy_platform_other_srcs", + "//tensorflow/core/platform:profile_utils/android_armv7a_cpu_utils_helper.cc", + "//tensorflow/core/platform:profile_utils/clock_cycle_profiler.cc", + "//tensorflow/core/platform:profile_utils/cpu_utils.cc", + "//tensorflow/core/platform:setround.cc", + "//tensorflow/core/platform:tracing.cc", ], hdrs = [ ":platform_other_hdrs", @@ -668,11 +639,14 @@ cc_library( visibility = [":__subpackages__"], deps = [ ":lib", - ":lib_platform", ":platform_base", ":platform_env", ":platform_port", ":platform_protobuf", + "//tensorflow/core/platform", + "//tensorflow/core/platform:abi", + "//tensorflow/core/platform:annotation", + "//tensorflow/core/platform:stacktrace", "//tensorflow/core/platform/default/build_config:other", "//tensorflow/core/platform/default/build_config:platformlib", "//tensorflow/core/platform/default/build_config:port", @@ -684,34 +658,42 @@ cc_library( # don't have to depend on lib/platformlib. 
cc_library( name = "lib_proto_parsing", - srcs = glob(tf_additional_proto_srcs()), + srcs = [ + "//tensorflow/core/platform:protobuf.cc", + ], hdrs = [ - "lib/core/errors.h", - "lib/core/status.h", - "lib/core/stringpiece.h", - "lib/strings/numbers.h", - "lib/strings/strcat.h", - "platform/init_main.h", - "platform/logging.h", - "platform/macros.h", - "platform/platform.h", - "platform/protobuf.h", - "platform/types.h", - "platform/windows/cpu_info.h", - "lib/bfloat16/bfloat16.h", - ] + tf_additional_proto_hdrs(), + "//tensorflow/core/lib/bfloat16:bfloat16.h", + "//tensorflow/core/lib/core:legacy_lib_proto_parsing_headers", + "//tensorflow/core/lib/strings:legacy_lib_proto_parsing_headers", + "//tensorflow/core/platform:init_main.h", + "//tensorflow/core/platform:legacy_proto_hdrs", + "//tensorflow/core/platform:logging.h", + "//tensorflow/core/platform:macros.h", + "//tensorflow/core/platform:platform.h", + "//tensorflow/core/platform:protobuf.h", + "//tensorflow/core/platform:stringpiece.h", + "//tensorflow/core/platform:tstring.h", + "//tensorflow/core/platform:types.h", + ], copts = tf_copts(), deps = tf_lib_proto_parsing_deps() + [ ":platform_base", "@com_google_absl//absl/strings", "@double_conversion//:double-conversion", + "//tensorflow/core/lib/bfloat16", + "//tensorflow/core/platform:cpu_info", + "//tensorflow/core/platform:logging", + "//tensorflow/core/platform:macros", + "//tensorflow/core/platform:platform", + "//tensorflow/core/platform:stringpiece", + "//tensorflow/core/platform:types", ], ) cc_library( name = "lib_proto_compiler", hdrs = [ - "platform/protobuf_compiler.h", + "//tensorflow/core/platform:protobuf_compiler.h", ], copts = tf_copts(), deps = tf_lib_proto_compiler_deps() + [ @@ -726,26 +708,6 @@ cc_library( cc_library( name = "lib", hdrs = [ - "lib/bfloat16/bfloat16.h", - "lib/core/arena.h", - "lib/core/bitmap.h", - "lib/core/bits.h", - "lib/core/coding.h", - "lib/core/errors.h", - "lib/core/notification.h", - "lib/core/raw_coding.h", - "lib/core/status.h", - "lib/core/stringpiece.h", - "lib/core/threadpool.h", - "lib/core/threadpool_interface.h", - "lib/gtl/array_slice.h", - "lib/gtl/cleanup.h", - "lib/gtl/compactptrset.h", - "lib/gtl/flatmap.h", - "lib/gtl/flatset.h", - "lib/gtl/inlined_vector.h", - "lib/gtl/optional.h", - "lib/gtl/priority_queue_util.h", "lib/hash/crc32c.h", "lib/hash/hash.h", "lib/histogram/histogram.h", @@ -760,32 +722,31 @@ cc_library( "lib/io/table.h", "lib/io/table_builder.h", "lib/io/table_options.h", - "lib/math/math_util.h", "lib/monitoring/collected_metrics.h", "lib/monitoring/collection_registry.h", "lib/monitoring/counter.h", "lib/monitoring/gauge.h", "lib/monitoring/metric_def.h", "lib/monitoring/sampler.h", - "lib/random/distribution_sampler.h", - "lib/random/philox_random.h", - "lib/random/random_distributions.h", - "lib/random/simple_philox.h", - "lib/strings/numbers.h", - "lib/strings/proto_serialization.h", - "lib/strings/str_util.h", - "lib/strings/strcat.h", - "lib/strings/stringprintf.h", ":platform_base_hdrs", ":platform_env_hdrs", ":platform_file_system_hdrs", ":platform_other_hdrs", ":platform_port_hdrs", ":platform_protobuf_hdrs", + "//tensorflow/core/lib/bfloat16:bfloat16.h", + "//tensorflow/core/lib/core:legacy_lib_core_headers", + "//tensorflow/core/lib/gtl:legacy_lib_gtl_headers", + "//tensorflow/core/lib/math:math_util.h", + "//tensorflow/core/lib/random:legacy_lib_random_headers", + "//tensorflow/core/lib/strings:legacy_lib_string_headers", ], visibility = ["//visibility:public"], deps = [ ":lib_internal", + 
"//tensorflow/core/platform:str_util", + "//tensorflow/core/platform:stringpiece", + "//tensorflow/core/platform:stringprintf", "@com_google_absl//absl/container:inlined_vector", "@com_google_absl//absl/strings", "@com_google_absl//absl/types:optional", @@ -797,7 +758,7 @@ cc_library( cc_library( name = "lib_experimental", hdrs = [ - "lib/core/threadpool_options.h", + "//tensorflow/core/lib/core:legacy_lib_core_threadpool_options_header", ], visibility = [ ":experimental_access", @@ -820,45 +781,13 @@ cc_library( ], ) -cc_library( - name = "abi", - srcs = ["platform/abi.cc"], - hdrs = ["platform/abi.h"], - deps = [":platform_base"], -) - -cc_library( - name = "stacktrace", - srcs = glob(["platform/*/stacktrace.h"]), - hdrs = ["platform/stacktrace.h"], - deps = [ - ":abi", - ":lib_platform", - "//tensorflow/core/platform/default/build_config:stacktrace", - ], -) - -cc_library( - name = "stacktrace_handler", - srcs = ["platform/stacktrace_handler.cc"], - hdrs = ["platform/stacktrace_handler.h"], - deps = [ - ":abi", - ":lib_platform", - ":stacktrace", - ], -) - -# Libraries that will eventually be moved into lib/core -# Note that stringpiece_test can't be place here yet, because we are -# required to use tf_cc_test, and that rule will change / into _ +# DEPRECATED: use platform:stringpiece instead. cc_library( name = "core_stringpiece", - hdrs = ["lib/core/stringpiece.h"], + hdrs = ["//tensorflow/core/lib/core:legacy_lib_core_stringpiece_header"], copts = tf_copts(), deps = [ - ":platform_base", - "@com_google_absl//absl/strings", + "//tensorflow/core/platform:stringpiece", ], ) @@ -869,14 +798,15 @@ cc_library( name = "test", testonly = 1, srcs = [ - "platform/test.cc", "util/reporter.cc", - ] + tf_additional_test_srcs(), + "//tensorflow/core/platform:legacy_test_srcs", + "//tensorflow/core/platform:test.cc", + ], hdrs = [ - "lib/core/status_test_util.h", - "platform/test.h", - "platform/test_benchmark.h", "util/reporter.h", + "//tensorflow/core/lib/core:legacy_lib_core_status_test_util_header", + "//tensorflow/core/platform:test.h", + "//tensorflow/core/platform:test_benchmark.h", ], copts = tf_copts(), linkopts = select({ @@ -902,16 +832,16 @@ cc_library( name = "test_lite", testonly = 1, srcs = [ - "platform/test.cc", + "//tensorflow/core/platform:test.cc", ], hdrs = [ - "platform/test.h", - "platform/test_benchmark.h", + "//tensorflow/core/platform:test.h", + "//tensorflow/core/platform:test_benchmark.h", ], copts = tf_copts(), deps = [ - ":lib_platform", ":platform_base", + "//tensorflow/core/platform", "//tensorflow/core/platform/default/build_config:gtest", ], ) @@ -1162,35 +1092,39 @@ cc_library( cc_library( name = "framework_lite", - srcs = tf_additional_minimal_lib_srcs(), + srcs = [ + "//tensorflow/core/platform:legacy_minimal_lib_srcs", + ], hdrs = [ "framework/numeric_types.h", "framework/tensor_types.h", "framework/type_traits.h", - "lib/bfloat16/bfloat16.h", - "platform/byte_order.h", - "platform/default/dynamic_annotations.h", - "platform/default/integral_types.h", - "platform/default/logging.h", - "platform/default/mutex.h", - "platform/default/thread_annotations.h", - "platform/dynamic_annotations.h", - "platform/macros.h", - "platform/mutex.h", - "platform/platform.h", - "platform/prefetch.h", - "platform/protobuf.h", - "platform/thread_annotations.h", - "platform/types.h", - "platform/cpu_info.h", - ] + if_windows(["platform/windows/integral_types.h"]), + "//tensorflow/core/lib/bfloat16:bfloat16.h", + "//tensorflow/core/platform:byte_order.h", + 
"//tensorflow/core/platform:default/dynamic_annotations.h", + "//tensorflow/core/platform:default/integral_types.h", + "//tensorflow/core/platform:default/logging.h", + "//tensorflow/core/platform:default/mutex.h", + "//tensorflow/core/platform:dynamic_annotations.h", + "//tensorflow/core/platform:macros.h", + "//tensorflow/core/platform:mutex.h", + "//tensorflow/core/platform:platform.h", + "//tensorflow/core/platform:prefetch.h", + "//tensorflow/core/platform:protobuf.h", + "//tensorflow/core/platform:thread_annotations.h", + "//tensorflow/core/platform:tstring.h", + "//tensorflow/core/platform:types.h", + "//tensorflow/core/platform:cpu_info.h", + ] + if_windows(["//tensorflow/core/platform:windows/integral_types.h"]), visibility = ["//visibility:public"], deps = [ "@nsync//:nsync_cpp", ] + [ "//third_party/eigen3", + "//tensorflow/core/lib/bfloat16", "//tensorflow/core/platform/default/build_config:minimal", + "//tensorflow/core/platform:types", ], ) @@ -1384,6 +1318,36 @@ tf_gen_op_libs( "ragged_conversion_ops", "ragged_math_ops", ], + deps = [":ragged_to_dense_util"], +) + +cc_library( + name = "ragged_to_dense_util", + srcs = [ + "ops/ragged_to_dense_util.cc", + ], + hdrs = [ + "ops/ragged_to_dense_util.h", + ], + deps = [ + "//tensorflow/core:framework", + "//tensorflow/core:protos_all_cc", + ], +) + +tf_cc_test( + name = "ragged_to_dense_util_test", + srcs = [ + "ops/ragged_to_dense_util_test.cc", + ], + deps = [ + ":ragged_to_dense_util", + ":test", + ":testlib", + "//tensorflow/core:framework", + "//tensorflow/core:protos_all_cc", + "@com_google_googletest//:gtest_main", + ], ) cc_library( @@ -1805,11 +1769,21 @@ filegroup( filegroup( name = "mobile_srcs_no_runtime", srcs = [ - ":protos_all_proto_text_srcs", - ":error_codes_proto_text_srcs", + ":attr_value_proto_text_srcs", "//tensorflow/core/platform/default/build_config:android_srcs", "//tensorflow/core/util/ctc:android_srcs", + "//tensorflow/core/platform:legacy_srcs_no_runtime", "//tensorflow/core/profiler:mobile_srcs", + "//tensorflow/core/lib/bfloat16:bfloat16.h", + "//tensorflow/core/lib/bfloat16:bfloat16.cc", + "//tensorflow/core/lib/core:legacy_lib_core_all_headers", + "//tensorflow/core/lib/core:legacy_lib_core_all_srcs", + "//tensorflow/core/lib/gtl:legacy_lib_gtl_all_headers", + "//tensorflow/core/lib/random:legacy_lib_random_all_headers", + "//tensorflow/core/lib/random:legacy_lib_random_all_srcs", + "//tensorflow/core/lib/strings:legacy_lib_strings_all_headers", + "//tensorflow/core/lib/strings:legacy_lib_strings_all_srcs", + "//tensorflow/core/lib/math:math_util.h", ] + glob( [ "client/**/*.cc", @@ -1817,8 +1791,6 @@ filegroup( "framework/**/*.cc", "lib/**/*.h", "lib/**/*.cc", - "platform/**/*.h", - "platform/**/*.cc", "public/**/*.h", "util/**/*.h", "util/**/*.cc", @@ -1839,22 +1811,6 @@ filegroup( "util/events_writer.*", "util/stats_calculator.*", "util/reporter.*", - "platform/**/cuda_libdevice_path.*", - "platform/**/logger.cc", - # Exclude env_time and logging to avoid collisions with - # :platform_base, a common dependency for downstream targets. 
- "platform/**/env_time.cc", - "platform/**/logging.cc", - "platform/default/test_benchmark.*", - "platform/cuda.h", - "platform/rocm.h", - "platform/google/**/*", - "platform/hadoop/**/*", - "platform/gif.h", - "platform/jpeg.h", - "platform/png.h", - "platform/stream_executor.*", - "platform/windows/**/*", "user_ops/**/*.cu.cc", "util/ctc/*.h", "util/ctc/*.cc", @@ -2109,7 +2065,7 @@ filegroup( filegroup( name = "android_test_srcs", # TODO(andrewharp/nhua): - # make more test-related sources portable e.g. "platform/test.cc", + # make more test-related sources portable e.g. "//tensorflow/core/platform:test.cc", srcs = [ ":framework/fake_input.cc", ":framework/fake_input.h", @@ -2117,10 +2073,10 @@ filegroup( ":framework/shape_inference_testutil.h", ":framework/tensor_testutil.cc", ":framework/tensor_testutil.h", - ":platform/test.cc", - ":platform/test.h", ":util/reporter.cc", ":util/reporter.h", + "//tensorflow/core/platform:test.cc", + "//tensorflow/core/platform:test.h", ], visibility = ["//visibility:public"], ) @@ -2133,9 +2089,9 @@ filegroup( ":framework/shape_inference_testutil.h", ":framework/tensor_testutil.cc", ":framework/tensor_testutil.h", - ":platform/test.h", ":util/reporter.cc", ":util/reporter.h", + "//tensorflow/core/platform:test.h", ], visibility = ["//visibility:public"], ) @@ -2298,6 +2254,13 @@ tf_pyclif_proto_library( visibility = ["//visibility:public"], ) +tf_pyclif_proto_library( + name = "protobuf/graph_debug_info_pyclif", + proto_lib = ":protos_all_cc", + proto_srcfile = "protobuf/graph_debug_info.proto", + visibility = ["//visibility:public"], +) + tf_pyclif_proto_library( name = "protobuf/meta_graph_pyclif", proto_lib = ":protos_all_cc", @@ -2400,36 +2363,32 @@ tf_proto_library_cc( ], ) -LIB_INTERNAL_PRIVATE_HEADERS = ["framework/resource_handle.h"] + glob( +LIB_INTERNAL_PRIVATE_HEADERS = [ + "framework/resource_handle.h", + "//tensorflow/core/platform:legacy_lib_internal_headers", + "//tensorflow/core/platform:scanner.h", + "//tensorflow/core/platform:str_util.h", + "//tensorflow/core/lib/bfloat16:bfloat16.h", + "//tensorflow/core/lib/core:legacy_lib_core_all_headers", + "//tensorflow/core/lib/gtl:legacy_lib_gtl_all_headers", + "//tensorflow/core/lib/random:legacy_lib_random_all_headers", + "//tensorflow/core/lib/strings:legacy_lib_strings_all_headers", + "//tensorflow/core/lib/math:math_util.h", +] + glob( [ "lib/**/*.h", - "platform/*.h", - "platform/profile_utils/**/*.h", ], exclude = [ "**/*test*", "lib/gif/**/*", "lib/jpeg/**/*", "lib/png/**/*", - "platform/gif.h", - "platform/jpeg.h", - "platform/png.h", - "platform/**/cuda.h", - "platform/**/rocm.h", - "platform/**/stream_executor.h", ], ) -LIB_INTERNAL_PUBLIC_HEADERS = tf_additional_lib_hdrs() + [ - "lib/core/blocking_counter.h", - "lib/core/refcount.h", - "lib/gtl/edit_distance.h", - "lib/gtl/int_type.h", - "lib/gtl/iterator_range.h", - "lib/gtl/manual_constructor.h", - "lib/gtl/map_util.h", - "lib/gtl/stl_util.h", - "lib/gtl/top_n.h", +LIB_INTERNAL_PUBLIC_HEADERS = [ + "//tensorflow/core/lib/core:legacy_lib_internal_core_headers", + "//tensorflow/core/lib/gtl:legacy_lib_internal_public_gtl_headers", "lib/hash/hash.h", "lib/io/inputbuffer.h", "lib/io/iterator.h", @@ -2442,25 +2401,21 @@ LIB_INTERNAL_PUBLIC_HEADERS = tf_additional_lib_hdrs() + [ "lib/monitoring/mobile_gauge.h", "lib/monitoring/mobile_sampler.h", "lib/png/png_io.h", - "lib/random/random.h", - "lib/random/random_distributions.h", - "lib/random/weighted_picker.h", - "lib/strings/base64.h", - "lib/strings/ordered_code.h", - 
"lib/strings/proto_text_util.h", - "lib/strings/proto_serialization.h", - "lib/strings/scanner.h", + "//tensorflow/core/lib/random:legacy_lib_internal_public_random_headers", + "//tensorflow/core/lib/strings:legacy_lib_internal_public_string_headers", "lib/wav/wav_io.h", - "platform/demangle.h", - "platform/denormal.h", - "platform/host_info.h", - "platform/platform.h", - "platform/monitoring.h", - "platform/protobuf_internal.h", - "platform/setround.h", - "platform/snappy.h", - "platform/tensor_coding.h", - "platform/tracing.h", + "//tensorflow/core/platform:demangle.h", + "//tensorflow/core/platform:denormal.h", + "//tensorflow/core/platform:host_info.h", + "//tensorflow/core/platform:platform.h", + "//tensorflow/core/platform:monitoring.h", + "//tensorflow/core/platform:protobuf_internal.h", + "//tensorflow/core/platform:setround.h", + "//tensorflow/core/platform:snappy.h", + "//tensorflow/core/platform:tensor_coding.h", + "//tensorflow/core/platform:tracing.h", + "//tensorflow/core/platform:unbounded_work_queue.h", + "//tensorflow/core/platform:legacy_platform_lib_hdrs", "util/env_var.h", ] @@ -2469,7 +2424,6 @@ LIB_INTERNAL_DEFINES = ( tf_additional_lib_defines() + [ "TF_USE_SNAPPY", ] + tf_additional_verbs_lib_defines() + - tf_additional_mpi_lib_defines() + tf_additional_gdr_lib_defines() + tf_additional_numa_lib_defines() ) @@ -2490,6 +2444,7 @@ cc_library( ], }), deps = tf_additional_lib_deps() + [ + "//tensorflow/core/platform:annotation", "@com_google_absl//absl/meta:type_traits", "@com_google_absl//absl/strings", "//third_party/eigen3", @@ -2503,8 +2458,6 @@ cc_library( srcs = LIB_INTERNAL_PRIVATE_HEADERS + glob( [ "lib/**/*.cc", - "platform/*.cc", - "platform/profile_utils/**/*.cc", "util/env_var.cc", ], exclude = [ @@ -2514,46 +2467,34 @@ cc_library( "lib/gif/**/*", "lib/jpeg/**/*", "lib/png/**/*", - "platform/**/env_time.cc", - "platform/**/monitoring.cc", - "platform/**/cuda_libdevice_path.cc", - "platform/**/device_tracer.cc", - "platform/**/logger.cc", - "platform/**/logging.cc", - "platform/**/human_readable_json.cc", - "platform/abi.cc", - "platform/protobuf.cc", ], - ) + tf_additional_lib_srcs( - exclude = [ - "**/*test*", - "platform/**/cuda.h", - "platform/**/cuda_libdevice_path.cc", - "platform/**/rocm.h", - "platform/**/monitoring.cc", - "platform/**/stream_executor.h", - "platform/**/env_time.cc", - "platform/**/device_tracer.cc", - "platform/**/logger.cc", - "platform/**/logging.cc", - "platform/**/human_readable_json.cc", - "platform/abi.cc", - ] + - # Protobuf deps already included through the ":lib_proto_parsing" - # dependency. 
- tf_additional_proto_srcs(), - ) + tf_additional_monitoring_srcs(), + ) + [ + "//tensorflow/core/platform:legacy_monitoring_srcs", + "//tensorflow/core/platform:legacy_platform_lib_srcs", + "//tensorflow/core/platform:legacy_lib_internal_srcs", + "//tensorflow/core/lib/core:legacy_lib_core_all_srcs", + "//tensorflow/core/lib/random:legacy_lib_random_all_srcs", + "//tensorflow/core/lib/strings:legacy_lib_strings_all_srcs", + ], hdrs = LIB_INTERNAL_PUBLIC_HEADERS, copts = tf_copts(), defines = LIB_INTERNAL_DEFINES, deps = tf_additional_lib_deps() + [ + ":core_stringpiece", ":lib_hash_crc32c_accelerate_internal", ":lib_proto_parsing", - ":abi", - ":core_stringpiece", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", "//third_party/eigen3", + "//tensorflow/core/lib/bfloat16", + "//tensorflow/core/platform:abi", + "//tensorflow/core/platform:annotation", + "//tensorflow/core/platform:cpu_info", + "//tensorflow/core/platform:numbers", + "//tensorflow/core/platform:platform_strings", + "//tensorflow/core/platform:scanner", + "//tensorflow/core/platform:stringprintf", + "//tensorflow/core/platform:str_util", "//tensorflow/core/platform/default/build_config:platformlib", "@snappy", "@zlib_archive//:zlib", @@ -2575,7 +2516,7 @@ cc_library( name = "gif_internal", srcs = [ "lib/gif/gif_io.cc", - "platform/gif.h", + "//tensorflow/core/platform:gif.h", ], hdrs = ["lib/gif/gif_io.h"], copts = tf_copts(), @@ -2596,7 +2537,7 @@ cc_library( srcs = [ "lib/jpeg/jpeg_handle.cc", "lib/jpeg/jpeg_mem.cc", - "platform/jpeg.h", + "//tensorflow/core/platform:jpeg.h", ], hdrs = [ "lib/jpeg/jpeg_handle.h", @@ -2619,18 +2560,19 @@ cc_library( name = "png_internal", srcs = ["lib/png/png_io.cc"], hdrs = [ - "lib/bfloat16/bfloat16.h", - "lib/core/stringpiece.h", "lib/png/png_io.h", - "platform/byte_order.h", - "platform/cpu_info.h", - "platform/default/integral_types.h", - "platform/default/logging.h", - "platform/logging.h", - "platform/macros.h", - "platform/platform.h", - "platform/png.h", - "platform/types.h", + "//tensorflow/core/lib/bfloat16:bfloat16.h", + "//tensorflow/core/lib/core:legacy_lib_core_stringpiece_header", + "//tensorflow/core/platform:byte_order.h", + "//tensorflow/core/platform:cpu_info.h", + "//tensorflow/core/platform:default/integral_types.h", + "//tensorflow/core/platform:default/logging.h", + "//tensorflow/core/platform:logging.h", + "//tensorflow/core/platform:macros.h", + "//tensorflow/core/platform:platform.h", + "//tensorflow/core/platform:png.h", + "//tensorflow/core/platform:tstring.h", + "//tensorflow/core/platform:types.h", ], copts = tf_copts(), linkopts = select({ @@ -2651,13 +2593,14 @@ cc_library( cc_library( name = "tflite_portable_logging", hdrs = [ - "lib/bfloat16/bfloat16.h", - "platform/default/integral_types.h", - "platform/default/logging.h", - "platform/logging.h", - "platform/macros.h", - "platform/platform.h", - "platform/types.h", + "//tensorflow/core/lib/bfloat16:bfloat16.h", + "//tensorflow/core/platform:default/integral_types.h", + "//tensorflow/core/platform:default/logging.h", + "//tensorflow/core/platform:logging.h", + "//tensorflow/core/platform:macros.h", + "//tensorflow/core/platform:platform.h", + "//tensorflow/core/platform:tstring.h", + "//tensorflow/core/platform:types.h", ], copts = tf_copts(), linkopts = ["-ldl"], @@ -2672,26 +2615,30 @@ cc_library( srcs = if_android([ "lib/jpeg/jpeg_handle.cc", "lib/jpeg/jpeg_mem.cc", - "platform/jpeg.h", + "//tensorflow/core/platform:jpeg.h", ]), hdrs = [ - "lib/bfloat16/bfloat16.h", - 
"lib/core/stringpiece.h", "lib/jpeg/jpeg_handle.h", "lib/jpeg/jpeg_mem.h", - "platform/default/dynamic_annotations.h", - "platform/default/integral_types.h", - "platform/default/logging.h", - "platform/dynamic_annotations.h", - "platform/logging.h", - "platform/macros.h", - "platform/mem.h", - "platform/platform.h", - "platform/types.h", + "//tensorflow/core/lib/bfloat16:bfloat16.h", + "//tensorflow/core/lib/core:legacy_lib_core_stringpiece_header", + "//tensorflow/core/platform:default/dynamic_annotations.h", + "//tensorflow/core/platform:default/integral_types.h", + "//tensorflow/core/platform:default/logging.h", + "//tensorflow/core/platform:dynamic_annotations.h", + "//tensorflow/core/platform:logging.h", + "//tensorflow/core/platform:macros.h", + "//tensorflow/core/platform:mem.h", + "//tensorflow/core/platform:platform.h", + "//tensorflow/core/platform:stringpiece.h", + "//tensorflow/core/platform:tstring.h", + "//tensorflow/core/platform:types.h", ], copts = tf_copts(), linkopts = ["-ldl"], deps = [ + ":core_stringpiece", + "//tensorflow/core/platform:stringpiece", "//tensorflow/core/platform/default/build_config:jpeg", "//tensorflow/core/platform/default/build_config:logging", "@com_google_absl//absl/base:core_headers", @@ -2703,24 +2650,24 @@ cc_library( name = "android_gif_internal", srcs = if_android([ "lib/gif/gif_io.cc", - "platform/gif.h", - "lib/strings/strcat.h", - "lib/strings/numbers.h", + "//tensorflow/core/platform:gif.h", + "//tensorflow/core/lib/strings:legacy_lib_android_gif_internal_string_headers", ]), hdrs = [ - "lib/bfloat16/bfloat16.h", - "lib/core/stringpiece.h", "lib/gif/gif_io.h", - "lib/gtl/cleanup.h", - "platform/default/dynamic_annotations.h", - "platform/default/integral_types.h", - "platform/default/logging.h", - "platform/dynamic_annotations.h", - "platform/logging.h", - "platform/macros.h", - "platform/mem.h", - "platform/platform.h", - "platform/types.h", + "//tensorflow/core/lib/bfloat16:bfloat16.h", + "//tensorflow/core/lib/core:legacy_lib_core_stringpiece_header", + "//tensorflow/core/lib/gtl:legacy_android_gif_internal_headers", + "//tensorflow/core/platform:default/dynamic_annotations.h", + "//tensorflow/core/platform:default/integral_types.h", + "//tensorflow/core/platform:default/logging.h", + "//tensorflow/core/platform:dynamic_annotations.h", + "//tensorflow/core/platform:logging.h", + "//tensorflow/core/platform:macros.h", + "//tensorflow/core/platform:mem.h", + "//tensorflow/core/platform:platform.h", + "//tensorflow/core/platform:tstring.h", + "//tensorflow/core/platform:types.h", ], copts = tf_copts(), linkopts = ["-ldl"], @@ -2736,20 +2683,21 @@ cc_library( name = "android_png_internal", srcs = if_android([ "lib/png/png_io.cc", - "platform/png.h", + "//tensorflow/core/platform:png.h", ]), hdrs = [ - "lib/bfloat16/bfloat16.h", - "lib/core/stringpiece.h", "lib/png/png_io.h", - "platform/byte_order.h", - "platform/cpu_info.h", - "platform/default/integral_types.h", - "platform/default/logging.h", - "platform/logging.h", - "platform/macros.h", - "platform/platform.h", - "platform/types.h", + "//tensorflow/core/lib/bfloat16:bfloat16.h", + "//tensorflow/core/lib/core:legacy_lib_core_stringpiece_header", + "//tensorflow/core/platform:byte_order.h", + "//tensorflow/core/platform:cpu_info.h", + "//tensorflow/core/platform:default/integral_types.h", + "//tensorflow/core/platform:default/logging.h", + "//tensorflow/core/platform:logging.h", + "//tensorflow/core/platform:macros.h", + "//tensorflow/core/platform:platform.h", + 
"//tensorflow/core/platform:tstring.h", + "//tensorflow/core/platform:types.h", ], copts = tf_copts(), linkopts = ["-ldl"], @@ -2760,59 +2708,19 @@ cc_library( ], ) -tf_proto_library( - name = "error_codes_proto", - srcs = ERROR_CODES_PROTO_SRCS, - cc_api_version = 2, - make_default_target_header_only = True, - provide_cc_alias = True, -) - -tf_generate_proto_text_sources( - name = "error_codes_proto_text", - srcs = ERROR_CODES_PROTO_SRCS, - protodeps = [], - srcs_relative_dir = "tensorflow/core/", - deps = [ - ":error_codes_proto_cc", - ":lib_internal", - ], -) - tf_proto_library( name = "protos_all_proto", srcs = COMMON_PROTO_SRCS + ADDITIONAL_CORE_PROTO_SRCS, cc_api_version = 2, make_default_target_header_only = True, protodeps = [ - ":error_codes_proto", + "//tensorflow/core/lib/core:error_codes_proto", ], ) -tf_generate_proto_text_sources( - name = "protos_all_proto_text", - srcs = COMMON_PROTO_SRCS, - protodeps = ERROR_CODES_PROTO_SRCS, - srcs_relative_dir = "tensorflow/core/", - visibility = ["//visibility:public"], - deps = [ - ":error_codes_proto_text", - ":lib_internal", - ":protos_all_proto_cc", - ], -) - -cc_library( - name = "proto_text", - hdrs = [ - ":error_codes_proto_text_hdrs", - ":protos_all_proto_text_hdrs", - ], - deps = [ - ":lib", - ":lib_internal", - ":protos_all_cc", - ], +alias( + name = "error_codes_proto_cc", + actual = "//tensorflow/core/lib/core:error_codes_proto_cc", ) tf_version_info_genrule() @@ -2975,11 +2883,10 @@ tf_cuda_library( deps = [ ":allocator_registry_impl", ":allocator", + ":attr_value_proto_text", ":feature_util", ":lib", ":lib_internal", - ":protos_all_proto_text", - ":error_codes_proto_text", ":protos_all_cc", ":stats_calculator_portable", ":version_lib", @@ -3030,11 +2937,11 @@ cc_header_only_library( tf_cuda_library( name = "stream_executor", - srcs = ["platform/stream_executor.h"], + srcs = ["//tensorflow/core/platform:stream_executor.h"], hdrs = [ - "platform/cuda.h", - "platform/rocm.h", - "platform/stream_executor.h", + "//tensorflow/core/platform:cuda.h", + "//tensorflow/core/platform:rocm.h", + "//tensorflow/core/platform:stream_executor.h", ], deps = [ "//tensorflow/core/platform/default/build_config:stream_executor", @@ -3045,9 +2952,9 @@ tf_cuda_library( # and does not include any cuda dependencies. 
cc_library( name = "stream_executor_no_cuda", - srcs = ["platform/stream_executor.h"], + srcs = ["//tensorflow/core/platform:stream_executor.h"], hdrs = [ - "platform/stream_executor_no_cuda.h", + "//tensorflow/core/platform:stream_executor_no_cuda.h", ], visibility = ["//visibility:public"], deps = [ @@ -3121,7 +3028,6 @@ tf_cuda_library( ":framework_internal", ":lib", ":lib_internal", - ":proto_text", ":protos_all_cc", "//third_party/eigen3", "@com_google_absl//absl/container:flat_hash_map", @@ -3180,7 +3086,6 @@ tf_cuda_library( ":framework_internal", ":lib", ":lib_internal", - ":proto_text", ":protos_all_cc", "@com_google_absl//absl/container:flat_hash_set", "//third_party/eigen3", @@ -3328,7 +3233,6 @@ tf_cuda_library( ":framework_internal", ":lib", ":lib_internal", - ":proto_text", ":protos_all_cc", "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/memory", @@ -3347,7 +3251,6 @@ tf_cuda_library( hdrs = CORE_CPU_LIB_HEADERS, deps = [ ":core_cpu_base", - ":proto_text", "//tensorflow/core/grappler:grappler_item", ] + if_static([":core_cpu_impl"]) + tf_protos_all() + tf_protos_grappler(), ) @@ -3357,7 +3260,6 @@ tf_cuda_library( hdrs = CORE_CPU_LIB_HEADERS, deps = [ ":core_cpu_base_no_ops", - ":proto_text", "//tensorflow/core/grappler:grappler_item", ] + tf_protos_all() + tf_protos_grappler(), ) @@ -3375,7 +3277,6 @@ tf_cuda_library( ":framework", ":graph", ":lib", - ":proto_text", ":protos_all_cc", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", @@ -3411,6 +3312,7 @@ cc_library( ":lib", ":lib_internal", ":shared_counter", + "@com_google_absl//absl/container:flat_hash_set", ], ) @@ -3427,7 +3329,7 @@ cc_library( cc_library( name = "regexp_internal", hdrs = [ - "platform/regexp.h", + "//tensorflow/core/platform:regexp.h", ], visibility = [ "//tensorflow/compiler:__subpackages__", @@ -3454,7 +3356,6 @@ tf_cuda_library( ":lib", ":lib_experimental", ":lib_internal", - ":proto_text", ":protos_all_cc", "//tensorflow/core/debug:debug_graph_utils", "//tensorflow/core/kernels:function_ops", @@ -3479,7 +3380,6 @@ cc_library( ":framework", ":lib", ":lib_internal", - ":proto_text", ":protos_all_cc", ], alwayslink = 1, @@ -3487,7 +3387,9 @@ cc_library( tf_cuda_library( name = "device_tracer", - srcs = tf_additional_device_tracer_srcs(), + srcs = [ + "//tensorflow/core/platform:legacy_device_tracer_srcs", + ], copts = tf_copts(), cuda_deps = tf_additional_cupti_wrapper_deps() + tf_additional_device_tracer_cuda_deps(), visibility = [ @@ -3705,7 +3607,6 @@ cc_library( ":framework_internal", ":lib", ":lib_internal", - ":proto_text", "//third_party/eigen3", "@local_config_sycl//sycl", ], @@ -3719,11 +3620,11 @@ cc_library( name = "lib_test_internal", testonly = 1, hdrs = [ - "lib/gtl/manual_constructor.h", "lib/io/block.h", "lib/io/block_builder.h", "lib/io/format.h", - "lib/random/philox_random_test_utils.h", + "//tensorflow/core/lib/gtl:legacy_lib_test_internal_headers", + "//tensorflow/core/lib/random:legacy_lib_test_internal_headers", ], deps = [ ":lib", @@ -3762,7 +3663,7 @@ cc_library( cc_library( name = "test_main", testonly = 1, - srcs = ["platform/test_main.cc"], + srcs = ["//tensorflow/core/platform:test_main.cc"], copts = tf_copts(), linkopts = select({ "//tensorflow:windows": [], @@ -3783,16 +3684,16 @@ cc_library( cc_library( name = "test_lite_main", testonly = 1, - srcs = ["platform/test_main.cc"], + srcs = ["//tensorflow/core/platform:test_main.cc"], copts = tf_copts(), deps = [ # TODO(ahentz): we don't want to depend on "lib" here. 
It used to be # that "core_stringpiece" was enough but that recently changed and # we now need at least "str_util". ":lib", - ":lib_platform", - ":stacktrace_handler", ":test_lite", + "//tensorflow/core/platform", + "//tensorflow/core/platform:stacktrace_handler", "//tensorflow/core/platform/default/build_config:test_lite_main", ], alwayslink = 1, @@ -3802,25 +3703,6 @@ tf_cc_tests( name = "low_level_library_tests", size = "small", srcs = [ - "lib/core/arena_test.cc", - "lib/core/bitmap_test.cc", - "lib/core/blocking_counter_test.cc", - "lib/core/coding_test.cc", - "lib/core/notification_test.cc", - "lib/core/refcount_test.cc", - "lib/core/status_test.cc", - "lib/core/stringpiece_test.cc", - "lib/core/threadpool_test.cc", - "lib/gtl/cleanup_test.cc", - "lib/gtl/compactptrset_test.cc", - "lib/gtl/edit_distance_test.cc", - "lib/gtl/flatmap_test.cc", - "lib/gtl/flatset_test.cc", - "lib/gtl/int_type_test.cc", - "lib/gtl/iterator_range_test.cc", - "lib/gtl/manual_constructor_test.cc", - "lib/gtl/map_util_test.cc", - "lib/gtl/top_n_test.cc", "lib/hash/crc32c_test.cc", "lib/hash/hash_test.cc", "lib/histogram/histogram_test.cc", @@ -3834,33 +3716,31 @@ tf_cc_tests( "lib/io/snappy/snappy_buffers_test.cc", "lib/io/table_test.cc", "lib/io/zlib_buffers_test.cc", - "lib/math/math_util_test.cc", "lib/monitoring/collection_registry_test.cc", "lib/monitoring/counter_test.cc", "lib/monitoring/gauge_test.cc", "lib/monitoring/metric_def_test.cc", "lib/monitoring/sampler_test.cc", - "lib/random/distribution_sampler_test.cc", - "lib/random/philox_random_test.cc", - "lib/random/random_test.cc", - "lib/random/simple_philox_test.cc", - "lib/strings/base64_test.cc", - "lib/strings/numbers_test.cc", - "lib/strings/scanner_test.cc", - "lib/strings/str_util_test.cc", - "lib/strings/strcat_test.cc", - "lib/strings/stringprintf_test.cc", "lib/wav/wav_io_test.cc", - "platform/fingerprint_test.cc", - "platform/integral_types_test.cc", - "platform/logging_test.cc", - "platform/mutex_test.cc", - "platform/net_test.cc", - "platform/port_test.cc", - "platform/profile_utils/cpu_utils_test.cc", - "platform/stacktrace_handler_test.cc", - "platform/subprocess_test.cc", - "platform/vmodule_benchmark_test.cc", + "//tensorflow/core/lib/core:legacy_lib_core_all_tests", + "//tensorflow/core/lib/gtl:legacy_lib_gtl_tests", + "//tensorflow/core/lib/math:math_util_test.cc", + "//tensorflow/core/lib/random:legacy_lib_random_tests", + "//tensorflow/core/lib/strings:legacy_low_level_library_tests", + "//tensorflow/core/platform:fingerprint_test.cc", + "//tensorflow/core/platform:integral_types_test.cc", + "//tensorflow/core/platform:logging_test.cc", + "//tensorflow/core/platform:mutex_test.cc", + "//tensorflow/core/platform:net_test.cc", + "//tensorflow/core/platform:port_test.cc", + "//tensorflow/core/platform:profile_utils/cpu_utils_test.cc", + "//tensorflow/core/platform:scanner_test.cc", + "//tensorflow/core/platform:stacktrace_handler_test.cc", + "//tensorflow/core/platform:str_util_test.cc", + "//tensorflow/core/platform:stringpiece_test.cc", + "//tensorflow/core/platform:stringprintf_test.cc", + "//tensorflow/core/platform:subprocess_test.cc", + "//tensorflow/core/platform:vmodule_benchmark_test.cc", ], deps = [ ":core_cpu_internal", @@ -3870,6 +3750,10 @@ tf_cc_tests( ":protos_all_cc", ":test", ":test_main", + "//tensorflow/core/platform:scanner", + "//tensorflow/core/platform:str_util", + "//tensorflow/core/platform:stringpiece", + "//tensorflow/core/platform:stringprintf", "//third_party/eigen3", "@com_google_absl//absl/strings", 
"@com_google_absl//absl/synchronization", @@ -3879,7 +3763,7 @@ tf_cc_tests( tf_cc_test( name = "vmodule_test", - srcs = ["platform/vmodule_test.cc"], + srcs = ["//tensorflow/core/platform:vmodule_test.cc"], tags = ["optonly"], deps = [ ":lib", @@ -3894,7 +3778,7 @@ tf_cc_test( tf_cc_test( name = "lib_random_random_distributions_test", - srcs = ["lib/random/random_distributions_test.cc"], + srcs = ["//tensorflow/core/lib/random:legacy_lib_random_random_distributions_test"], tags = ["optonly"], deps = [ ":lib", @@ -3910,18 +3794,18 @@ tf_cc_test( tf_cc_test( name = "platform_strings_test", size = "small", - srcs = ["platform/platform_strings_test.cc"], + srcs = ["//tensorflow/core/platform:platform_strings_test.cc"], features = ["-dynamic_link_test_srcs"], # see go/dynamic_link_test_srcs deps = [ ":lib", - ":platform_strings", + "//tensorflow/core/platform:platform_strings", ], ) tf_cc_test( name = "platform_env_test", size = "small", - srcs = ["platform/env_test.cc"], + srcs = ["//tensorflow/core/platform:env_test.cc"], deps = [ ":lib", ":lib_internal", @@ -3936,7 +3820,7 @@ tf_cc_test( tf_cc_test( name = "platform_fake_python_env_test", size = "small", - srcs = ["platform/fake_python_env_test.cc"], + srcs = ["//tensorflow/core/platform:fake_python_env_test.cc"], args = [ "/some/path/to/pythontest.runfiles/org_tensorflow/stuff/to/run.py", ], @@ -3959,7 +3843,7 @@ tf_cc_test( tf_cc_test( name = "platform_abi_test", size = "small", - srcs = ["platform/abi_test.cc"], + srcs = ["//tensorflow/core/platform:abi_test.cc"], deps = [ ":framework", ":lib", @@ -3975,7 +3859,7 @@ tf_cc_test( tf_cc_test( name = "platform_numa_test", size = "small", - srcs = ["platform/numa_test.cc"], + srcs = ["//tensorflow/core/platform:numa_test.cc"], tags = [ # This test will not pass unless it has access to all NUMA nodes # on the executing machine. 
@@ -3997,7 +3881,7 @@ tf_cc_test( tf_cc_test( name = "platform_setround_test", size = "small", - srcs = ["platform/setround_test.cc"], + srcs = ["//tensorflow/core/platform:setround_test.cc"], tags = [ "noasan", "noclang", @@ -4016,7 +3900,7 @@ tf_cc_test( tf_cc_test( name = "platform_file_system_test", size = "small", - srcs = ["platform/file_system_test.cc"], + srcs = ["//tensorflow/core/platform:file_system_test.cc"], deps = [ ":lib", ":lib_internal", @@ -4067,7 +3951,7 @@ tf_cc_test( tf_cc_test( name = "lib_strings_ordered_code_test", - srcs = ["lib/strings/ordered_code_test.cc"], + srcs = ["//tensorflow/core/lib/strings:legacy_strings_ordered_code_test"], extra_copts = ["$(STACK_FRAME_UNLIMITED)"], # Tests initialize large vectors deps = [ ":lib", @@ -4079,7 +3963,7 @@ tf_cc_test( tf_cc_test( name = "lib_strings_proto_serialization_test", - srcs = ["lib/strings/proto_serialization_test.cc"], + srcs = ["//tensorflow/core/lib/strings:legacy_strings_proto_serialization_test"], deps = [ ":lib", ":lib_internal", @@ -4094,7 +3978,7 @@ tf_cc_test( tf_cc_test( name = "lib_random_weighted_picker_test", size = "medium", - srcs = ["lib/random/weighted_picker_test.cc"], + srcs = ["//tensorflow/core/lib/random:legacy_lib_random_random_weighted_picker_test"], deps = [ ":lib", ":lib_internal", @@ -4586,6 +4470,20 @@ tf_cuda_cc_test( ], ) +tf_cc_test_gpu( + name = "rocm_rocdl_path_test", + size = "small", + srcs = ["//tensorflow/core/platform:rocm_rocdl_path_test.cc"], + linkstatic = tf_kernel_tests_linkstatic(), + tags = tf_gpu_tests_tags(), + deps = [ + ":lib", + ":test", + ":test_main", + "//tensorflow/core/platform:rocm_rocdl_path", + ], +) + tf_cuda_only_cc_test( name = "util_gpu_kernel_helper_test", srcs = [ @@ -4658,7 +4556,7 @@ tf_cc_test( size = "small", srcs = ["common_runtime/constant_folding_test.cc"], linkstatic = tf_kernel_tests_linkstatic(), - tags = tf_cuda_tests_tags(), + tags = tf_cuda_tests_tags() + ["no_rocm"], deps = [ ":core", ":core_cpu", @@ -4724,12 +4622,14 @@ tf_cuda_cc_test( size = "small", srcs = ["common_runtime/process_function_library_runtime_test.cc"], linkstatic = tf_kernel_tests_linkstatic(), + tags = ["no_rocm"], deps = [ ":core_cpu", ":core_cpu_internal", ":framework", ":framework_internal", ":lib", + ":protos_all_cc", ":test", ":test_main", ":testlib", @@ -5328,7 +5228,7 @@ tf_cc_test( tf_cc_test_gpu( name = "device_tracer_test", size = "small", - srcs = ["platform/device_tracer_test.cc"], + srcs = ["//tensorflow/core/platform:device_tracer_test.cc"], args = ["--heap_check=local"] + tf_additional_device_tracer_test_flags(), linkstatic = tf_kernel_tests_linkstatic(), @@ -5534,11 +5434,19 @@ filegroup( testonly = 1, srcs = [ # A simple key-value store: + # 0 : 'b' + # 1 : 'b' + # ... + # 9 : 'b' + # Which is then overwritten with: # 0 : 'a' # 1 : 'b' # ... # 9 : 'j' "lib/lmdb/testdata/data.mdb", + # LMDB, being a memory-mapped database, uses a different file format on + # big-endian systems. 
+ "lib/lmdb/testdata/data_bigendian.mdb", ], visibility = ["//visibility:public"], ) @@ -5552,10 +5460,12 @@ filegroup( cc_library( name = "cuda_libdevice_path", - srcs = tf_additional_libdevice_srcs(), - hdrs = ["platform/cuda_libdevice_path.h"], + srcs = [ + "//tensorflow/core/platform:legacy_libdevice_srcs", + ], copts = tf_copts(), data = tf_additional_libdevice_data(), + textual_hdrs = ["//tensorflow/core/platform:cuda_libdevice_path.h"], visibility = ["//visibility:public"], deps = [ ":lib", @@ -5569,9 +5479,9 @@ transitive_hdrs( ":core_cpu", ":framework", ":lib", - ":platform_strings", ":protos_all_cc", ":stream_executor", + "//tensorflow/core/platform:platform_strings", ], ) diff --git a/tensorflow/core/api_def/base_api/api_def_AnonymousMemoryCache.pbtxt b/tensorflow/core/api_def/base_api/api_def_AnonymousMemoryCache.pbtxt new file mode 100644 index 00000000000..7c6d161b236 --- /dev/null +++ b/tensorflow/core/api_def/base_api/api_def_AnonymousMemoryCache.pbtxt @@ -0,0 +1,4 @@ +op { + graph_op_name: "AnonymousMemoryCache" + visibility: HIDDEN +} diff --git a/tensorflow/core/api_def/base_api/api_def_AnonymousRandomSeedGenerator.pbtxt b/tensorflow/core/api_def/base_api/api_def_AnonymousRandomSeedGenerator.pbtxt new file mode 100644 index 00000000000..327a0682dc8 --- /dev/null +++ b/tensorflow/core/api_def/base_api/api_def_AnonymousRandomSeedGenerator.pbtxt @@ -0,0 +1,4 @@ +op { + graph_op_name: "AnonymousRandomSeedGenerator" + visibility: HIDDEN +} diff --git a/tensorflow/core/api_def/base_api/api_def_ApplyAdagradV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_ApplyAdagradV2.pbtxt new file mode 100644 index 00000000000..07366bfd367 --- /dev/null +++ b/tensorflow/core/api_def/base_api/api_def_ApplyAdagradV2.pbtxt @@ -0,0 +1,53 @@ +op { + graph_op_name: "ApplyAdagradV2" + visibility: HIDDEN + in_arg { + name: "var" + description: <